Source code for nordlys.core.data.dbpedia.indexer_dbpedia_types

"""
DBpedia Types Indexer
=====================

Builds a DBpedia type index from entity abstracts.

The index is built directly from DBpedia files in .ttl.bz2 format (i.e., MongoDB
is not needed).

Usage
-----

::

  python -m nordlys.core.data.dbpedia.indexer_dbpedia_types <config_file>

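The indexer can also be invoked programmatically; a minimal sketch, mirroring
the main() function at the bottom of this module (the config file path is a
placeholder)::

  from nordlys.core.data.dbpedia.indexer_dbpedia_types import IndexerDBpediaTypes
  from nordlys.core.utils.file_utils import FileUtils

  config = FileUtils.load_config("path/to/config.json")  # placeholder path
  indexer = IndexerDBpediaTypes(config)
  indexer.build_index(force=True)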

Config parameters
------------------

- **index_name**: name of the index
- **dbpedia_files_path**: path to DBpedia .ttl.bz2 files

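An illustrative config file, assuming the JSON format read by
FileUtils.load_config (the values below are placeholders, not shipped
defaults)::

  {
    "index_name": "dbpedia_types",
    "dbpedia_files_path": "/data/dbpedia-2015-10"
  }
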

:Authors: Krisztian Balog, Dario Garigliotti
"""

import os
import argparse
from collections import defaultdict
from random import sample
from math import floor

from rdflib.plugins.parsers.ntriples import NTriplesParser
from rdflib.plugins.parsers.ntriples import ParseError
from rdflib.term import URIRef
from nordlys.core.storage.parser.nt_parser import Triple
from nordlys.core.storage.parser.uri_prefix import URIPrefix
from nordlys.core.utils.file_utils import FileUtils
from nordlys.core.retrieval.elastic import Elastic
from nordlys.config import DATA_DIR
from nordlys.config import PLOGGER

ENTITY_ABSTRACTS_FILE = "short_abstracts_en.ttl.bz2"
# Note that instance_types_en.ttl contains only the most specific types, while
# instance_types_transitive_en.ttl contains only the types along the path to
# the most specific types.  Therefore, both files need to be loaded.
ENTITY_TYPES_FILES = ["instance_types_en.ttl.bz2",
                      "instance_types_transitive_en.ttl.bz2"]

MAX_BULKING_DOC_SIZE = 20000000  # Max doc len when bulking, in chars (20MB)
AVG_SHORT_ABSTRACT_LEN = 216  # Based on DBpedia-2015-10


class IndexerDBpediaTypes(object):
    __DOC_TYPE = "doc"  # we don't make use of types
    __MAPPINGS = {
        "id": Elastic.notanalyzed_field(),
        "content": Elastic.analyzed_field(),
    }

    def __init__(self, config):
        self.__elastic = None
        self.__config = config
        self.__index_name = config["index_name"]
        self.__dbpedia_path = config["dbpedia_files_path"]
        # For triple parsing
        self.__prefix = URIPrefix()
        self.__triple = Triple()
        self.__ntparser = NTriplesParser(self.__triple)
        # Entity abstracts and type assignments kept in memory
        self.__entity_abstracts = {}
        self.__load_entity_abstracts()
        self.__types_entities = defaultdict(list)
        self.__load_entity_types()

    @property
    def name(self):
        return self.__index_name

    def __parse_line(self, line):
        """Parses a line from a .ttl file and returns a (subject, object) pair.

        It is used for parsing DBpedia abstracts and entity types. The subject
        is always prefixed. Object URIs are returned prefixed if they are from
        DBpedia and None otherwise (i.e., for types); literal objects are
        always returned (i.e., for abstracts).
        """
        line = line.decode("utf-8") if isinstance(line, bytes) else line
        try:
            self.__ntparser.parsestring(line)
        except ParseError:  # skip lines that couldn't be parsed
            return None, None
        if self.__triple.subject() is None:  # only if parsed as a triple
            return None, None

        subj = self.__prefix.get_prefixed(self.__triple.subject())
        obj = None
        if type(self.__triple.object()) is URIRef:
            if self.__triple.object().startswith("http://dbpedia.org/ontology"):
                obj = self.__prefix.get_prefixed(self.__triple.object())
        else:
            obj = self.__triple.object().encode("utf-8")

        return subj, obj

    def __load_entity_abstracts(self):
        num_lines = 0
        filename = os.sep.join([self.__dbpedia_path, ENTITY_ABSTRACTS_FILE])
        PLOGGER.info("Loading entity abstracts from {}".format(filename))
        for line in FileUtils.read_file_as_list(filename):
            entity, abstract = self.__parse_line(line)
            if abstract and len(abstract) > 0:  # skip empty objects
                self.__entity_abstracts[entity] = abstract

            num_lines += 1
            if num_lines % 10000 == 0:
                PLOGGER.info(" {}K lines processed".format(num_lines // 1000))

        PLOGGER.info(" Done.")

    def __load_entity_types(self):
        num_lines = 0
        for types_file in ENTITY_TYPES_FILES:
            filename = os.sep.join([self.__dbpedia_path, types_file])
            PLOGGER.info("Loading entity types from {}".format(filename))
            for line in FileUtils.read_file_as_list(filename):
                entity, entity_type = self.__parse_line(line)
                if type(entity_type) != str:  # likely result of parsing error
                    continue
                if not entity_type.startswith("<dbo:"):
                    PLOGGER.info(" Non-DBpedia type: {}".format(entity_type))
                    continue
                if not entity.startswith("<dbpedia:"):
                    PLOGGER.info(" Invalid entity: {}".format(entity))
                    continue
                self.__types_entities[entity_type].append(entity)

                num_lines += 1
                if num_lines % 10000 == 0:
                    PLOGGER.info(" {}K lines processed".format(
                        num_lines // 1000))

        PLOGGER.info(" Done.")

    def __make_type_doc(self, type_name):
        """Gets the document representation of a type to be indexed, from its
        entity short abstracts."""
        content = "\n".join(
            [self.__entity_abstracts.get(e, b"").decode("utf-8")
             for e in self.__types_entities[type_name]])

        if len(content) > MAX_BULKING_DOC_SIZE:
            PLOGGER.info("Type {} has content larger than allowed: {}.".format(
                type_name, len(content)))

            # we randomly sample a subset of Y entity abstracts, s.t.
            # Y * AVG_SHORT_ABSTRACT_LEN <= MAX_BULKING_DOC_SIZE
            num_entities = len(self.__types_entities[type_name])
            amount_abstracts_to_sample = min(
                floor(MAX_BULKING_DOC_SIZE / AVG_SHORT_ABSTRACT_LEN),
                num_entities)
            entities_sample = [self.__types_entities[type_name][i] for i in
                               sample(range(num_entities),
                                      amount_abstracts_to_sample)]
            content = ""  # reset content
            for entity in entities_sample:
                new_content_candidate = "\n".join(
                    [content,
                     self.__entity_abstracts.get(entity, b"").decode("utf-8")])
                # we add an abstract only if by doing so it will not exceed
                # MAX_BULKING_DOC_SIZE
                if len(new_content_candidate) > MAX_BULKING_DOC_SIZE:
                    break
                content = new_content_candidate

        return {"content": content}
    def build_index(self, force=False):
        """Builds the index.

        Note: since DBpedia only has a few hundred types, no bulk indexing is
        needed.

        :param force: True to overwrite the index if it already exists; False
            by default.
        :type force: bool
        """
        PLOGGER.info("Building type index {}".format(self.__index_name))
        self.__elastic = Elastic(self.__index_name)
        self.__elastic.create_index(mappings=self.__MAPPINGS, force=force)
        for type_name in self.__types_entities:
            PLOGGER.info(" Adding {} ...".format(type_name))
            contents = self.__make_type_doc(type_name)
            self.__elastic.add_doc(type_name, contents)
        PLOGGER.info(" Done.")
def main(args):
    config = FileUtils.load_config(args.config)
    dbpedia_path = config.get("dbpedia_files_path", "")

    # Check DBpedia files
    PLOGGER.info("Checking needed DBpedia files under {}".format(dbpedia_path))
    for fname in [ENTITY_ABSTRACTS_FILE] + ENTITY_TYPES_FILES:
        if os.path.isfile(os.sep.join([dbpedia_path, fname])):
            PLOGGER.info(" - {}: OK".format(fname))
        else:
            PLOGGER.error(" - {}: Missing".format(fname))
            exit(1)

    indexer = IndexerDBpediaTypes(config)
    indexer.build_index(force=True)
def arg_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument("config", help="config file", type=str)
    args = parser.parse_args()
    return args
if __name__ == "__main__":
    main(arg_parser())