# Source code for nordlys.core.retrieval.indexer_mongo

"""
Mongo Indexer
=============

This class is a tool for creating an index from a Mongo collection.

To use this class, you need to implement :func:`callback_get_doc_content` function.
See :mod:`~nordlys.core.data.dbpedia.indexer_fsdm` for an example usage of this class.

:Author: Faegheh Hasibi
"""
from nordlys.config import MONGO_COLLECTION_DBPEDIA, MONGO_HOST, MONGO_DB, PLOGGER
from nordlys.core.retrieval.elastic import Elastic
from nordlys.core.storage.mongo import Mongo
# from nordlys.core.utils.logging_utils import PLOGGER


class IndexerMongo(object):
    """Creates an Elastic index from the documents of a Mongo collection.

    The content that is indexed for each Mongo document is decided by a
    callback supplied to :func:`build`; this class only iterates over the
    collection, batches the results and hands them to :class:`Elastic`.
    """

    def __init__(self, index_name, mappings, collection, model=Elastic.BM25):
        """
        :param index_name: name of the index to be built
        :param mappings: index mappings, passed as-is to ``Elastic.create_index``
        :param collection: Mongo collection to read from (opened via ``Mongo`` using
            the configured ``MONGO_HOST`` and ``MONGO_DB``)
        :param model: retrieval model passed to ``Elastic.create_index``
        """
        self.__index_name = index_name
        self.__mappings = mappings
        self.__mongo = Mongo(MONGO_HOST, MONGO_DB, collection)
        self.__model = model

    def build(self, callback_get_doc_content, bulk_size=1000):
        """Builds the index from the mongo collection.

        To speedup indexing, we index documents as a bulk.
        There is an optimum value for the bulk size; try to figure it out.

        :param callback_get_doc_content: a function that gets a document from mongo (with keys and
            _id field unescaped) and returns the content for indexing; documents for which it
            returns None are skipped
        :param bulk_size: Number of documents to be added to the index as a bulk (must be positive)
        :raises ValueError: if bulk_size is not positive
        """
        # Validate before touching the index: create_index is called with force=True below,
        # so a bad argument should be reported before any index is (re)created.
        if bulk_size <= 0:
            raise ValueError("bulk_size must be positive, got {!r}".format(bulk_size))

        PLOGGER.info("Building " + self.__index_name + " ...")
        elastic = Elastic(self.__index_name)
        elastic.create_index(self.__mappings, model=self.__model, force=True)

        num_indexed = 0
        bulk = {}
        for mdoc in self.__mongo.find_all(no_timeout=True):
            doc_id = Mongo.unescape(mdoc[Mongo.ID_FIELD])

            # hand the callback the mongo document with keys and _id field unescaped
            content = callback_get_doc_content(Mongo.unescape_doc(mdoc))
            if content is None:
                # the callback decided that this document should not be indexed
                continue
            bulk[doc_id] = content

            num_indexed += 1
            if num_indexed % bulk_size == 0:
                elastic.add_docs_bulk(bulk)
                bulk = {}
                PLOGGER.info(str(num_indexed / 1000) + "K documents indexed")

        # index the last (partial) bulk; skipped when nothing is pending, so that no
        # empty bulk request is issued
        if bulk:
            elastic.add_docs_bulk(bulk)
        PLOGGER.info("Finished indexing (" + str(num_indexed) + " documents in total)")