Source code for nordlys.core.data.word2vec.word2vec2mongo

"""
Word2vec to Mongo
-----------------

Loads Word2Vec to MongoDB.

:Authors: Faegheh Hasibi, Dario Garigliotti
"""

import argparse
import os.path as op
from sys import exit

from nordlys.config import MONGO_HOST, MONGO_DB, MONGO_COLLECTION_WORD2VEC
from nordlys.core.storage.mongo import Mongo
from nordlys.core.utils.file_utils import FileUtils
from nordlys.config import PLOGGER

# Keys looked up in the loaded configuration.
KEY_COLLECTION = "collection"  # name of the MongoDB collection to (re)build
KEY_MAPPING_FILE = "mapping_file"  # path of the word2vec corpus file to load


class Word2VecToMongo(object):
    """Loads a plain-text word2vec corpus into a MongoDB collection.

    :param config: mapping that must contain the target collection name under
        ``KEY_COLLECTION`` and the path of an existing corpus file under
        ``KEY_MAPPING_FILE``.
    """

    def __init__(self, config):
        self.__check_config(config)
        self.__collection = config[KEY_COLLECTION]
        self.__w2v_fname = config[KEY_MAPPING_FILE]
        self.__mongo = None  # connection is only created in build()

    @staticmethod
    def __check_config(config):
        """Validates the configuration.

        Checks that both required keys are present and that the corpus file
        path exists. On any problem the error is logged and the process exits
        with status 1 (i.e. ``SystemExit`` is raised).

        :param config: configuration mapping to validate.
        :return: the same config object, unchanged.
        """
        try:
            if KEY_COLLECTION not in config:
                raise Exception(KEY_COLLECTION + " is missing")
            if KEY_MAPPING_FILE not in config:
                raise Exception(KEY_MAPPING_FILE + " is missing")
            if not op.exists(config[KEY_MAPPING_FILE]):
                raise Exception("Mapping file path does not exist.")
        except Exception as e:
            # The placeholder is required: without it the extra argument
            # cannot be merged into the message and the reason is lost.
            PLOGGER.error("Error in config file: %s", e)
            exit(1)
        return config

    def __parse_line(self, line):
        """
        Parses a line of the plain-text GoogleNews 300-dim pre-trained corpus.

        The first whitespace-separated token is the word; all remaining tokens
        are converted to floats.

        :param line: one corpus line, e.g. ``"word 0.1 0.2 ..."``.
        :type line: string
        :return: a (word, vector) tuple, where vector is a list of floats.
        :raises ValueError: if the line has no vector part or a component is
            not a valid float.
        """
        word, vec_str = line.rstrip().split(maxsplit=1)
        vector = [float(x) for x in vec_str.split()]

        return word, vector

    def build(self):
        """Builds word2vec collection from GoogleNews 300-dim pre-trained corpus.

        Drops the existing target collection, then stores one document per
        corpus line, keyed by the word, with the vector under ``'vector'``.
        Progress is logged every 1000 loaded lines.
        """
        self.__mongo = Mongo(MONGO_HOST, MONGO_DB, self.__collection)
        self.__mongo.drop()

        infile = FileUtils.open_file_by_type(self.__w2v_fname)
        try:
            loaded = 0
            for line in infile:
                # Skip blank lines (e.g. a trailing newline at end of file);
                # they would otherwise abort the load half-way through.
                if not line.strip():
                    continue
                term, vector = self.__parse_line(line)
                self.__mongo.add(term, {'vector': vector})
                loaded += 1
                if loaded % 1000 == 0:
                    # Integer division so the message reads "1K", not "1.0K".
                    PLOGGER.info(str(loaded // 1000) + "K lines are loaded.")
        finally:
            # NOTE(review): assumes open_file_by_type returns a file-like
            # object exposing close() -- confirm against FileUtils.
            infile.close()

def arg_parser():
    """Parses the command-line arguments of the loader script.

    Expects a single positional argument: the path of the configuration file.
    It is exposed as ``args.config``, which is the attribute ``main`` reads.

    :return: the parsed ``argparse.Namespace``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("config", help="config file", type=str)
    args = parser.parse_args()
    # Keep the former attribute name available for any code still reading it.
    args.filename = args.config
    return args


def main(args):
    """Loads the configuration named by ``args.config`` and builds the collection.

    :param args: parsed command-line arguments; only ``args.config`` is read.
    """
    loader = Word2VecToMongo(FileUtils.load_config(args.config))
    loader.build()


# Run the loader only when executed as a script, not when imported.
if __name__ == '__main__':
    main(arg_parser())