# Source code for nordlys.core.data.word2vec.word2vec2mongo

"""
Word2vec to Mongo
-----------------

Loads Word2Vec to MongoDB.

:Authors: Faegheh Hasibi, Dario Garigliotti
"""

import argparse
import os.path as op
from sys import exit

from nordlys.config import MONGO_HOST, MONGO_DB, MONGO_COLLECTION_WORD2VEC
from nordlys.core.storage.mongo import Mongo
from nordlys.core.utils.file_utils import FileUtils
from nordlys.config import PLOGGER

KEY_COLLECTION = "collection"
KEY_MAPPING_FILE = "mapping_file"


class Word2VecToMongo(object):
    """Loads a plain-text word2vec corpus into a MongoDB collection.

    The configuration must provide two keys:

    - ``collection``: name of the target MongoDB collection.
    - ``mapping_file``: path to the word2vec corpus file (must exist).
    """

    def __init__(self, config):
        """Validates the config and stores the collection name and corpus path.

        :param config: configuration with the ``collection`` and
            ``mapping_file`` keys.
        :type config: dict
        """
        self.__check_config(config)
        self.__collection = config[KEY_COLLECTION]
        self.__w2v_fname = config[KEY_MAPPING_FILE]
        self.__mongo = None  # connection is created lazily in build()

    @staticmethod
    def __check_config(config):
        """Checks that the required params are present and valid.

        On any problem the error is logged and the process exits with
        status 1.

        :param config: configuration to validate.
        :type config: dict
        :return: the config, unchanged.
        """
        try:
            if KEY_COLLECTION not in config:
                raise Exception(KEY_COLLECTION + " is missing")
            if KEY_MAPPING_FILE not in config:
                raise Exception(KEY_MAPPING_FILE + " is missing")
            if not op.exists(config[KEY_MAPPING_FILE]):
                raise Exception("Mapping file path does not exist.")
        except Exception as e:
            # Build the full message up front: passing `e` as an extra
            # positional argument without a placeholder loses the reason.
            PLOGGER.error("Error in config file: " + str(e))
            exit(1)
        return config

    def __parse_line(self, line):
        """Parses a line of the plain-text GoogleNews 300-dim pre-trained corpus.

        :param line: a word followed by its whitespace-separated vector
            components.
        :type line: string
        :return: a (word, vector) tuple, where vector is a list of floats.
        :raises ValueError: if the line has no vector part or a component
            cannot be converted to float.
        """
        word, vec_str = line.rstrip().split(maxsplit=1)
        vector = [float(x) for x in vec_str.split()]
        return word, vector

    def build(self):
        """Builds word2vec collection from GoogleNews 300-dim pre-trained corpus.

        Drops the target collection first, then adds one entry per
        non-blank corpus line, keyed by the word and holding
        ``{'vector': [...]}``. Progress is logged every 1000 loaded lines.
        """
        self.__mongo = Mongo(MONGO_HOST, MONGO_DB, self.__collection)
        self.__mongo.drop()

        infile = FileUtils.open_file_by_type(self.__w2v_fname)
        loaded = 0
        try:
            for line in infile:
                # A blank line (e.g. trailing newline at end of file) carries
                # no entry; skip it instead of aborting a half-done load.
                if not line.strip():
                    continue
                term, vector = self.__parse_line(line)
                self.__mongo.add(term, {'vector': vector})
                loaded += 1
                if loaded % 1000 == 0:
                    # Integer division so the message reads "1K", not "1.0K".
                    PLOGGER.info(str(loaded // 1000) + "K lines are loaded.")
        finally:
            # NOTE(review): assumes open_file_by_type returns a file-like
            # object exposing close() - confirm against FileUtils.
            infile.close()
def arg_parser(argv=None):
    """Parses the command-line arguments of the script.

    :param argv: argument strings to parse; when None, argparse falls back
        to the process command line (``sys.argv[1:]``).
    :type argv: list of str
    :return: parsed arguments, exposing the positional ``filename``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("filename", help="word2vec corpus filename", type=str)
    return parser.parse_args(argv)
def main(args):
    """Loads the config file named by the parsed arguments and builds the collection.

    :param args: parsed command-line arguments. The config file path is
        taken from ``args.config`` when that attribute is set, otherwise
        from ``args.filename``.
    """
    # arg_parser() only defines the positional "filename", so reading
    # args.config unconditionally raised AttributeError; fall back to it.
    config_path = getattr(args, "config", None)
    if config_path is None:
        config_path = args.filename
    config = FileUtils.load_config(config_path)
    w2v_to_mongo = Word2VecToMongo(config)
    w2v_to_mongo.build()
# Run the loader only when this module is executed as a script.
if __name__ == '__main__':
    cli_args = arg_parser()
    main(cli_args)