"""
Facc to Mongo
=============
Adds entity surface forms from the Freebase Annotated ClueWeb Corpora (FACC).
The input to this script is (name variant, Freebase entity, count) triples.
See `data/facc1/README.md` for the preparation of FACC data in such format.
:Authors: Krisztian Balog, Faegheh Hasibi
"""
import argparse
import os
import sys
from nordlys.config import MONGO_HOST, MONGO_DB, MONGO_COLLECTION_SF_FACC
from nordlys.core.storage.mongo import Mongo
from nordlys.core.utils.file_utils import FileUtils
from nordlys.config import PLOGGER
# static key values
KEY_COLLECTION = "collection"
KEY_PATH = "path"
KEY_PREDICATE = "predicate"
KEY_LOWERCASE = "lowercase"
[docs]class FACCToMongo(object):
def __init__(self, config):
"""Inserts FACC surface forms to Mongo."""
self.__check_config(config)
self.__collection = config[KEY_COLLECTION]
self.__path = config[KEY_PATH]
self.__predicate = config[KEY_PREDICATE]
self.__lowercase = config[KEY_LOWERCASE]
self.__mongo = None
@staticmethod
def __check_config(config):
"""Checks config parameters and sets default values."""
try:
if KEY_COLLECTION not in config:
raise Exception(KEY_COLLECTION + " is missing")
if KEY_PATH not in config:
raise Exception(KEY_PATH + " is missing")
if KEY_PREDICATE not in config:
raise Exception(KEY_PREDICATE + " is missing")
if KEY_LOWERCASE not in config:
config[KEY_LOWERCASE] = True
except Exception as e:
PLOGGER.error("Error in config file: ", e)
sys.exit(1)
def __add_surface_form(self, surface_form, freebase_uri, count):
"""Adds a surface form."""
if self.__lowercase:
surface_form = surface_form.lower()
# Increases count; if the id is not associated with the surface form yet, it adds it with count.
freebase_id = self.__convert_to_fb_id(freebase_uri)
self.__mongo.inc_in_dict(surface_form, self.__predicate, freebase_id, count)
def __convert_to_fb_id(self, fb_uri):
"""Converts /m/047b9p0 to <fb:m.047b9p0>"""
fb_id = fb_uri.replace("/", ".")
return "<fb:" + fb_id[1:] + ">"
def __add_file(self, tsv_filename):
"""Adds name variants from an FACC tsv file."""
PLOGGER.info("Adding name variants from '" + tsv_filename + "'...")
infile = open(tsv_filename, "r")
for line in infile:
f = line.rstrip().split("\t")
self.__add_surface_form(f[0], f[1], int(f[2]))
infile.close()
[docs] def build(self):
"""Builds surface form collection from FACC annotations."""
self.__mongo = Mongo(MONGO_HOST, MONGO_DB, self.__collection)
self.__mongo.drop()
for path, dirs, files in os.walk(self.__path):
for fn in files:
if fn.endswith(".tsv"):
self.__add_file(os.path.join(path, fn))
PLOGGER.info("Collection " + self.__collection + " is built.")
[docs]def main(args):
config = FileUtils.load_config(args.config)
sfm = FACCToMongo(config)
sfm.build()
[docs]def arg_parser():
parser = argparse.ArgumentParser()
parser.add_argument("config", help="config file", type=str)
args = parser.parse_args()
return args
if __name__ == '__main__':
main(arg_parser())