# Source code for nordlys.logic.er.field_mapping

"""
Field Mapping for ER
====================

Computes PRMS field mapping probabilities.

:Author: Faegheh Hasibi
"""

from __future__ import division

import argparse
import json
import os
from pprint import pprint

from nordlys.core.retrieval.elastic_cache import ElasticCache
from nordlys.core.retrieval.scorer import ScorerPRMS
from nordlys.logic.elr.top_fields import TopFields


class FieldMapping(object):
    """Computes PRMS field mapping probabilities for entities."""

    # Debug switches; neither is read anywhere inside this class.
    DEBUG = 0
    MAPPING_DEBUG = 0

    def __init__(self, elastic_uri, n):
        """Stores the settings used by :meth:`map`.

        :param elastic_uri: object forwarded unchanged to ``TopFields`` and
            ``ScorerPRMS``
        :param n: number of fields requested from ``TopFields.get_top_term``
        """
        self.elastic_uri = elastic_uri
        self.n_fields = n

    def map(self, en_id):
        """Gets the PRMS mapping probabilities for an entity.

        :param en_id: entity ID
        :return: dictionary {field: weight, ..}
        """
        field_ranker = TopFields(self.elastic_uri)
        candidate_fields = field_ranker.get_top_term(en_id, self.n_fields)
        # The scorer is restricted to the top fields selected above.
        scorer_params = {'fields': candidate_fields}
        return ScorerPRMS(self.elastic_uri, None, scorer_params).get_mapping_prob(en_id)
def load_entities(annot_file, th=0.1):
    """Loads the set of annotated entities from a JSON annotation file.

    The file is expected to hold a dictionary whose values each carry an
    "annots" list; every item of that list has an "entity" and a "score".

    :param annot_file: path to the JSON annotation file
    :param th: score threshold; items scoring below it are skipped
    :return: set of entities whose score is at least ``th``
    """
    # Context manager closes the handle even if the JSON is malformed.
    with open(annot_file, "r") as in_file:
        annots = json.load(in_file)

    entities = set()
    # The keys of the top-level dictionary are not needed, only the annotations.
    for annot in annots.values():
        entities.update(
            item["entity"] for item in annot["annots"] if item["score"] >= th
        )
    return entities
def arg_parser():
    """Parses the command-line arguments of the script.

    :return: argparse namespace with attributes ``input`` (str),
        ``th`` (float, default 0.1) and ``n`` (int, default 10)
    """
    parser = argparse.ArgumentParser()
    # The input file is mandatory: main() cannot run without it, so fail here
    # with a usage message instead of a traceback later on.
    parser.add_argument("-i", "--input", help="json input file", type=str,
                        required=True)
    parser.add_argument("-th", help="EL score threshold", type=float,
                        default=0.1)
    parser.add_argument("-n", help="number of top fields used for the mapping",
                        type=int, default=10)
    return parser.parse_args()
def main(args):
    """Computes the field mapping of every loaded entity and dumps them to JSON.

    The result is written next to the input file, as
    "<input path without extension>_mapping.json".

    :param args: parsed arguments with attributes ``input``, ``th`` and ``n``
    """
    entities = load_entities(args.input, args.th)
    mapper = FieldMapping(ElasticCache("dbpedia_2015_10_uri"), args.n)

    mappings = {}
    for i, en_id in enumerate(entities, 1):
        mappings[en_id] = mapper.map(en_id)
        if i % 10 == 0:
            print(i, "entities processed!")

    # splitext keeps extension-less paths intact and ignores dots in directory
    # names; slicing at rfind(".") chopped the last character or the whole path
    # in those cases.
    input_file = os.path.splitext(args.input)[0]
    out_file = input_file + "_mapping" + ".json"
    # Context manager guarantees the output is flushed and closed.
    with open(out_file, "w") as out:
        json.dump(mappings, out, indent=4, sort_keys=True)
    print("Output file:", out_file)
# Script entry point: parse the command-line arguments and run the mapping.
if __name__ == "__main__":
    main(arg_parser())