# Source code for nordlys.logic.el.ltr

"""
LTR Entity Linking Approach
===========================

Class for Learning-to-Rank entity linking approach

:Author: Faegheh Hasibi
"""
import csv
import json

from nordlys.config import PLOGGER, ELASTIC_INDICES
from nordlys.core.ml.instance import Instance
from nordlys.core.ml.instances import Instances
from nordlys.core.ml.ml import ML
from nordlys.core.retrieval.elastic_cache import ElasticCache
from nordlys.logic.el.el_utils import is_name_entity
from nordlys.logic.el.greedy import Greedy
from nordlys.logic.entity.entity import Entity
from nordlys.logic.features.feature_cache import FeatureCache
from nordlys.logic.features.ftr_entity import FtrEntity
from nordlys.logic.features.ftr_entity_similarity import FtrEntitySimilarity
from nordlys.logic.features.ftr_mention import FtrMention
from nordlys.logic.features.ftr_entity_mention import FtrEntityMention
from nordlys.logic.query.mention import Mention
from nordlys.logic.query.query import Query


class LTR(object):
    """Learning-to-Rank (LTR) entity linker for a single query.

    The class has two roles:

    * training (static methods): validate the config, optionally build a
      training set from a ground-truth file, and train a model through ``ML``;
    * linking (instance methods): generate candidate (mention, entity)
      instances for the query, apply a model to them, and disambiguate.
    """

    def __init__(self, query, entity, elastic, fcache, model=None, threshold=None, cmns_th=0.1):
        """Stores the collaborators used by the linking methods.

        :param query: query object; this class reads ``qid``, ``query`` and
            calls ``get_ngrams()`` on it
        :param entity: entity object, handed to ``Mention`` and to the feature classes
        :param elastic: handed to ``FtrEntitySimilarity``
        :param fcache: feature cache (``get_feature_val`` / ``set_feature_val``)
        :param model: model handed to ``ML.apply_model`` in :meth:`rank_ens`
        :param threshold: handed to ``Greedy`` in :meth:`disambiguate`
        :param cmns_th: handed to ``Mention`` when looking up candidate entities
        """
        self.__query = query
        self.__entity = entity
        self.__elastic = elastic
        self.__fcache = fcache
        self.__model = model
        self.__threshold = threshold
        self.__cmns_th = cmns_th

    # =========================
    # Code related to training
    # =========================

    @staticmethod
    def __missing_config_keys(config):
        """Returns the mandatory config keys that are absent, in a fixed order.

        ``model_file`` and ``training_set`` are always required;
        ``ground_truth`` and ``query_file`` are required only when
        ``gen_training_set`` is truthy.

        :param config: dictionary of config parameters
        :return: list of missing key names (empty if the config is complete)
        """
        required = ["model_file", "training_set"]
        if config.get("gen_training_set", False):
            required += ["ground_truth", "query_file"]
        return [key for key in required if key not in config]

    @staticmethod
    def __check_config(config):
        """Checks that all mandatory config parameters are present.

        Logs every missing key and terminates the process with exit status 1
        if the config is incomplete.

        :param config: dictionary of config parameters
        :raises SystemExit: if a mandatory parameter is missing
        """
        missing = LTR.__missing_config_keys(config)
        if missing:
            # Lazy %-style argument: the logger formats the message itself.
            PLOGGER.error("Error in config file: %s is not defined!", ", ".join(missing))
            # Same effect as exit(1), without relying on the site-provided helper.
            raise SystemExit(1)

    @staticmethod
    def train(config):
        """Trains the LTR model described by ``config``.

        When ``gen_training_set`` is set, the training set is first generated
        from ``ground_truth`` and ``query_file`` and written to
        ``training_set``. The instances are then loaded from ``training_set``
        and passed to ``ML(config).train_model``.

        :param config: dictionary of config parameters
        """
        LTR.__check_config(config)
        if config.get("gen_training_set", False):
            gt = LTR.load_yerd(config["ground_truth"])
            LTR.gen_train_set(gt, config["query_file"], config["training_set"])
        instances = Instances.from_json(config["training_set"])
        ML(config).train_model(instances)

    @staticmethod
    def load_yerd(gt_file):
        """Reads the Y-ERD collection and returns its annotations as a set.

        The file is read as tab-separated values with a header row; the
        columns ``qid``, ``query``, ``entity`` and ``mention`` are used. Rows
        with an empty ``entity`` are skipped. Query and mention strings are
        passed through ``Query(...).query`` before being stored.

        :param gt_file: Path to the Y-ERD collection
        :return: set of tuples {(qid, query, en_id, mention), ...}
        """
        PLOGGER.info("Loading the ground truth ...")
        gt = set()
        # newline="" lets the csv module do its own line-ending handling.
        with open(gt_file, "r", newline="") as tsvfile:
            reader = csv.DictReader(tsvfile, delimiter="\t", quoting=csv.QUOTE_NONE)
            for line in reader:
                if line["entity"] == "":
                    continue
                query = Query(line["query"]).query
                mention = Query(line["mention"]).query
                gt.add((line["qid"], query, line["entity"], mention))
        return gt

    @staticmethod
    def gen_train_set(gt, query_file, train_set):
        """Generates the LTR training set and writes it to a JSON file.

        Positive instances (target 1) come from the ground truth. Negative
        instances (target 0) are all candidate instances of every query in
        ``query_file`` whose (qid, en_id) pair is not a ground-truth annotation.

        :param gt: set of tuples (qid, query, en_id, mention), see :meth:`load_yerd`
        :param query_file: path to a JSON file holding {qid: query, ...}
        :param train_set: path of the JSON file the instances are written to
        """
        entity, elastic, fcache = Entity(), ElasticCache(ELASTIC_INDICES[0]), FeatureCache()
        inss = Instances()
        positive_annots = set()

        # Adds groundtruth instances (positive instances)
        PLOGGER.info("Adding groundtruth instances (positive instances) ....")
        for item in sorted(gt):  # qid, query, en_id, mention
            ltr = LTR(Query(item[1], item[0]), entity, elastic, fcache)
            ins = ltr.__gen_raw_ins(item[2], item[3])
            ins.features = ltr.get_features(ins)
            ins.target = 1
            inss.add_instance(ins)
            positive_annots.add((item[0], item[2]))

        # Adds all other instances
        PLOGGER.info("Adding all other instances (negative instances) ...")
        # Context manager so the query file is closed once it has been parsed.
        with open(query_file, "r") as qfile:
            queries = json.load(qfile)
        for qid, q in sorted(queries.items()):
            PLOGGER.info("Query [" + qid + "]")
            ltr = LTR(Query(q, qid), entity, elastic, fcache)
            q_inss = ltr.get_candidate_inss()
            for ins in q_inss.get_all():
                # Ground-truth pairs were already added as positives above.
                if (qid, ins.get_property("en_id")) in positive_annots:
                    continue
                ins.target = 0
                inss.add_instance(ins)
        inss.to_json(train_set)

    # =========================
    # Code related to Linking
    # =========================

    def get_candidate_inss(self):
        """Detects mentions and their candidate entities and generates instances.

        For every n-gram of the query, the candidate entities (with their
        commonness scores) are fetched through ``Mention``. Candidates rejected
        by ``is_name_entity`` are skipped. The commonness score is stored in
        the feature cache so that :meth:`get_features` can reuse it.

        :return: Instances object
        """
        instances = Instances()
        for ngram in self.__query.get_ngrams():
            cand_ens = Mention(ngram, self.__entity, self.__cmns_th).get_cand_ens()
            for en_id, commonness in cand_ens.items():
                if not is_name_entity(en_id):
                    continue
                # Same key layout as the "commonness" lookup in get_features.
                self.__fcache.set_feature_val("commonness", en_id + "_" + ngram, commonness)
                ins = self.__gen_raw_ins(en_id, ngram)
                ins.features = self.get_features(ins, cand_ens)
                instances.add_instance(ins)
        return instances

    def rank_ens(self):
        """Generates the candidate instances and applies the LTR model to them.

        :return: Instances object, after ``ML.apply_model`` has been run on it
        """
        if self.__model is None:
            # NOTE(review): only logged; execution continues with a None model,
            # as before. Confirm whether this should abort instead.
            PLOGGER.error("LTR model is not defined.")
        inss = self.get_candidate_inss()
        ML({}).apply_model(inss, self.__model)
        return inss

    def disambiguate(self, inss):
        """Performs disambiguation with the greedy algorithm.

        The interpretation sets returned by ``Greedy.disambiguate`` are merged,
        so each (mention, entity) pair is reported once; if a pair occurs in
        several sets, the score from the last set wins.

        :param inss: Instances object, as returned by :meth:`rank_ens`
        :return: list of dictionaries [{"mention": ..., "entity": ..., "score": ...}, ...]
        """
        greedy = Greedy(self.__threshold)
        inter_sets = greedy.disambiguate(inss)

        uniq_men_en = {}
        for iset in inter_sets:
            for men, (en_id, score) in iset.items():
                uniq_men_en[(men, en_id)] = score

        return [
            {"mention": men, "entity": en_id, "score": score}
            for (men, en_id), score in uniq_men_en.items()
        ]

    def get_features(self, ins, cand_ens=None):
        """Generates the feature set for an instance.

        :param ins: instance object with "en_id", "mention" and "query" properties
        :param cand_ens: dictionary of candidate entities {en_id: cmns, ...}
        :return: dictionary of features {ftr_name: value, ...}
        """
        e = ins.get_property("en_id")
        m = ins.get_property("mention")
        q = ins.get_property("query")
        features = {}

        # --- entity features ---
        ftr_entity = FtrEntity(e, self.__entity)
        features["outlinks"] = self.__fcache.get_feature_val("outlinks", e, ftr_entity.outlinks)
        features["redirects"] = self.__fcache.get_feature_val("redirects", e, ftr_entity.redirects)

        # --- mention features ---
        ftr_mention = FtrMention(m, self.__entity, cand_ens)
        features["len_ratio"] = ftr_mention.len_ratio(q)
        features["len"] = ftr_mention.mention_len()
        features["matches"] = self.__fcache.get_feature_val("matches", m, ftr_mention.matches)

        # --- mention-entity features ---
        ftr_entity_mention = FtrEntityMention(e, m, self.__entity)
        key = e + "_" + m
        features["commonness"] = self.__fcache.get_feature_val("commonness", key, ftr_entity_mention.commonness)
        features["mct"] = ftr_entity_mention.mct()
        features["tcm"] = ftr_entity_mention.tcm()
        features["tem"] = ftr_entity_mention.tem()
        features["pos1"] = ftr_entity_mention.pos1()
        ftr_sim_mention = FtrEntitySimilarity(m, e, self.__elastic)
        features["sim_m"] = self.__fcache.get_feature_val("sim", key, ftr_sim_mention.lm_score)

        # --- entity-query features ---
        # The whole query is fed to the same feature class used for the mention.
        ftr_entity_query = FtrEntityMention(e, q, self.__entity)
        features["qct"] = ftr_entity_query.mct()
        features["tcq"] = ftr_entity_query.tcm()
        features["teq"] = ftr_entity_query.tem()
        key = e + "_" + q  # from here on the cache key is entity + query
        ftr_sim_query = FtrEntitySimilarity(q, e, self.__elastic)
        features["sim_q"] = self.__fcache.get_feature_val("sim", key, ftr_sim_query.lm_score)
        features["context_sim"] = self.__fcache.get_feature_val("context_sim", key, ftr_sim_query.context_sim, m)

        return features

    def __gen_raw_ins(self, en_id, mention):
        """Generates an instance without features.

        The instance id is ``<qid>_<en_id>_<mention>``. The "session" property
        is the query id up to its last underscore, or the whole query id if it
        contains no underscore.

        :param en_id: entity id
        :param mention: mention string
        :return: Instance object with qid, query, en_id, mention and session properties
        """
        qid = self.__query.qid
        ins_id = qid + "_" + en_id + "_" + mention
        index = qid.rfind("_")
        session = qid[:index] if index != -1 else qid
        ins = Instance(ins_id)
        ins.add_property("qid", qid)
        ins.add_property("query", self.__query.query)
        ins.add_property("en_id", en_id)
        ins.add_property("mention", mention)
        ins.add_property("session", session)
        return ins