"""
Commonness Entity Linking Approach
==================================
Class for commonness entity linking approach
:Author: Faegheh Hasibi
"""
from collections import defaultdict
import sys
from nordlys.logic.el.el_utils import is_name_entity
from nordlys.logic.entity.entity import Entity
from nordlys.logic.query.mention import Mention
from nordlys.logic.query.query import Query
class Cmns(object):
    """Commonness-based entity linker.

    Mentions are detected greedily from the longest query n-grams down to
    single terms; for every detected mention the candidate entity with the
    highest commonness score is returned.

    :param query: query object; must expose ``query`` (the query string) and
        ``get_ngrams()`` (an iterable of n-gram strings).
    :param entity: entity lookup object, forwarded unchanged to ``Mention``.
    :param threshold: minimum score an entity needs in order to be linked;
        ``None`` (default) disables the cut-off.
    :param cmns_th: commonness threshold forwarded to ``Mention``
        (presumably a lower bound on candidate commonness -- confirm in Mention).
    """

    def __init__(self, query, entity, threshold=None, cmns_th=0.1):
        self.__query = query
        self.__entity = entity
        self.__threshold = threshold
        self.__cmns_th = cmns_th
        self.__ngrams = None  # {n: [n-gram, ...]}, built lazily by __get_ngrams
        self.__ranked_ens = {}  # {mention: {en_id: commonness}}
        self.__mentions = set()  # mentions that have at least one candidate

    def link(self):
        """Links the query to entities.

        :return: list of dictionaries
            [{"mention": xx, "entity": yy, "score": zz}, ...]
        """
        self.rank_ens()
        return self.disambiguate()

    def rank_ens(self):
        """Detects mentions and ranks the candidate entities of each mention."""
        self.__get_ngrams()
        self.__recursive_rank_ens(len(self.__query.query.split()))

    def __get_ngrams(self):
        """Groups the query n-grams by their length (number of terms).

        The result is cached in ``self.__ngrams`` as
        {1: ["xx", ...], 2: ["xx yy", ...], ...}; nothing is returned.
        """
        if self.__ngrams is None:
            self.__ngrams = defaultdict(list)
            for ngram in self.__query.get_ngrams():
                self.__ngrams[len(ngram.split())].append(ngram)

    def __recursive_rank_ens(self, n):
        """Collects candidate entities for the n-grams of length n and shorter.

        The algorithm starts from the longest possible n-gram and gets all
        matched entities, then recurses on (n-1)-grams. An n-gram that is
        contained in an already detected mention is skipped.
        Results are stored in ``self.__ranked_ens`` and ``self.__mentions``.

        :param n: length of n-gram
        """
        if n == 0:
            return
        # .get() avoids inserting empty lists into the defaultdict for lengths
        # that have no n-grams.
        for ngram in self.__ngrams.get(n, ()):
            if self.__is_overlapping(ngram):
                continue
            all_cand_ens = Mention(ngram, self.__entity, self.__cmns_th).get_cand_ens()
            # Keeps only proper named entities (if applicable)
            cand_ens = {
                en_id: commonness
                for en_id, commonness in all_cand_ens.items()
                if is_name_entity(en_id)
            }
            if cand_ens:
                self.__ranked_ens[ngram] = cand_ens
                self.__mentions.add(ngram)
        self.__recursive_rank_ens(n - 1)

    def disambiguate(self):
        """Selects only one entity per mention (the highest scoring one).

        An entity is kept if no threshold is set or if its score is greater
        than or equal to the threshold.

        :return: [{"mention": xx, "entity": yy, "score": zz}, ...]
        """
        linked_ens = []
        for men, ens in self.__ranked_ens.items():
            if not ens:
                continue
            # max() returns the first best-scoring entry, the same one a
            # stable descending sort would put first.
            en_id, score = max(ens.items(), key=lambda x: x[1])
            # A threshold of None means "no cut-off"; comparing a number with
            # None raises TypeError on Python 3.
            if self.__threshold is None or score >= self.__threshold:
                linked_ens.append({"mention": men, "entity": en_id, "score": score})
        return linked_ens

    def __is_overlapping(self, ngram):
        """Checks whether the ngram is contained in one of the currently identified mentions.

        NOTE(review): this is a character-level substring test, so "art" is
        reported as contained in "smart phone"; confirm whether term-level
        containment is intended.
        """
        return any(ngram in mention for mention in self.__mentions)
def main(args):
    """Links the query given on the command line and prints the result.

    :param args: command-line arguments without the program name; the first
        element is used as the query string.
    :raises SystemExit: if no query is supplied.
    """
    # Validate before building Entity(), so a missing argument yields a
    # readable message instead of a bare IndexError.
    if not args:
        sys.exit("A query string is required as the first argument.")
    entity = Entity()
    query = Query(args[0])
    cmns = Cmns(query, entity, cmns_th=0.1)
    print(cmns.link())
if __name__ == "__main__":
    # Script entry point: pass everything after the program name to main().
    cli_args = sys.argv[1:]
    main(cli_args)