"""
Generative model for interpretation set finding
@author: Faegheh Hasibi
"""
from __future__ import division
from nordlys.core.ml.instances import Instances
class Greedy(object):
    """Greedy construction of entity linking interpretation sets.

    The pipeline (see :meth:`disambiguate`) drops instances whose score is
    below a fixed threshold, removes mentions that contain / are contained in
    a higher-scored mention, and finally groups the surviving instances into
    interpretations whose mentions do not share any term.

    :param score_th: minimum score an instance needs to survive pruning
    """

    def __init__(self, score_th):
        self.__score_th = score_th

    def disambiguate(self, inss):
        """
        Takes instances and generates set of entity linking interpretations.

        :param inss: Instances object
        :return: sets of interpretations [{mention: (en_id, score), ..}, ...]
        """
        pruned_inss = self.prune_by_score(inss)
        pruned_inss = self.prune_containment_mentions(pruned_inss)
        return self.create_interpretations(pruned_inss)

    def prune_by_score(self, query_inss):
        """Prunes based on a static threshold of ranking score.

        :param query_inss: Instances object
        :return: new Instances object holding only the instances whose score
            is greater than or equal to the threshold given at construction
        """
        valid_inss = [ins for ins in query_inss.get_all()
                      if ins.score >= self.__score_th]
        return Instances(valid_inss)

    def prune_containment_mentions(self, query_inss):
        """Deletes containment mentions, if they have lower score.

        Instances are visited from highest to lowest score. An instance is
        dropped when its mention differs from an already kept mention and one
        of the two is a substring of the other. Instances whose mention is
        identical to a kept mention are retained.

        :param query_inss: Instances object
        :return: the input object itself if it holds no instances, otherwise a
            new Instances object with the surviving instances (score-descending)
        """
        all_inss = query_inss.get_all()
        if len(all_inss) == 0:
            return query_inss

        valid_inss = []
        valid_mens = set()
        for ins in sorted(all_inss, key=lambda item: item.score, reverse=True):
            cand_men = ins.get_property("mention")
            # NOTE: `in` on strings is a character-level substring test, not a
            # term-level one.
            is_contained = any(
                cand_men != men and (cand_men in men or men in cand_men)
                for men in valid_mens
            )
            if not is_contained:
                valid_inss.append(ins)
                valid_mens.add(cand_men)  # @todo: This line should be fixed
        return Instances(valid_inss)

    def create_interpretations(self, query_inss):
        """
        Groups CER instances as interpretation sets.

        Instances are processed from highest to lowest score. Each instance is
        added to every existing interpretation it does not overlap with (see
        :meth:`is_overlapping`); if it fits none, it starts a new one.

        :param query_inss: Instances object
        :return: list of interpretations, where each interpretation is a
            dictionary {mention: (en_id, score), ..}. The list always contains
            at least one (possibly empty) interpretation.
        """
        interpretations = [dict()]  # list of dictionaries {mention: (en_id, score)}
        ranked = sorted(query_inss.get_all(), key=lambda item: item.score, reverse=True)
        for ins in ranked:
            mention = ins.get_property("mention")
            entry = (ins.get_property("en_id"), ins.score)
            added = False
            # No early exit: the instance joins *every* compatible interpretation.
            for inter in interpretations:
                # A mention already present in `inter` appears twice in this
                # list and is therefore reported as overlapping.
                if not self.is_overlapping(list(inter.keys()) + [mention]):
                    inter[mention] = entry
                    added = True
            if not added:
                interpretations.append({mention: entry})
        return interpretations

    def is_overlapping(self, mentions):
        """
        Checks whether the strings of a set are overlapping or not,
        i.e. if there exists a term that appears in more than one mention.

        E.g. {"the", "music man"} is not overlapping
             {"the", "the man", "music"} is overlapping.

        NOTE: Terms are compared as strings, not by their position in the
        query. For a query "y x x z" the mentions {"y x", "x z"} and
        {"y x", "x"} are overlapping. A term repeated inside a single mention
        does not count as an overlap.

        :param mentions: A list of strings
        :return: True/False
        """
        word_list = []
        for mention in mentions:
            # De-duplicate within a mention so only cross-mention repeats count.
            word_list.extend(set(mention.split()))
        return len(word_list) != len(set(word_list))