Source code for nordlys.logic.el.greedy

"""
Generative model for interpretation set finding

@author: Faegheh Hasibi
"""

from __future__ import division

from nordlys.core.ml.instances import Instances


class Greedy(object):
    """Greedy finder of entity linking interpretation sets.

    The pipeline (see :meth:`disambiguate`) is:

    1. drop instances whose ranking score is below a fixed threshold,
    2. drop instances whose mention contains, or is contained in, a
       higher-scored mention,
    3. group the remaining instances into interpretations whose mentions
       do not share any term.

    :param score_th: minimum ranking score; instances with a score strictly
        below this value are pruned.
    """

    def __init__(self, score_th):
        self.__score_th = score_th

    def disambiguate(self, inss):
        """
        Takes instances and generates set of entity linking interpretations.

        :param inss: Instances object
        :return: sets of interpretations [{mention: (en_id, score), ..}, ...]
        """
        pruned_inss = self.prune_by_score(inss)
        pruned_inss = self.prune_containment_mentions(pruned_inss)
        return self.create_interpretations(pruned_inss)

    def prune_by_score(self, query_inss):
        """Prunes based on a static threshold of ranking score.

        :param query_inss: Instances object
        :return: new Instances object holding only the instances whose
            ``score`` is greater than or equal to the threshold
        """
        valid_inss = [ins for ins in query_inss.get_all()
                      if ins.score >= self.__score_th]
        return Instances(valid_inss)

    def prune_containment_mentions(self, query_inss):
        """Deletes containment mentions, if they have lower score.

        Instances are visited from highest to lowest score. An instance is
        dropped when its mention differs from an already accepted mention
        and one of the two is a substring of the other. The comparison is a
        character-level substring test, not a term-level one. Instances
        sharing exactly the same mention are all kept.

        :param query_inss: Instances object
        :return: Instances object with the surviving instances; when the
            input holds no instances, the input object itself is returned
        """
        all_inss = query_inss.get_all()
        if len(all_inss) == 0:
            return query_inss

        valid_inss = []
        valid_mens = set()
        ranked = sorted(all_inss, key=lambda item: item.score, reverse=True)
        for ins in ranked:
            cand_men = ins.get_property("mention")
            is_contained = any(
                cand_men != men and (cand_men in men or men in cand_men)
                for men in valid_mens
            )
            if not is_contained:
                valid_inss.append(ins)
                # TODO: the original code flags this line as needing a fix;
                # the intended change is not specified.
                valid_mens.add(cand_men)
        return Instances(valid_inss)

    def create_interpretations(self, query_inss):
        """
        Groups CER instances as interpretation sets.

        Instances are visited from highest to lowest score. Each instance is
        added to *every* existing interpretation whose mentions do not
        overlap with its mention (see :meth:`is_overlapping`); if it fits
        none of them, it starts a new interpretation. An instance whose
        mention is already a key of an interpretation always overlaps with
        it, so existing entries are never overwritten.

        :param query_inss: Instances object
        :return: list of interpretations, where each interpretation is a
            dictionary {mention: (en_id, score), ..}; for an input without
            instances the result is a list holding one empty dictionary
        """
        interpretations = [dict()]
        ranked = sorted(query_inss.get_all(), key=lambda item: item.score,
                        reverse=True)
        for ins in ranked:
            mention = ins.get_property("mention")
            entry = (ins.get_property("en_id"), ins.score)
            added = False
            for inter in interpretations:
                if not self.is_overlapping(list(inter.keys()) + [mention]):
                    inter[mention] = entry
                    added = True
            if not added:
                # Appended only after the loop above has finished, so the
                # new interpretation is not visited for this same instance.
                interpretations.append({mention: entry})
        return interpretations

    def is_overlapping(self, mentions):
        """
        Checks whether the strings of a set overlapping or not.
        i.e. if there exists a term that appears in more than one mention.

        E.g. {"the", "music man"} is not overlapping
             {"the", "the man", "music"} is overlapping.

        A term repeated inside a single mention does not count as overlap.
        The same mention listed twice does count.

        NOTE: If a query is "yxxz" the mentions {"yx", "xz"} and {"yx", "x"} are overlapping.

        :param mentions: A list of strings
        :return: True/False
        """
        seen_terms = set()
        for mention in mentions:
            # Terms are deduplicated per mention, so only terms shared
            # between different list entries are detected.
            terms = set(mention.split())
            if not seen_terms.isdisjoint(terms):
                return True
            seen_terms |= terms
        return False