Source code for nordlys.logic.features.ftr_entity_similarity

"""
FTR Entity Similarity
=====================

Implements features capturing the similarity between an entity and a query.

:Author: Faegheh Hasibi
"""

from __future__ import division
import re
import math

from nordlys.core.retrieval.elastic import Elastic
from nordlys.core.retrieval.scorer import ScorerMLM


class FtrEntitySimilarity(object):
    DEBUG = 0

    def __init__(self, query, en_id, elastic):
        self.__query = query
        self.__en_id = en_id
        self.__elastic = elastic
    def lm_score(self, field=Elastic.FIELD_CATCHALL):
        """
        Query length normalized LM score between an entity field and the query.

        :param field: field name
        :return: LM score
        """
        raw_score = self.nllr(self.__query, {field: 1})
        score = math.exp(raw_score) if raw_score else 0
        return score
    def mlm_score(self, field_weights):
        """
        Query length normalized MLM similarity between the entity and the query.

        :param field_weights: dictionary {field: weight, ...}
        :return: MLM score
        """
        raw_score = self.nllr(self.__query, field_weights)
        score = math.exp(raw_score) if raw_score else 0
        return score
    def context_sim(self, mention, field=Elastic.FIELD_CATCHALL):
        """
        LM score of the entity against the query context, i.e., the query with the mention removed.
        E.g. for the query "uss yorktown charleston" and mention "uss", the query context is "yorktown charleston".

        :param mention: string
        :param field: field name
        :return: context similarity score
        """
        # get query context
        match = re.search(mention, self.__query)
        if match is None:
            raise Exception("Mention \"" + mention + "\" is not found in the query \"" + self.__query + "\"")
        mention_scope = match.span()
        q_context = self.__query[:mention_scope[0]] + self.__query[mention_scope[1]:]
        # scoring
        raw_score = self.nllr(q_context.strip(), {field: 1})
        score = math.exp(raw_score) if raw_score else 0
        return score
    def nllr(self, query, field_weights):
        """
        Computes the normalized query log-likelihood ratio (NLLR):

            NLLR(q,d) = \sum_{t \in q} P(t|q) log P(t|\theta_d) - \sum_{t \in q} P(t|q) log P(t|C)

        where:
            P(t|q) = n(t,q) / |q|
            P(t|C) = \sum_{f} \mu_f * P(t|C_f)
            P(t|\theta_d) = smoothed LM/MLM score

        :param query: query
        :param field_weights: dictionary {field: weight, ...}
        :return: NLLR score
        """
        query = self.__elastic.analyze_query(query)
        scorer_mlm = ScorerMLM(self.__elastic, query, {"fields": field_weights})
        term_probs = scorer_mlm.get_mlm_term_probs(self.__en_id)

        # none of the query terms appear in the collection
        if sum(term_probs.values()) == 0:
            if self.DEBUG:
                print("\t\tP_mlm(q|theta_d) = None")
            return None

        # computes the NLLR score
        query_len = len(query.split())
        left_sum, right_sum = 0, 0
        for t, p_t_theta_d in term_probs.items():
            if p_t_theta_d == 0:  # skips the term if it is not in the collection
                continue
            query_tf = query.split().count(t)
            p_t_C = self.__term_collec_prob(t, field_weights)
            p_t_q = query_tf / query_len
            left_sum += p_t_q * math.log(p_t_theta_d)
            right_sum += p_t_q * math.log(p_t_C)
            if self.DEBUG:
                print("\tP(\"" + t + "\"|d) =", p_t_theta_d, "\tP(\"" + t + "\"|C) =", p_t_C,
                      "\tp(\"" + t + "\"|q) =", p_t_q)
        nllr_q_d = left_sum - right_sum
        if self.DEBUG:
            print("\t\tNLLR(" + query + "|theta_d) = " + str(nllr_q_d))
        return nllr_q_d
    def __term_collec_prob(self, term, fields):
        """
        Computes the term collection probability for NLLR:

            P(t|C) = \sum_{f} \mu_f * P(t|C_f)

        :param term: string
        :param fields: dictionary {field: weight, ...}
        :return: probability P(t|C)
        """
        p_t_C = 0
        for f, mu_f in fields.items():
            len_C_f = self.__elastic.coll_length(f)
            tf_t_C_f = self.__elastic.coll_term_freq(term, f)
            p_t_C += mu_f * (tf_t_C_f / len_C_f)
        return p_t_C
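To make the NLLR computation above concrete, here is a small standalone sketch with made-up term probabilities; the numbers are purely illustrative, nothing below queries Elasticsearch, and it is not part of the module.

    import math

    query_terms = ["uss", "yorktown"]                 # analyzed query terms
    p_t_theta_d = {"uss": 0.02, "yorktown": 0.05}     # smoothed P(t|theta_d), illustrative values
    p_t_C = {"uss": 0.001, "yorktown": 0.0005}        # collection probabilities P(t|C), illustrative values

    query_len = len(query_terms)
    left_sum, right_sum = 0, 0
    for t in query_terms:
        p_t_q = query_terms.count(t) / query_len      # P(t|q) = n(t,q) / |q|
        left_sum += p_t_q * math.log(p_t_theta_d[t])
        right_sum += p_t_q * math.log(p_t_C[t])

    nllr_q_d = left_sum - right_sum
    print(nllr_q_d)  # positive here, since the entity model explains the query better than the collection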
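And a minimal usage sketch, assuming a running Elasticsearch instance behind a nordlys entity index; the index name, entity ID, query, and field weights below are placeholders chosen for illustration, not values prescribed by this module.

    from nordlys.core.retrieval.elastic import Elastic
    from nordlys.logic.features.ftr_entity_similarity import FtrEntitySimilarity

    elastic = Elastic("dbpedia_index")                           # hypothetical index name
    ftr = FtrEntitySimilarity("uss yorktown charleston",         # query
                              "<dbpedia:USS_Yorktown_(CV-10)>",  # hypothetical entity ID
                              elastic)

    print(ftr.lm_score())                                        # LM score on the catch-all field
    print(ftr.mlm_score({Elastic.FIELD_CATCHALL: 0.8,            # illustrative field weights
                         "names": 0.2}))
    print(ftr.context_sim("uss"))                                # entity vs. context "yorktown charleston"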