Source code for nordlys.services.el

"""
Entity Linking
==============

The command-line application for entity linking

Usage
-----

::

  python -m nordlys.services.el -c <config_file> -q <query>

If `-q <query>` is passed, the results for the specified query are returned and printed in the terminal.
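
For example, to link a single query with a CMNS config (the config path and query below are only illustrative)::

  python -m nordlys.services.el -c config/el_cmns.json -q "obama family tree"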

Config parameters
-----------------

- **method**: name of the method
    - **cmns**: the baseline method that uses the overall popularity of entities as link targets
    - **ltr**: the learning-to-rank model
- **threshold**: entity linking threshold; varies depending on the method *(default: 0.1)*
- **step**: the step of the entity linking process, one of [linking|ranking|disambiguation] *(default: linking)*
- **kb_snapshot**: file containing the KB snapshot of proper named entities; required for LTR and optional for CMNS
- **query_file**: name of the query file (JSON)
- **output_file**: name of the output file

*Parameters of LTR method:*

- **model_file**: the trained model file *(default: "data/el/model.txt")*
- **ground_truth**: the ground truth file *(optional)*
- **gen_training_set**: if True, generates the training set from the ground truth and query files *(default: False)*
- **gen_model**: if True, trains the model from the training set *(default: False)*
- The remaining parameters are similar to the nordlys.core.ml.ml settings


Example config
---------------

.. code:: python

	{
	  "method": "cmns",
	  "threshold": 0.1,
	  "query_file": "path/to/queries.json"
	  "output_file": "path/to/output.json"
	}
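
For the LTR method, a config might look like the following; the values below simply restate the documented defaults and all file paths are illustrative:

.. code:: python

	{
	  "method": "ltr",
	  "threshold": 0.1,
	  "model_file": "data/el/model.txt",
	  "kb_snapshot": "data/el/snapshot_2015_10.txt",
	  "query_file": "path/to/queries.json",
	  "output_file": "path/to/output.json"
	}

The query file is a JSON object mapping query IDs to raw query strings (this is the format read in batch linking); the IDs and queries below are illustrative:

.. code:: python

	{
	  "q1": "obama family tree",
	  "q2": "ritz carlton lake las vegas"
	}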

------------------------

:Author: Faegheh Hasibi
"""

import argparse
import json
from pprint import pprint

import pickle

from nordlys.config import ELASTIC_INDICES, PLOGGER
from nordlys.core.ml.instances import Instances
from nordlys.core.retrieval.elastic_cache import ElasticCache
from nordlys.core.utils.file_utils import FileUtils
from nordlys.logic.el.cmns import Cmns
from nordlys.logic.el.el_utils import load_kb_snapshot, to_elq_eval
from nordlys.logic.el.ltr import LTR
from nordlys.logic.entity.entity import Entity
from nordlys.logic.features.feature_cache import FeatureCache
from nordlys.logic.query.query import Query

# Constants
DBPEDIA_INDEX = ELASTIC_INDICES[0]


class EL(object):
    def __init__(self, config, entity, elastic=None, fcache=None):
        self.__check_config(config)
        self.__config = config
        self.__method = config["method"]
        self.__threshold = float(config["threshold"])
        self.__query_file = config.get("query_file", None)
        self.__output_file = config.get("output_file", None)
        self.__entity = entity
        self.__elastic = elastic
        self.__fcache = fcache
        self.__model = None
        if "kb_snapshot" in self.__config:
            load_kb_snapshot(self.__config["kb_snapshot"])

    @staticmethod
    def __check_config(config):
        """Checks config parameters and sets default values."""
        if config.get("method", None) is None:
            config["method"] = "ltr"
        if config.get("step", None) is None:
            config["step"] = "linking"
        if config.get("threshold", None) is None:
            config["threshold"] = 0.1
        if config["method"] == "ltr":
            if config.get("model_file", None) is None:
                config["model_file"] = "data/el/model.txt"
            if config.get("kb_snapshot", None) is None:
                config["kb_snapshot"] = "data/el/snapshot_2015_10.txt"
        return config

    def __get_linker(self, query):
        """Returns the entity linker based on the configured method and parameters.

        :param query: query object
        :return: entity linking object
        """
        if self.__method.lower() == "cmns":
            return Cmns(query, self.__entity, threshold=self.__threshold)
        elif self.__method.lower() == "ltr":
            if self.__model is None:
                self.__model = pickle.load(open(self.__config["model_file"], "rb"))
            return LTR(query, self.__entity, self.__elastic, self.__fcache, self.__model,
                       threshold=self.__threshold)
        else:
            raise Exception("Unknown method " + self.__method)
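
    def link(self, query, qid=""):
        """Performs entity linking for a single query.

        (Reconstructed sketch: ``link`` is called by ``batch_linking`` and ``main``
        but its body is missing from this listing; the return structure is an
        assumption.)

        :param query: query string
        :param qid: query ID
        :return: dict with the query and the linked entities
        """
        q = Query(query, qid)
        linker = self.__get_linker(q)
        # The linker's link() method is assumed to return the annotated entities.
        return {"query": query, "results": linker.link()}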

    def batch_linking(self):
        """Scores queries in a batch and outputs results."""
        results = {}

        if self.__config["step"] == "linking":
            queries = json.load(open(self.__query_file))
            for qid in sorted(queries):
                results[qid] = self.link(queries[qid], qid)
            to_elq_eval(results, self.__output_file)
            # json.dump(results, open(self.__output_file, "w"), indent=4, sort_keys=True)

        # only ranking step
        if self.__config["step"] == "ranking":
            queries = json.load(open(self.__query_file))
            for qid in sorted(queries):
                linker = self.__get_linker(Query(queries[qid], qid))
                results[qid] = linker.rank_ens()
            ranked_inss = Instances(sum([inss.get_all() for inss in results.values()], []))
            ranked_inss.to_treceval(self.__output_file)
            if self.__config.get("json_file", None):
                ranked_inss.to_json(self.__config["json_file"])

        # only disambiguation step
        if self.__config["step"] == "disambiguation":
            inss = Instances.from_json(self.__config["test_set"])
            inss_by_query = inss.group_by_property("qid")
            for qid, q_inss in sorted(inss_by_query.items()):
                linker = self.__get_linker("")
                results[qid] = {"results": linker.disambiguate(Instances(q_inss))}
            if self.__config.get("json_file", None):
                json.dump(results, open(self.__config["json_file"], "w"), indent=4, sort_keys=True)
            to_elq_eval(results, self.__output_file)

        PLOGGER.info("Output file: " + self.__output_file)


def arg_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument("-q", "--query", help="query string", type=str, default=None)
    parser.add_argument("-c", "--config", help="config file", type=str, default={})
    args = parser.parse_args()
    return args


def main(args):
    conf = FileUtils.load_config(args.config)
    el = EL(conf, Entity(), ElasticCache(DBPEDIA_INDEX), FeatureCache())
    if conf.get("gen_model", False):
        LTR.train(conf)

    if args.query:
        res = el.link(args.query)
        pprint(res)
    else:
        el.batch_linking()


if __name__ == '__main__':
    main(arg_parser())