Source code for nordlys.core.ml.instances

"""
Instances
=========

Instances used for Machine learning algorithms.

    - Manages a set of Instance objects
    - Loads instance-data from JSON or TSV files
        - When using TSV, instance properties, target, and features are loaded from separate files
    - Generates a list of instances in JSON or RankLib format

:Authors: Faegheh Hasibi, Krisztian Balog
"""

import csv
import json
from sys import argv

from collections import defaultdict
from nordlys.core.ml.instance import Instance
from nordlys.config import PLOGGER


class Instances(object):
    """Container for a set of Instance objects, indexed by instance id.

    Class attributes:
        instances: Instance objects stored in a dictionary indexed by instance id
    """

    def __init__(self, instances=None):
        """Builds the collection from an optional list or dict.

        :param instances: instances in a list or dict
            - if list: a list of Instance objects; each is stored under its own ``id``
            - if dict: ``{instance_id: fields}``; each entry is converted with
              ``Instance.from_json(instance_id, fields)``
            - anything else (including None) yields an empty collection
        """
        self.__instances = {}
        # isinstance (rather than an exact type comparison) also accepts list/dict subclasses
        if isinstance(instances, list):
            for ins in instances:
                self.add_instance(ins)
        elif isinstance(instances, dict):
            for ins_id, fields in instances.items():
                self.add_instance(Instance.from_json(ins_id, fields))

    def append_instances(self, ins_list):
        """Adds every Instance object of the given list.

        :param ins_list: list of Instance objects
        """
        for ins in ins_list:
            self.add_instance(ins)

    def add_instance(self, instance):
        """Adds an Instance object; an existing instance with the same id is replaced.

        :param instance: Instance object
        """
        self.__instances[instance.id] = instance

    def get_instance(self, instance_id):
        """Returns an instance by instance id.

        :param instance_id: (string)
        :return: Instance object, or None if the id is unknown
        """
        return self.__instances.get(instance_id)

    def get_all(self):
        """Returns list of all instances."""
        return list(self.__instances.values())

    def get_all_ids(self):
        """Returns list of all instance ids."""
        return list(self.__instances.keys())

    def __load_from_tsv(self, tsv_file, type, params):
        """Loads instance data from a TSV file.

        The file must have a header row containing an "id" column; the columns
        after the first one must be exactly the names given in ``params``.
        Rows whose id is not known yet create a new Instance.

        :param tsv_file: name of the TSV file
        :param type: type of the data: "properties", "features" or "target"
        :param params: list of columns mapped to properties or features
        :raises Exception: if the header (minus its first column) does not match ``params``
        """
        # The csv module needs a text-mode file opened with newline="" on Python 3;
        # binary mode makes the reader fail on the first row.
        with open(tsv_file, "r", newline="") as tsvfile:
            reader = csv.DictReader(tsvfile, delimiter="\t", quoting=csv.QUOTE_NONE)

            # fieldnames is None for an empty file; treat that as an empty header
            header = reader.fieldnames or []
            if set(params) != set(header[1:]):
                raise Exception("TSV header does not match params \"" + ",".join(params) + "\" in file:\n\t" + tsv_file)

            for line in reader:
                ins_id = line["id"]

                # reuse the existing instance or register a new one
                ins = self.get_instance(ins_id)
                if ins is None:
                    ins = Instance(ins_id)
                    self.add_instance(ins)

                for param in params:
                    if type == "properties":
                        ins.add_property(param, line[param])
                    elif type == "features":
                        ins.add_feature(param, line[param])
                    elif type == "target":
                        ins.target = line[param]

    def add_properties_from_tsv(self, tsv_file, properties):
        """Loads the given columns of a TSV file as instance properties.

        :param tsv_file: name of the TSV file
        :param properties: list of column names to load as properties
        """
        self.__load_from_tsv(tsv_file, "properties", properties)

    def add_features_from_tsv(self, tsv_file, features):
        """Loads the given columns of a TSV file as instance features.

        :param tsv_file: name of the TSV file
        :param features: list of column names to load as features
        """
        self.__load_from_tsv(tsv_file, "features", features)

    def add_target_from_tsv(self, tsv_file):
        """Loads the "target" column of a TSV file as the instance target.

        :param tsv_file: name of the TSV file
        """
        self.__load_from_tsv(tsv_file, "target", ["target"])

    @classmethod
    def from_json(cls, json_file):
        """Loads instances from a JSON file.

        :param json_file: (string)
        :return Instances object
        """
        PLOGGER.info("Reading JSON file " + json_file + " ...")
        # context manager so the file handle is released right after parsing
        with open(json_file) as json_data:
            data = json.load(json_data)
        instance_list = [Instance.from_json(ins_id, fields) for ins_id, fields in data.items()]
        return cls(instance_list)

    def group_by_property(self, property):
        """Groups instances by the value of a given property.

        :param property: name of the property
        :return a dictionary {property value: [ml.Instance, ...], ...}
        """
        property_dict = defaultdict(list)
        for ins in self.get_all():
            property_dict[ins.get_property(property)].append(ins)
        return property_dict

    def to_json(self, json_file=None):
        """Converts all instances to JSON and optionally writes it to a file.

        :param json_file: (string) output file; nothing is written if None
        :return: JSON dump of all instances.
        """
        inss_json = {}
        for ins in self.get_all():
            inss_json.update(ins.to_json())

        if json_file is not None:
            # closing the file guarantees the dump is flushed to disk
            with open(json_file, "w") as out:
                json.dump(inss_json, out, indent=4, sort_keys=True)
            PLOGGER.info("JSON output:\t" + json_file)
        return inss_json

    def to_str(self, file_name=None):
        """Converts instances to strings, one instance per line.

        :param file_name: output file; if None, nothing is written
        :return: String format of instances if no file is given, None otherwise
        """
        # lazy, so a large collection is streamed to the file instead of held in memory
        lines = (ins.to_str() + "\n" for ins in self.get_all())
        if file_name is None:
            return "".join(lines)

        with open(file_name, "w") as out_file:
            out_file.writelines(lines)
        PLOGGER.info("String output:\t" + file_name)
        return None

    def to_treceval(self, file_name, qid_prop="qid", docid_prop="en_id"):
        """Generates a TREC style run file.

        - Instances without a score are skipped.
        - If there is an entity ranked more than once for the same query,
          the one with higher score is kept.

        :param file_name: File to write TREC file
        :param qid_prop: Name of instance property to be used as query ID (1st column)
        :param docid_prop: Name of instance property to be used as document ID (3rd column)
        """
        # {qid: {doc_id: best score}}
        unique_entries = defaultdict(dict)
        for ins in self.get_all():
            if ins.score is None:
                continue
            qid, doc_id = ins.get_property(qid_prop), ins.get_property(docid_prop)
            score = unique_entries[qid].get(doc_id)
            if (score is None) or (score < ins.score):
                unique_entries[qid][doc_id] = ins.score

        lines = []
        for qid, docs in sorted(unique_entries.items()):
            ranked = sorted(docs.items(), key=lambda x: x[1], reverse=True)
            for rank, (doc_id, score) in enumerate(ranked, start=1):
                # str() so that non-string ids (e.g. integer q_ids) can be written too
                lines.append(str(qid) + "\tQ0\t" + str(doc_id) + "\t" + str(rank) + "\t"
                             + "{0:.5f}".format(score) + "\tnordlys\n")

        with open(file_name, "w") as out_file:
            out_file.writelines(lines)
        PLOGGER.info("Trec-eval output:\t" + file_name)

    def to_libsvm(self, file_name=None, qid_prop=None):
        """Converts all instances to the LibSVM format and writes them to the file.

        - Libsvm format:
            <line> .=. <target> qid:<qid> <feature>:<value> ... # <info>
            <target> .=. <float>
            <qid> .=. <positive integer>
            <feature> .=. <positive integer>
            <value> .=. <float>
            <info> .=. <string>
        - Example: 3 qid:1 1:1 2:1 3:0 4:0.2 5:0 # 1A

        NOTES:
            - The property used for qid (qid_prop) should hold integers
            - For pointwise algorithms, we use instance id for qid
            - Lines in the RankLib input have to be sorted by increasing qid.
            - The feature set is taken from the first stored instance.

        :param file_name: File to write libsvm format of instances. If None,
            nothing is written and the libsvm text is returned instead.
        :param qid_prop: property to be used as qid. If None, the instance id
            is used (and must be convertible to int).
        :return: "" if there are no instances; the libsvm text if file_name is
            None; None otherwise
        """
        # If no entity matches query
        if len(self.__instances) == 0:
            PLOGGER.info("No instance is created!!")
            if file_name is not None:
                with open(file_name, "w") as out_file:
                    out_file.write("")
            return ""

        # Getting features
        first = next(iter(self.__instances.values()))
        features = sorted(first.features.keys())

        # Feature names go into a comment header line
        header = "# target instance_Id" + "".join(" " + feature for feature in features) + "\n"

        # sort instances by qid
        if qid_prop is None:
            sorted_instances = sorted(self.get_all(), key=lambda ins: int(ins.id))
        else:
            sorted_instances = sorted(self.get_all(), key=lambda ins: int(ins.get_property(qid_prop)))

        PLOGGER.info("Converting instances to ranklib format ...")
        lines = (ins.to_libsvm(features, qid_prop) + "\n" for ins in sorted_instances)

        if file_name is None:
            return header + "".join(lines)

        with open(file_name, "w") as out_file:
            out_file.write(header)
            out_file.writelines(lines)
        PLOGGER.info("Libsvm output:\t" + file_name)
        return None

    def add_qids(self, prop):
        """Generates (integer) q_id-s (for libsvm) based on a given (non-integer) property.

        Each distinct value of the property gets a unique integer, starting
        from 1 in order of first appearance; the integer is stored on every
        instance as the "q_id" property.

        :param prop: name of the property.
        :return:
        """
        prop_ids = {}
        for ins in self.get_all():
            value = ins.get_property(prop)
            if value not in prop_ids:
                prop_ids[value] = len(prop_ids) + 1
            ins.add_property("q_id", prop_ids[value])
def main(args):
    """Example driver: builds instances from three TSV files and dumps them.

    :param args: list of file names, in this order:
        [0] TSV file with properties, [1] TSV file with features,
        [2] TSV file with the target value
    """
    instances = Instances()

    # Each list names the columns to load from the corresponding file.
    # Properties come from a single file ...
    instances.add_properties_from_tsv(args[0], ["sequence"])

    # ... features may come from one or more files, e.g.
    #   instances.add_features_from_tsv(feat_file_2, ["feature4"])
    #   instances.add_features_from_tsv(feat_file_3, ["feature5", "feature6"])
    feature_names = ["sentence_length", "article_length", "sentence_order", "predicate_tense"]
    instances.add_features_from_tsv(args[1], feature_names)

    # ... and the target value from its own file.
    instances.add_target_from_tsv(args[2])

    PLOGGER.info(instances.to_str())
    instances.to_json("data/maff.json")

    # Example: converting a JSON file to libsvm format.
    #   inss = Instances.from_json("data/ml/maff.json")   # load from json file
    #   inss.add_qids("transaction_id")                   # add q_id-s based on transaction_id
    #   inss.to_json("data/ml/maff2.json")                # write q_id property back to json file
    #   inss.to_libsvm("data/ml/maff.libsvm", "q_id")     # write to libsvm file


if __name__ == "__main__":
    main(argv[1:])