"""
Instances
=========
Instances used for Machine learning algorithms.
- Manages a set of Instance objects
- Loads instance-data from JSON or TSV files
- When using TSV, instance properties, target, and features are loaded from separate files
- Generates a list of instances in JSON or RankLib format
:Authors: Faegheh Hasibi, Krisztian Balog
"""
import csv
import json
from sys import argv
from collections import defaultdict
from nordlys.core.ml.instance import Instance
from nordlys.config import PLOGGER
class Instances(object):
    """Manages a set of Instance objects, indexed by instance id.

    Class attributes:
        instances: Instance objects stored in a dictionary indexed by instance id
    """

    def __init__(self, instances=None):
        """
        :param instances: instances in a list or dict
            - if list: Instance objects, each one is stored under its own ``id``
            - if dict: the key is used as the instance ID and the value is
              passed to ``Instance.from_json`` to build the instance
        """
        self.__instances = {}
        if isinstance(instances, list):
            for ins in instances:
                self.add_instance(ins)
        elif isinstance(instances, dict):
            for ins_id, fields in instances.items():
                self.add_instance(Instance.from_json(ins_id, fields))

    def append_instances(self, ins_list):
        """Appends a list of Instance objects.

        :param ins_list: list of Instance objects
        """
        for ins in ins_list:
            self.add_instance(ins)

    def add_instance(self, instance):
        """Adds an Instance object; an existing instance with the same id is replaced.

        :param instance: Instance object
        """
        self.__instances[instance.id] = instance

    def get_instance(self, instance_id):
        """Returns an instance by instance id.

        :param instance_id: (string)
        :return: Instance object, or None if the id is unknown
        """
        return self.__instances.get(instance_id)

    def get_all(self):
        """Returns list of all instances."""
        return list(self.__instances.values())

    def get_all_ids(self):
        """Returns list of all instance ids."""
        return list(self.__instances.keys())

    def __load_from_tsv(self, tsv_file, type, params):
        """Loads instances from a TSV file.

        The file must have a header row; the instance id is read from the
        column named "id" and the remaining columns must be exactly `params`.
        Rows whose id is not known yet create a new Instance.

        :param tsv_file: name of the TSV file
        :param type: type of the data: "properties", "features" or "target"
        :param params: list of columns mapped to properties or features
        :raises ValueError: if `type` is not one of the supported values
        :raises Exception: if the TSV header does not match `params`
        """
        if type not in ("properties", "features", "target"):
            raise ValueError("Unknown TSV data type: " + str(type))

        # The csv module needs a text-mode file opened with newline="" (Python 3).
        with open(tsv_file, "r", newline="") as tsvfile:
            reader = csv.DictReader(tsvfile, delimiter="\t", quoting=csv.QUOTE_NONE)

            # Checks all the params are in the TSV file header
            # (fieldnames is None for an empty file, which counts as a mismatch).
            fieldnames = reader.fieldnames or []
            if set(params) != set(fieldnames[1:]):
                raise Exception("TSV header does not match params \"" + ",".join(params) + "\" in file:\n\t" + tsv_file)

            # Reads tsv lines
            for line in reader:
                ins_id = line["id"]

                # Reuses the existing instance or creates a new one
                ins = self.get_instance(ins_id)
                if ins is None:
                    ins = Instance(ins_id)
                    self.add_instance(ins)

                # adding params
                for param in params:
                    if type == "properties":
                        ins.add_property(param, line[param])
                    elif type == "features":
                        ins.add_feature(param, line[param])
                    else:  # "target"
                        ins.target = line[param]

    def add_properties_from_tsv(self, tsv_file, properties):
        """Loads instance properties from a TSV file.

        :param tsv_file: name of the TSV file
        :param properties: list of column names to be stored as properties
        """
        self.__load_from_tsv(tsv_file, "properties", properties)

    def add_features_from_tsv(self, tsv_file, features):
        """Loads instance features from a TSV file.

        :param tsv_file: name of the TSV file
        :param features: list of column names to be stored as features
        """
        self.__load_from_tsv(tsv_file, "features", features)

    def add_target_from_tsv(self, tsv_file):
        """Loads instance targets from the "target" column of a TSV file.

        :param tsv_file: name of the TSV file
        """
        self.__load_from_tsv(tsv_file, "target", ["target"])

    @classmethod
    def from_json(cls, json_file):
        """Loads instances from a JSON file.

        :param json_file: (string)
        :return Instances object
        """
        PLOGGER.info("Reading JSON file " + json_file + " ...")
        with open(json_file) as json_data:
            data = json.load(json_data)

        # read instances
        instance_list = [Instance.from_json(ins_id, fields) for ins_id, fields in data.items()]
        return cls(instance_list)

    def group_by_property(self, property):
        """Groups instances by a given property.

        :param property: name of the property
        :return a dictionary keyed by property value {value: [ml.Instance, ...], ...}
        """
        property_dict = defaultdict(list)
        for ins in self.get_all():
            property_dict[ins.get_property(property)].append(ins)
        return property_dict

    def to_json(self, json_file=None):
        """Converts all instances to JSON and optionally writes it to a file.

        :param json_file: (string) output file; nothing is written if None
        :return: JSON dump of all instances.
        """
        inss_json = {}
        for ins in self.get_all():
            inss_json.update(ins.to_json())

        if json_file is not None:
            with open(json_file, "w") as out:
                json.dump(inss_json, out, indent=4, sort_keys=True)
            PLOGGER.info("JSON output:\t" + json_file)
        return inss_json

    def to_str(self, file_name=None):
        """Converts instances to string and optionally writes them to the given file.

        :param file_name: output file; if None, the string is returned instead
        :return: String format of instances, or None when written to a file
        """
        lines = (ins.to_str() + "\n" for ins in self.get_all())
        if file_name is None:
            return "".join(lines)

        # "w" truncates previous contents; lines are streamed to the file.
        with open(file_name, "w") as out_file:
            out_file.writelines(lines)
        PLOGGER.info("String output:\t" + file_name)
        return None

    def to_treceval(self, file_name, qid_prop="qid", docid_prop="en_id"):
        """
        Generates a TREC style run file
        - If there is an entity ranked more than once for the same query, the one with higher score is kept.
        - Instances without a score are skipped.

        :param file_name: File to write TREC file
        :param qid_prop: Name of instance property to be used as query ID (1st column)
        :param docid_prop: Name of instance property to be used as document ID (3rd column)
        """
        # keeps the best score for each (query, document) pair
        unique_entries = defaultdict(dict)
        for ins in self.get_all():
            if ins.score is None:
                continue
            qid, doc_id = ins.get_property(qid_prop), ins.get_property(docid_prop)
            score = unique_entries[qid].get(doc_id)
            if (score is None) or (score < ins.score):
                unique_entries[qid][doc_id] = ins.score

        # sort and rank entities
        lines = []
        for qid, docs in sorted(unique_entries.items()):
            ranked = sorted(docs.items(), key=lambda x: x[1], reverse=True)
            for rank, (doc_id, score) in enumerate(ranked, 1):
                # format() also accepts non-string ids (e.g. integer q_id values)
                lines.append("{0}\tQ0\t{1}\t{2}\t{3:.5f}\tnordlys\n".format(qid, doc_id, rank, score))

        with open(file_name, "w") as out_file:
            out_file.write("".join(lines))
        PLOGGER.info("Trec-eval output:\t" + file_name)

    def to_libsvm(self, file_name=None, qid_prop=None):
        """
        Converts all instances to the LibSVM format and writes them to the file.
        - Libsvm format:
          <line> .=. <target> qid:<qid> <feature>:<value> ... # <info>
          <target> .=. <float>
          <qid> .=. <positive integer>
          <feature> .=. <positive integer>
          <value> .=. <float>
          <info> .=. <string>
        - Example: 3 qid:1 1:1 2:1 3:0 4:0.2 5:0 # 1A

        NOTES:
        - The property used for qid(qid_prop) should hold integers
        - For pointwise algorithms, we use instance id for qid
        - Lines in the RankLib input have to be sorted by increasing qid.
        - The feature names are taken from the first stored instance and are
          written (sorted) as a comment header line.

        :param file_name: File to write libsvm format of instances. If None, nothing is written and
            the libsvm content is returned as a string.
        :param qid_prop: property to be used as qid. If None, instances are sorted by their
            (integer) instance id.
        :return: "" if there are no instances, the libsvm string if file_name is None, otherwise None
        """
        # If no entity matches query
        if len(self.__instances) == 0:
            PLOGGER.info("No instance is created!!")
            if file_name is not None:
                open(file_name, "w").close()  # leaves an empty file behind
            return ""

        # Getting features
        first_ins = next(iter(self.__instances.values()))
        features = sorted(first_ins.features.keys())

        # Adding feature names as header of libsvm file
        header = " ".join(["# target instance_Id"] + features) + "\n"

        # sort instances by qid
        if qid_prop is None:
            sorted_instances = sorted(self.get_all(), key=lambda ins: int(ins.id))
        else:
            sorted_instances = sorted(self.get_all(), key=lambda ins: int(ins.get_property(qid_prop)))

        PLOGGER.info("Converting instances to ranklib format ...")
        body = (ins.to_libsvm(features, qid_prop) + "\n" for ins in sorted_instances)

        if file_name is None:
            return header + "".join(body)

        # "w" truncates previous contents; lines are streamed to the file.
        with open(file_name, "w") as out_file:
            out_file.write(header)
            out_file.writelines(body)
        PLOGGER.info("Libsvm output:\t" + file_name)
        return None

    def add_qids(self, prop):
        """
        Generates (integer) q_id-s (for libsvm) based on a given (non-integer) property.
        It assigns a unique integer value (starting from 1) to each different value for that property
        and stores it in the "q_id" property of every instance.

        :param prop: name of the property.
        :return:
        """
        prop_ids = {}
        for ins in self.get_all():
            p = ins.get_property(prop)
            # a new property value gets the next free integer
            q_id = prop_ids.setdefault(p, len(prop_ids) + 1)
            ins.add_property("q_id", q_id)
def main(args):
    """Loads instances from TSV files, logs them as string and dumps them to JSON.

    :param args: list of file names, in this order:
        [properties TSV, features TSV, target TSV]
    :raises SystemExit: if fewer than three file names are given
    """
    # Fail early with a readable message instead of an IndexError half-way through.
    if len(args) < 3:
        raise SystemExit("Expected 3 arguments: <properties_tsv> <features_tsv> <target_tsv>")

    inss = Instances()

    # we assume that the 1st column is always the ins_id (unique)
    # the list names the remaining columns, i.e. which property or feature each column value is loaded to

    # one file with properties
    inss.add_properties_from_tsv(args[0], ["sequence"])

    # one or more files with features
    inss.add_features_from_tsv(args[1], ["sentence_length", "article_length", "sentence_order", "predicate_tense"])
    # inss.add_features_from_tsv(feat_file_2, ["feature4"])
    # inss.add_features_from_tsv(feat_file_3, ["feature5", "feature6"])

    # one with target value
    inss.add_target_from_tsv(args[2])

    PLOGGER.info(inss.to_str())
    inss.to_json("data/maff.json")

    # *** These lines are used for converting a json file to libsvm format. ***
    # # load from json file
    # inss = Instances.from_json("data/ml/maff.json")
    # # add q_id -s based on transaction_id
    # inss.add_qids("transaction_id")
    # # write q_id property back to json file
    # inss.to_json("data/ml/maff2.json")
    # # write to libsvm file
    # inss.to_libsvm("data/ml/maff.libsvm", "q_id")


if __name__ == "__main__":
    main(argv[1:])