"""
Trec Qrels
==========
Utility module for working with TREC qrels files.
Usage
-----
Get statistics about a qrels file
``trec_qrels <qrels_file> -o stat``
Filter qrels to contain only documents from a given set
``trec_qrels <qrels_file> -o filter_docs -d <doc_ids_file> -f <output_file>``
Filter qrels to contain only queries from a given set
``trec_qrels <qrels_file> -o filter_qs -q <query_ids_file> -f <output_file>``
:Author: Krisztian Balog
"""
import argparse
from nordlys.config import PLOGGER
class TrecQrels(object):
    """Represents relevance judgments (TREC qrels).

    Judgments are kept in a nested dict of the form
    ``{query_id: {doc_id: relevance}}``. The relevance value is stored as the
    string read from the file, so filtered output reproduces it verbatim.
    """

    def __init__(self, file_name=None):
        """Creates an empty set of judgments, optionally loading a qrels file.

        :param file_name: name of qrels file to load (optional)
        """
        self.__qrels = {}
        if file_name is not None:
            self.load(file_name)

    def load(self, file_name):
        """Loads qrels from file.

        Each line is expected to hold four whitespace-separated fields:
        ``<query_id> <Q0> <doc_id> <relevance>``. Lines with any other number
        of fields are skipped. A later judgment for the same query/document
        pair overwrites an earlier one.

        :param file_name: name of qrels file
        """
        with open(file_name, "r") as f_qrels:
            for line in f_qrels:
                fields = line.split()
                if len(fields) != 4:
                    continue
                query_id, _, doc_id, rel = fields
                self.__qrels.setdefault(query_id, {})[doc_id] = rel

    def get_queries(self):
        """Returns the set of queries."""
        return self.__qrels.keys()

    def get_rel(self, query_id):
        """Returns relevance level for a given query.

        :param query_id: queryID
        :return: dict (docID as key and relevance as value) or None
        """
        return self.__qrels.get(query_id)

    def num_rel(self, query_id, min_rel=1):
        """Returns the number of relevant results for a given query.

        :param query_id: queryID
        :param min_rel: minimum relevance level
        :return: number of relevant results, or None if the query is unknown
        :raises ValueError: if a stored relevance value is not an integer
        """
        judgments = self.__qrels.get(query_id)
        if judgments is None:
            return None
        # Relevance values are stored as strings; convert before comparing,
        # otherwise str >= int is a TypeError on Python 3 (and always true on
        # Python 2).
        return sum(int(rel) >= min_rel for rel in judgments.values())

    def print_stat(self):
        """Prints simple statistics (number of queries and of judgments)."""
        print("#queries: " + str(len(self.__qrels)))
        print("#judments: " + str(sum(len(docs) for docs in self.__qrels.values())))

    @staticmethod
    def _read_ids(file_name):
        """Reads one ID per line from a file, ignoring empty lines.

        :param file_name: File with one ID per line
        :return: set of IDs (surrounding whitespace stripped)
        """
        with open(file_name, "r") as f:
            stripped = (line.strip() for line in f)
            return {entry for entry in stripped if entry}

    def filter_by_doc_ids(self, doc_ids_file, output_file):
        """Filters qrels for a set of selected docIDs and outputs the results to a file.

        :param doc_ids_file: File with one docID per line
        :param output_file: Output file name
        """
        # loading docIDs into a set so that each membership test is O(1)
        doc_ids = self._read_ids(doc_ids_file)
        # filtering qrels
        with open(output_file, "w") as f:
            for query_id, res in self.__qrels.items():
                for doc_id, rel in res.items():
                    if doc_id in doc_ids:
                        f.write("{} Q0 {} {}\n".format(query_id, doc_id, rel))

    def filter_by_query_ids(self, query_ids_file, output_file):
        """Filters qrels for a set of selected queryIDs and outputs the results to a file.

        :param query_ids_file: File with one queryID per line
        :param output_file: Output file name
        """
        # loading queryIDs into a set so that each membership test is O(1)
        query_ids = self._read_ids(query_ids_file)
        # filtering qrels
        with open(output_file, "w") as f:
            for query_id, res in self.__qrels.items():
                if query_id not in query_ids:
                    continue
                for doc_id, rel in res.items():
                    f.write("{} Q0 {} {}\n".format(query_id, doc_id, rel))
# Names of the operations accepted by the -o/--operation command-line option.
CHOICE_STAT = "stat"  # print statistics about the qrels file
CHOICE_FILTER_DOCS = "filter_docs"  # keep only judgments for the given docIDs
CHOICE_FILTER_QS = "filter_qs"  # keep only judgments for the given queryIDs
def arg_parser():
    """Parses the command-line arguments.

    :return: argparse namespace with qrels_file, operation, doc_ids_file,
        query_ids_file and output_file attributes
    """
    operations = [CHOICE_STAT, CHOICE_FILTER_DOCS, CHOICE_FILTER_QS]

    cli = argparse.ArgumentParser()
    # the qrels file is the only mandatory (positional) argument
    cli.add_argument("qrels_file", help="qrels file")
    cli.add_argument("-o", "--operation", choices=operations, help="operation name")
    cli.add_argument(
        "-d",
        "--doc_ids_file",
        type=str,
        help="file with the allowed doc_ids (for filtering)",
    )
    cli.add_argument(
        "-q",
        "--query_ids_file",
        type=str,
        help="file with the allowed query_ids (for filtering)",
    )
    cli.add_argument("-f", "--output_file", type=str, help="output file")
    return cli.parse_args()
def main(args):
    """Runs the operation selected on the command line.

    The qrels file is loaded first. Then, depending on ``args.operation``,
    statistics are printed or the judgments are filtered by docIDs or by
    queryIDs into ``args.output_file``. When a filtering operation lacks one
    of its required file arguments, a message is logged and nothing is
    written. If no operation is given, nothing further is done.

    :param args: parsed command-line arguments (see ``arg_parser``)
    """
    qrels = TrecQrels(args.qrels_file)
    if args.operation == CHOICE_STAT:
        qrels.print_stat()
    elif args.operation == CHOICE_FILTER_DOCS:
        # argparse leaves omitted options as None, so test truthiness;
        # len(None) would raise TypeError instead of reporting the problem.
        if not args.doc_ids_file or not args.output_file:
            PLOGGER.info("doc_ids_file or output_file missing")
        else:
            qrels.filter_by_doc_ids(args.doc_ids_file, args.output_file)
    elif args.operation == CHOICE_FILTER_QS:
        if not args.query_ids_file or not args.output_file:
            PLOGGER.info("query_ids_file or output_file missing")
        else:
            qrels.filter_by_query_ids(args.query_ids_file, args.output_file)
# Script entry point: parse the command-line arguments and run the chosen operation.
if __name__ == "__main__":
    main(arg_parser())