Source code for nordlys.core.storage.parser.nt_parser

"""
NTriples Parser
===============

NTriples parser with URI prefixing

:Author: Krisztian Balog
"""

import sys
import logging
from nordlys.core.storage.parser.uri_prefix import URIPrefix
from rdflib.plugins.parsers.ntriples import NTriplesParser
from rdflib.term import URIRef
from nordlys.config import PLOGGER


class Triple(object):
    """Representation of a Triple to be used by the rdflib NTriplesParser.

    An instance holds the subject, predicate and object most recently passed
    to :meth:`triple` and exposes each of them either unchanged or in
    prefixed form (shortened by the prefix handler and wrapped in ``<>``).
    """

    def __init__(self, prefix=None):
        """Create an empty triple.

        :param prefix: prefix handler; must provide ``get_prefixed(uri)``.
            Only needed when one of the ``*_prefixed`` accessors is used.
        """
        self.__s = None
        self.__p = None
        self.__o = None
        self.__prefix = prefix

    def triple(self, s, p, o):
        """Assign current triple object

        :param s: subject
        :param p: predicate
        :param o: object
        """
        self.__s = s
        self.__p = p
        self.__o = o

    def __prefix_uri(self, uri):
        """Prefix URI and enclose in between <>

        :param uri: URI to be prefixed
        :return: prefixed form of the URI, enclosed in between <>
        :raises Exception: if no prefix handler was given to the constructor
        """
        if self.__prefix is None:
            raise Exception("Prefix handler is not set!")
        return "<" + self.__prefix.get_prefixed(uri) + ">"

    def subject(self):
        """Return the subject as it was assigned."""
        return self.__s

    def subject_prefixed(self):
        """Return the subject in prefixed form, enclosed in <>."""
        return self.__prefix_uri(self.__s)

    def predicate(self):
        """Return the predicate as it was assigned."""
        return self.__p

    def predicate_prefixed(self):
        """Return the predicate in prefixed form, enclosed in <>."""
        return self.__prefix_uri(self.__p)

    def object(self):
        """Return the object as it was assigned."""
        return self.__o

    def object_prefixed(self):
        """Return the object, prefixed only when it is a URI.

        Non-URI objects are returned unchanged.
        """
        # isinstance (rather than an exact type comparison) so that
        # subclasses of URIRef are treated as URIs as well.
        if isinstance(self.__o, URIRef):  # only URI objects
            return self.__prefix_uri(self.__o)
        return self.__o
class TripleHandler(object):
    """Base class for triple handlers.

    Subclasses override :meth:`triple_parsed` to act on each triple; the
    implementation here does nothing.
    """

    def triple_parsed(self, triple):
        """Callback invoked once per parsed triple.

        :param triple: the triple that was parsed
        :return: None
        """
        return None
class NTParser(object):
    """NTriples parser class.

    Feeds a file line by line to the rdflib NTriples parser and reports
    every parsed triple to a handler object.
    """

    def __init__(self):
        logging.basicConfig(level="ERROR")  # no warnings from the rdf parser

    def parse_file(self, filename, triplehandler):
        """Parses file and calls callback function with the parsed triple

        :param filename: path of the NTriples file to read
        :param triplehandler: object whose ``triple_parsed(triple)`` method is
            called once for every triple found in the file. The same Triple
            instance is reused for all calls, so handlers must read what they
            need before returning.
        """
        PLOGGER.info("Processing " + filename + "...")

        prefix = URIPrefix()
        t = Triple(prefix)
        p = NTriplesParser(t)
        i = 0  # number of triples handed to the handler so far

        with open(filename) as f:
            for line in f:
                # Clear the sink first: the parser only assigns to it when the
                # line holds a triple, so without the reset a blank or comment
                # line would leave the previous triple in place and the
                # handler would be called with it a second time.
                t.triple(None, None, None)
                p.parsestring(line)
                if t.subject() is None:  # only if parsed as a triple
                    continue

                # call the handler object with the parsed triple
                triplehandler.triple_parsed(t)

                i += 1
                if i % 10000 == 0:
                    # floor division keeps the count an integer ("10K", not "10.0K")
                    PLOGGER.info(str(i // 1000) + "K lines processed")
class TripleHandlerPrinter(TripleHandler):
    """Example triple handler that only prints whatever it received."""

    def triple_parsed(self, triple):
        """Log subject, predicate and object, each in raw and prefixed form.

        :param triple: triple object providing ``subject()``, ``predicate()``,
            ``object()`` and the matching ``*_prefixed()`` accessors
        """
        # The messages are built with str.format instead of "+": rdflib's
        # URIRef overloads string addition and returns a new URIRef, so
        # concatenating would turn the log text itself into a (bogus) URI
        # object. Formatting yields a plain string with the same text.
        PLOGGER.info("S: {} ==> {}".format(
            triple.subject(), triple.subject_prefixed()))
        PLOGGER.info(" P: {} ==> {}".format(
            triple.predicate(), triple.predicate_prefixed()))
        PLOGGER.info(" O: {} ==> {}".format(
            triple.object(), triple.object_prefixed()))
def main(argv):
    """Parse an NTriples file and log every triple found in it.

    :param argv: command-line arguments without the program name. The first
        argument, when present, is the path of the file to parse; with no
        arguments the default DBpedia labels file is used.
    """
    # Fall back to the original hard-coded path when no file is given.
    filename = argv[0] if argv else "/scratch/data/dbpedia-3.9/labels_en.nt"
    parser = NTParser()
    thp = TripleHandlerPrinter()
    parser.parse_file(filename, thp)
# Script entry point: pass the command-line arguments (program name
# stripped) on to main().
if __name__ == "__main__":
    main(sys.argv[1:])