Source code for

NTriples Parser

NTriples parser with URI prefixing

:Author: Krisztian Balog

import sys
import logging
from import URIPrefix
from rdflib.plugins.parsers.ntriples import NTriplesParser
from rdflib.term import URIRef
from nordlys.config import PLOGGER

class Triple(object):
    """Representation of a Triple to be used by the rdflib NTriplesParser.

    An instance acts as the parser's sink: the parser hands each parsed
    statement to :meth:`triple`, and the stored terms can then be read back
    either as-is or in prefixed form.

    :param prefix: object providing ``get_prefixed(uri)``; only required by
        the ``*_prefixed`` accessors
    """

    def __init__(self, prefix=None):
        self.__s = None
        self.__p = None
        self.__o = None
        self.__prefix = prefix

    def triple(self, s, p, o):
        """Assign current triple object

        :param s: subject
        :param p: predicate
        :param o: object
        """
        self.__s = s
        self.__p = p
        self.__o = o

    def __prefix_uri(self, uri):
        """Prefix URI and enclose in between <>

        :param uri: URI to be prefixed
        :return: prefixed form of the URI, enclosed in between <>
        :raises Exception: if no prefix handler was passed to the constructor
        """
        if self.__prefix is None:
            raise Exception("Prefix handler is not set!")
        return "<" + self.__prefix.get_prefixed(uri) + ">"

    def subject(self):
        """Return the subject as it was assigned."""
        return self.__s

    def subject_prefixed(self):
        """Return the subject in prefixed form, enclosed in <>."""
        return self.__prefix_uri(self.__s)

    def predicate(self):
        """Return the predicate as it was assigned."""
        return self.__p

    def predicate_prefixed(self):
        """Return the predicate in prefixed form, enclosed in <>."""
        return self.__prefix_uri(self.__p)

    def object(self):
        """Return the object as it was assigned."""
        return self.__o

    def object_prefixed(self):
        """Return the object, prefixed only when it is a URI.

        Any object that is not a ``URIRef`` is returned unchanged.
        """
        if isinstance(self.__o, URIRef):  # only URI objects
            return self.__prefix_uri(self.__o)
        return self.__o
class TripleHandler(object):
    """This is an abstract class

    Subclasses override :meth:`triple_parsed` to process triples; the
    implementation here does nothing.
    """

    def triple_parsed(self, triple):
        """This method is called each time a triple is parsed, with the triple as parameter.

        :param triple: the triple that was just parsed
        """
        pass
class NTParser(object):
    """NTriples parser class

    Feeds a file line by line to the rdflib ``NTriplesParser`` and forwards
    every parsed triple to a handler object.
    """

    def __init__(self):
        logging.basicConfig(level="ERROR")  # no warnings from the rdf parser

    def parse_file(self, filename, triplehandler):
        """Parses file and calls callback function with the parsed triple

        :param filename: path of the file to parse
        :param triplehandler: object whose ``triple_parsed(triple)`` method is
            called for every parsed triple; the same ``Triple`` instance is
            passed on every call and is overwritten by the next line
        """
        # NOTE(review): the call target of the two log statements in this
        # method was missing from the source text; PLOGGER.info is assumed
        # because PLOGGER is the logger this module imports -- confirm.
        PLOGGER.info("Processing " + filename + "...")

        prefix = URIPrefix()
        t = Triple(prefix)
        p = NTriplesParser(t)
        i = 0

        with open(filename) as f:
            for line in f:
                # Clear the shared sink before each line: if the parser does
                # not report a triple for this line, the previous triple must
                # not be handed to the handler a second time.
                t.triple(None, None, None)
                p.parsestring(line)
                if t.subject() is None:  # only if parsed as a triple
                    continue

                # call the handler object with the parsed triple
                triplehandler.triple_parsed(t)

                i += 1
                if i % 10000 == 0:
                    # Floor division keeps the counter integral ("10K"
                    # rather than "10.0K").
                    PLOGGER.info(str(i // 1000) + "K lines processed")
class TripleHandlerPrinter(TripleHandler):
    """Example triple handler that only prints whatever it received."""

    def triple_parsed(self, triple):
        """Output each term of the triple next to its prefixed form.

        :param triple: triple exposing ``subject()``, ``predicate()``,
            ``object()`` and their ``*_prefixed()`` counterparts
        """
        # NOTE(review): the call target of these three statements was missing
        # from the source text; PLOGGER.info is assumed because PLOGGER is
        # the logger this module imports -- confirm.
        PLOGGER.info("S: " + triple.subject() + " ==> " + triple.subject_prefixed())
        PLOGGER.info(" P: " + triple.predicate() + " ==> " + triple.predicate_prefixed())
        PLOGGER.info(" O: " + triple.object() + " ==> " + triple.object_prefixed())
def main(argv):
    """Parse an NTriples file and pass every triple to a TripleHandlerPrinter.

    :param argv: command-line arguments without the program name; when
        non-empty, the first element is the file to parse, otherwise the
        default labels file is used
    """
    filename = argv[0] if argv else "/scratch/data/dbpedia-3.9/labels_en.nt"
    parser = NTParser()
    thp = TripleHandlerPrinter()
    parser.parse_file(filename, thp)


if __name__ == "__main__":
    main(sys.argv[1:])