# Source code for nordlys.core.storage.parser.uri_prefix

"""
URI Prefixing
=============

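Abbreviates full URIs by replacing a known namespace with its short prefix
(``<namespace><local name>`` becomes ``<prefix>:<local name>``), using a JSON
file that maps namespace URIs to prefixes.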

:Author: Krisztian Balog
"""

import json
from nordlys import config

PREFIX_JSON_FILE = config.DATA_DIR + "/uri_prefix/prefixes.json"


class URIPrefix(object):
    """Abbreviate full URIs using a namespace-to-prefix mapping.

    The mapping is loaded from a JSON file whose keys are namespace URIs and
    whose values are the corresponding short prefixes. Lookups are always
    made with candidate namespaces that end in ``#`` or ``/``.
    """

    def __init__(self, prefix_file=PREFIX_JSON_FILE):
        """Load the prefix mapping.

        :param prefix_file: path to the JSON file mapping namespace URI to
            prefix.
        """
        # Context manager so the file handle is closed deterministically.
        with open(prefix_file, encoding="utf-8") as json_in:
            self.prefixes = json.load(json_in)

    def __get_prefixed(self, uri):
        """Get prefixed URI.

        :param uri: full URI, without surrounding angle brackets.
        :return: ``prefix:local_name`` if a known namespace matches the
            beginning of the URI, otherwise the URI unchanged.
        """
        prefix = None

        # if the uri contains a # then try the uri up to #
        pos = uri.find("#")
        if pos > 0:
            urip = uri[:pos + 1]  # including trailing #
            if urip in self.prefixes:
                prefix = urip

        # try longest possible match until prefix is found
        pos = uri.rfind("/")
        # note: if pos is smaller than 10 then it's probably the / from http://
        while prefix is None and pos > 10:
            urip = uri[:pos + 1]  # including trailing /
            if urip in self.prefixes:
                prefix = urip
            # move on to the previous "/" (strictly before the current one)
            pos = uri.rfind("/", 0, pos)

        if prefix is None:
            return uri
        # The matched namespace is always a leading slice of the URI, so cut
        # it off by length. A plain str.replace would also rewrite any later
        # occurrence of the namespace inside the local part.
        return self.prefixes[prefix] + ":" + uri[len(prefix):]

    def get_prefixed(self, uri, angle_brackets=True):
        """Return the prefixed form of a URI.

        :param uri: full URI, optionally enclosed in angle brackets
            (``<...>``). An empty string is accepted and treated as a URI
            with no known namespace.
        :param angle_brackets: if True, the result is wrapped in ``<`` and
            ``>``; otherwise it is returned bare.
        :return: prefixed URI, or the original URI (without the input's
            angle brackets) if no namespace matched.
        """
        # startswith/endswith instead of indexing: safe on empty strings.
        if uri.startswith("<") and uri.endswith(">"):
            uri = uri[1:-1]
        pref = self.__get_prefixed(uri)
        if angle_brackets:
            return "<" + pref + ">"
        return pref
def convert_txt_to_json(txt_file, json_file=PREFIX_JSON_FILE):
    """Convert prefixes txt file to json.

    This has to be done only once, and only in case there is no .json file
    or the .txt file has changed.

    Each non-blank line of the txt file is expected to be
    ``prefix<TAB>uri``. The resulting json object maps each uri to its
    prefix.

    :param txt_file: path to the tab-separated prefixes file.
    :param json_file: path of the json file to write.
    :raises ValueError: if a non-blank line contains no tab separator.
    """
    prefixes = {}
    with open(txt_file, "r", encoding="utf-8") as ins:
        for line in ins:
            line = line.strip()
            if not line:
                # tolerate blank lines (e.g. a trailing newline at EOF)
                continue
            prefix, uri = line.split("\t", 1)
            # there might be duplicates in the txt file
            # we only consider the first appearance for each URI
            # (the txt file shipped with nordlys is ordered by
            # URI frequency, so it's reasonable)
            prefixes.setdefault(uri, prefix)

    # write the prefix dictionary to json
    # json.dump emits str, so the file must be opened in text mode; binary
    # mode ("wb") raises TypeError on Python 3.
    with open(json_file, "w", encoding="utf-8") as out:
        json.dump(prefixes, out)
if __name__ == '__main__':
    # One-off conversion of the prefix txt file to json, when needed:
    # convert_txt_to_json("../../data/uri_prefix/prefixes.txt")

    # Small demo: print the prefixed form of one bare and one
    # angle-bracketed sample URI.
    prefixer = URIPrefix()
    sample_uris = (
        "http://www.w3.org/2000/01/rdf-schema#label",
        "<http://dbpedia.org/resource/xxx/aaa/Audi_A4>",
    )
    for sample in sample_uris:
        print(prefixer.get_prefixed(sample))