Skip to content

csv2rdf

A command-line tool for semi-automatically converting CSV to RDF.

See also https://github.com/RDFLib/pyTARQL, another CSV-to-RDF converter in the RDFLib family of tools.

For usage, run: csv2rdf --help

Classes:

HELP module-attribute

HELP = '\ncsv2rdf.py     -b <instance-base>     -p <property-base>     [-D <default>]     [-c <classname>]     [-i <identity column(s)>]     [-l <label columns>]     [-s <N>] [-o <output>]     [-f configfile]     [--col<N> <colspec>]     [--prop<N> <property>]     <[-d <delim>]     [-C] [files...]"\n\nReads csv files from stdin or given files\nif -d is given, use this delimiter\nif -s is given, skips N lines at the start\nCreates a URI from the columns given to -i, or automatically by numbering if\nnone is given\nOutputs RDFS labels from the columns given to -l\nif -c is given adds a type triple with the given classname\nif -C is given, the class is defined as rdfs:Class\nOutputs one RDF triple per column in each row.\nOutput is in n3 format.\nOutput is stdout, unless -o is specified\n\nLong options also supported:     --base,     --propbase,     --ident,     --class,     --label,     --out,     --defineclass\n\nLong options --col0, --col1, ...\ncan be used to specify conversion for columns.\nConversions can be:\n    ignore, float(), int(), split(sep, [more]), uri(base, [class]), date(format)\n\nLong options --prop0, --prop1, ...\ncan be used to use specific properties, rather than ones auto-generated\nfrom the headers\n\n-D sets the default conversion for columns not listed\n\n-f says to read config from a .ini/config file - the file must contain one\nsection called csv2rdf, with keys like the long options, i.e.:\n\n[csv2rdf]\nout=output.n3\nbase=http://example.org/\ncol0=split(";")\ncol1=split(";", uri("http://example.org/things/",\n                    "http://xmlns.com/foaf/0.1/Person"))\ncol2=float()\ncol3=int()\ncol4=date("%Y-%b-%d %H:%M:%S")\n\n'

__all__ module-attribute

__all__ = ['CSV2RDF']

config_functions module-attribute

config_functions = {'ignore': _config_ignore, 'uri': _config_uri, 'literal': _config_literal, 'float': _config_float, 'int': _config_int, 'date': _config_date, 'split': _config_split, 'replace': _config_replace, 'bool': _config_bool}

default_node_make module-attribute

default_node_make = NodeMaker()

uris module-attribute

uris: Dict[Any, Tuple[URIRef, Optional[URIRef]]] = {}

CSV2RDF

CSV2RDF()

Methods:

Attributes:

Source code in rdflib/tools/csv2rdf.py
def __init__(self):
    """Create a converter with every option at its default value.

    Output goes to ``sys.stdout``, cells are comma-separated, no leading
    lines are skipped and row identifiers are generated automatically
    (``IDENT == "auto"``).  All other options start out unset.
    """
    # Running count of statements written so far.
    self.triples = 0
    # Stream that receives the generated statements.
    self.OUT = sys.stdout

    # Per-column converters and property overrides, keyed by column number.
    self.COLUMNS, self.PROPS = {}, {}

    # CSV parsing options.
    self.DELIM = ","
    self.SKIP = 0

    # Subject naming and class handling.
    self.IDENT: Union[Tuple[str, ...], str] = "auto"
    self.DEFINECLASS = False

    # Options that have no default until configured.
    self.CLASS = self.BASE = self.PROPBASE = self.LABEL = self.DEFAULT = None

BASE instance-attribute

BASE = None

CLASS instance-attribute

CLASS = None

COLUMNS instance-attribute

COLUMNS = {}

DEFAULT instance-attribute

DEFAULT = None

DEFINECLASS instance-attribute

DEFINECLASS = False

DELIM instance-attribute

DELIM = ','

IDENT instance-attribute

IDENT: Union[Tuple[str, ...], str] = 'auto'

LABEL instance-attribute

LABEL = None

OUT instance-attribute

OUT = stdout

PROPBASE instance-attribute

PROPBASE = None

PROPS instance-attribute

PROPS = {}

SKIP instance-attribute

SKIP = 0

triples instance-attribute

triples = 0

convert

convert(csvreader)
Source code in rdflib/tools/csv2rdf.py
def convert(self, csvreader):
    """Turn the rows of ``csvreader`` into N3 statements on ``self.OUT``.

    ``self.SKIP`` leading rows are discarded and the following row is read
    as the header.  Every later row becomes one subject whose URI is either
    the running row number (``IDENT == "auto"``) or is built from the
    identity columns.  Each non-empty cell is converted with the column's
    entry in ``self.COLUMNS``, falling back to ``self.DEFAULT`` and finally
    to a plain literal, and written as one triple per resulting value.

    Args:
        csvreader: Iterator yielding one sequence of string cells per row.

    Raises:
        StopIteration: If the input ends before the header row is read.
        Exception: Errors raised outside the per-cell conversion are
            re-raised after the row number has been written to stderr.

    Side effects:
        Normalises ``self.IDENT`` to a tuple, fills in ``self.BASE`` and
        ``self.PROPBASE`` when unset, reports progress on stderr and closes
        ``self.OUT`` when finished.
    """
    start = time.time()

    if self.OUT:
        sys.stderr.write("Output to %s\n" % self.OUT.name)

    if self.IDENT != "auto" and not isinstance(self.IDENT, tuple):
        self.IDENT = (self.IDENT,)

    if not self.BASE:
        warnings.warn("No base given, using http://example.org/instances/")
        self.BASE = rdflib.Namespace("http://example.org/instances/")

    if not self.PROPBASE:
        # The message names the namespace that is actually used below.
        warnings.warn("No property base given, using http://example.org/props/")
        self.PROPBASE = rdflib.Namespace("http://example.org/props/")

    # skip lines at the start
    for _ in range(self.SKIP):
        next(csvreader)

    # read header line
    header_labels = list(next(csvreader))
    headers = dict(enumerate([self.PROPBASE[toProperty(x)] for x in header_labels]))
    # override header properties if some are given
    for k, v in self.PROPS.items():
        headers[k] = v
        header_labels[k] = split_uri(v)[1]

    if self.DEFINECLASS:
        # output class/property definitions
        self.triple(self.CLASS, RDF.type, RDFS.Class)
        for i, label in enumerate(header_labels):
            prop = headers[i]
            if prop == "" or label == "":
                continue
            maker = self.COLUMNS.get(i, self.DEFAULT)
            if maker == "ignore":
                continue
            self.triple(prop, RDF.type, RDF.Property)
            self.triple(prop, RDFS.label, rdflib.Literal(toPropertyLabel(label)))
            self.triple(prop, RDFS.domain, self.CLASS)
            # Columns without their own converter advertise the range of the
            # default conversion, matching what is emitted for their cells.
            self.triple(prop, RDFS.range, (maker or default_node_make).range())

    rows = 0
    for row in csvreader:
        try:
            if self.IDENT == "auto":
                uri = self.BASE["%d" % rows]
            else:
                uri = self.BASE[
                    "_".join(
                        # quote() percent-encodes str input as UTF-8 itself;
                        # encoding first would make .replace() fail on bytes.
                        quote(x.replace(" ", "_"), safe="")  # type: ignore[attr-defined]
                        for x in index(row, self.IDENT)  # type: ignore[arg-type]
                    )
                ]

            if self.LABEL:
                self.triple(
                    uri,
                    RDFS.label,
                    rdflib.Literal(" ".join(index(row, self.LABEL))),  # type: ignore[arg-type]
                )

            if self.CLASS:
                # type triple
                self.triple(uri, RDF.type, self.CLASS)

            for i, cell in enumerate(row):
                cell = cell.strip()
                if cell == "":
                    continue
                maker = self.COLUMNS.get(i, self.DEFAULT)
                if maker == "ignore":
                    continue
                try:
                    # Per-column converter, else the -D default, else a
                    # plain literal.
                    o = (maker or rdflib.Literal)(cell)
                    if isinstance(o, list):
                        for _o in o:
                            self.triple(uri, headers[i], _o)
                    else:
                        self.triple(uri, headers[i], o)

                except Exception as e:
                    # Exceptions have no ``.message`` on Python 3; format the
                    # exception itself.  ``headers.get`` keeps the warning
                    # usable when the row is longer than the header.
                    warnings.warn(
                        "Could not process value for column "
                        + "%d:%s in row %d, ignoring: %s "
                        % (i, headers.get(i), rows, e)
                    )

            rows += 1
            if rows % 100000 == 0:
                sys.stderr.write(
                    "%d rows, %d triples, elapsed %.2fs.\n"
                    % (rows, self.triples, time.time() - start)
                )
        except Exception:
            sys.stderr.write("Error processing line: %d\n" % rows)
            raise

    # output types/labels for generated URIs
    classes = set()
    for label, (u, c) in uris.items():
        self.triple(u, RDFS.label, rdflib.Literal(label))
        if c:
            c = rdflib.URIRef(c)
            classes.add(c)
            self.triple(u, RDF.type, c)

    for c in classes:
        self.triple(c, RDF.type, RDFS.Class)

    self.OUT.close()
    sys.stderr.write("Converted %d rows into %d triples.\n" % (rows, self.triples))
    sys.stderr.write("Took %.2f seconds.\n" % (time.time() - start))

triple

triple(s, p, o)
Source code in rdflib/tools/csv2rdf.py
def triple(self, s, p, o):
    """Write one ``subject predicate object .`` line and count it.

    Each of ``s``, ``p`` and ``o`` is serialised through its ``n3()``
    method; the line goes to ``self.OUT`` and ``self.triples`` grows by one.
    """
    terms = (s.n3(), p.n3(), o.n3())
    self.OUT.write("%s %s %s .\n" % terms)
    self.triples = self.triples + 1

NodeBool

NodeBool(f=None)

Bases: NodeLiteral

Methods:

Source code in rdflib/tools/csv2rdf.py
def __init__(self, f=None):
    """Store ``f``, an optional callable applied to each value before ``bool()``."""
    self.f = f

__call__

__call__(x)
Source code in rdflib/tools/csv2rdf.py
def __call__(self, x):
    """Return ``x`` as a boolean literal.

    When a function was supplied it is applied to ``x`` first.  The result
    is then reduced with ``bool()``, i.e. plain Python truthiness: without
    a function every non-empty string, including ``"false"`` or ``"0"``,
    becomes a true literal.

    Raises:
        Exception: If the stored function is set but not callable.
    """
    transform = self.f
    if transform:
        if not callable(transform):
            raise Exception("Function passed to bool is not callable")
        x = transform(x)
    return rdflib.Literal(bool(x))

range

range()
Source code in rdflib/tools/csv2rdf.py
def range(self):
    """Return the datatype of the literals this maker produces.

    XML Schema names the boolean datatype ``xsd:boolean``; there is no
    ``xsd:bool`` term, so the former ``XSD.bool`` did not denote a real
    datatype.
    """
    return rdflib.XSD.boolean

NodeDate

NodeDate(f=None)

Bases: NodeLiteral

Methods:

Source code in rdflib/tools/csv2rdf.py
def __init__(self, f=None):
    """Store ``f``, which ``__call__`` passes to ``strptime`` as the format string."""
    self.f = f

__call__

__call__(x)
Source code in rdflib/tools/csv2rdf.py
def __call__(self, x):
    """Parse ``x`` with the stored ``strptime`` format and return it as a literal."""
    parsed = datetime.datetime.strptime(x, self.f)
    return rdflib.Literal(parsed)

range

range()
Source code in rdflib/tools/csv2rdf.py
def range(self):
    """Return ``xsd:dateTime`` as the range for columns converted by this maker."""
    return rdflib.XSD.dateTime

NodeFloat

NodeFloat(f=None)

Bases: NodeLiteral

Methods:

Source code in rdflib/tools/csv2rdf.py
def __init__(self, f=None):
    """Store ``f``, an optional callable applied to each value before ``float()``."""
    self.f = f

__call__

__call__(x)
Source code in rdflib/tools/csv2rdf.py
def __call__(self, x):
    """Return ``x`` as a floating-point literal.

    When a function was supplied it is applied to ``x`` first; the result
    is then converted with ``float()``.

    Raises:
        Exception: If the stored function is set but not callable.
    """
    transform = self.f
    if transform:
        if not callable(transform):
            raise Exception("Function passed to float is not callable")
        x = transform(x)
    return rdflib.Literal(float(x))

range

range()
Source code in rdflib/tools/csv2rdf.py
def range(self):
    """Return ``xsd:double`` as the range for columns converted by this maker."""
    return rdflib.XSD.double

NodeInt

NodeInt(f=None)

Bases: NodeLiteral

Methods:

Source code in rdflib/tools/csv2rdf.py
def __init__(self, f=None):
    """Store ``f``, an optional callable applied to each value before ``int()``."""
    self.f = f

__call__

__call__(x)
Source code in rdflib/tools/csv2rdf.py
def __call__(self, x):
    """Return ``x`` as an integer literal.

    When a function was supplied it is applied to ``x`` first; the result
    is then converted with ``int()``.

    Raises:
        Exception: If the stored function is set but not callable.
    """
    transform = self.f
    if transform:
        if not callable(transform):
            raise Exception("Function passed to int is not callable")
        x = transform(x)
    return rdflib.Literal(int(x))

range

range()
Source code in rdflib/tools/csv2rdf.py
def range(self):
    """Return the datatype of the literals this maker produces.

    ``__call__`` builds ``rdflib.Literal(int(...))``, and rdflib types a
    Python ``int`` as ``xsd:integer``.  The range is declared as that same
    datatype rather than the narrower 32-bit ``xsd:int``, so the declared
    range and the emitted values agree.
    """
    return rdflib.XSD.integer

NodeLiteral

NodeLiteral(f=None)

Bases: NodeMaker

Attributes:

  • f
Source code in rdflib/tools/csv2rdf.py
def __init__(self, f=None):
    """Store the optional function ``f`` on the instance as ``self.f``."""
    self.f = f

f instance-attribute

f = f

NodeMaker

Methods:

__call__

__call__(x: Any)
Source code in rdflib/tools/csv2rdf.py
def __call__(self, x: Any):
    """Wrap ``x`` unchanged in an ``rdflib.Literal``."""
    return rdflib.Literal(x)

range

range()
Source code in rdflib/tools/csv2rdf.py
def range(self):
    """Return ``rdfs:Literal``, the range used for plain literal values."""
    return rdflib.RDFS.Literal

NodeReplace

NodeReplace(a, b)

Bases: NodeMaker

Methods:

Attributes:

  • a
  • b
Source code in rdflib/tools/csv2rdf.py
def __init__(self, a, b):
    """Remember the substring to look for (``a``) and its replacement (``b``)."""
    self.a, self.b = a, b

a instance-attribute

a = a

b instance-attribute

b = b

__call__

__call__(x)
Source code in rdflib/tools/csv2rdf.py
def __call__(self, x):
    """Return ``x`` with every ``self.a`` replaced by ``self.b``, as a literal.

    The result is wrapped in ``rdflib.Literal`` because ``triple``
    serialises every object through ``n3()``, which a plain ``str`` does
    not provide.  ``rdflib.Literal`` is a ``str`` subclass, so code that
    feeds the result into another string-based conversion keeps working.
    """
    return rdflib.Literal(x.replace(self.a, self.b))

NodeSplit

NodeSplit(sep, f)

Bases: NodeMaker

Methods:

Attributes:

Source code in rdflib/tools/csv2rdf.py
def __init__(self, sep, f):
    """Remember the separator ``sep`` and the per-item converter ``f``."""
    self.sep, self.f = sep, f

f instance-attribute

f = f

sep instance-attribute

sep = sep

__call__

__call__(x)
Source code in rdflib/tools/csv2rdf.py
def __call__(self, x):
    """Split ``x`` on the separator and convert every non-blank piece.

    Pieces are stripped of surrounding whitespace and empty ones are
    dropped.  When no converter was configured, ``rdflib.Literal`` is
    stored on the instance and used from then on.

    Returns:
        A list with one converted value per remaining piece.

    Raises:
        Exception: If the configured converter is not callable.
    """
    if not self.f:
        self.f = rdflib.Literal
    make_node = self.f
    if not callable(make_node):
        raise Exception("Function passed to split is not callable!")

    nodes = []
    for piece in x.split(self.sep):
        piece = piece.strip()
        if piece != "":
            nodes.append(make_node(piece))
    return nodes

range

range()
Source code in rdflib/tools/csv2rdf.py
def range(self):
    """Return the range of the per-item converter when it is a ``NodeMaker``.

    Any other converter (or none at all) falls back to the range reported
    by ``NodeMaker.range``.
    """
    inner = self.f
    delegates = bool(inner) and isinstance(inner, NodeMaker)
    return inner.range() if delegates else NodeMaker.range(self)

NodeUri

NodeUri(prefix, class_)

Bases: NodeMaker

Methods:

Attributes:

Source code in rdflib/tools/csv2rdf.py
def __init__(self, prefix, class_):
    """Remember the URI prefix and, when given, the class of the minted URIs.

    A truthy ``class_`` is stored as an ``rdflib.URIRef``; otherwise
    ``self.class_`` is ``None``.
    """
    self.prefix = prefix
    self.class_: Optional[URIRef] = rdflib.URIRef(class_) if class_ else None

class_ instance-attribute

class_: Optional[URIRef] = None

prefix instance-attribute

prefix = prefix

__call__

__call__(x)
Source code in rdflib/tools/csv2rdf.py
def __call__(self, x):
    """Return the result of ``prefixuri`` for ``x`` with this maker's prefix and class."""
    return prefixuri(x, self.prefix, self.class_)

range

range()
Source code in rdflib/tools/csv2rdf.py
def range(self):
    """Return the configured class, or ``rdfs:Resource`` when none was given.

    The generic "any resource" class is defined by RDF Schema as
    ``rdfs:Resource``; the RDF namespace has no ``Resource`` term, so the
    fallback must come from ``RDFS``.
    """
    return self.class_ or rdflib.RDFS.Resource

column

column(v)

Return a function for column mapping

Source code in rdflib/tools/csv2rdf.py
def column(v):
    """Return a function for column mapping

    ``v`` is evaluated as a Python expression, with ``config_functions``
    supplying the names it may use as globals (``ignore``, ``uri``,
    ``float``, ...).
    """

    # NOTE(security): eval() runs arbitrary Python from the command line or
    # config file; only use this with trusted input.
    return eval(v, config_functions)

csv_reader

csv_reader(csv_data, dialect=excel, **kwargs)
Source code in rdflib/tools/csv2rdf.py
def csv_reader(csv_data, dialect=csv.excel, **kwargs):
    """Lazily yield the rows that ``csv.reader`` parses from ``csv_data``.

    ``dialect`` and any extra keyword arguments (for example ``delimiter``)
    are passed straight through to ``csv.reader``.
    """
    yield from csv.reader(csv_data, dialect=dialect, **kwargs)

index

index(l_: List[int], i: Tuple[int, ...]) -> Tuple[int, ...]

Return a tuple of the elements of a list found at the given indexes.

index([1,2,3],(0,2)) returns (1, 3)

Source code in rdflib/tools/csv2rdf.py
def index(l_: List[int], i: Tuple[int, ...]) -> Tuple[int, ...]:
    """Pick the elements of ``l_`` found at the positions listed in ``i``.

    The result is a tuple in the order the positions were given.

    >>> index([1,2,3],(0,2))
    (1, 3)
    """
    picked = []
    for position in i:
        picked.append(l_[position])
    return tuple(picked)

main

main()
Source code in rdflib/tools/csv2rdf.py
def main():
    """Command-line entry point: parse options, configure a converter, run it.

    Settings are applied in this order, later ones overriding earlier ones:
    the ``[csv2rdf]`` section of the config file given with ``-f``, then
    the command-line options (a long option overrides its short form).
    The remaining arguments are the input files; with none, stdin is read.

    Raises:
        getopt.GetoptError: For unknown or malformed options.
        SystemExit: After printing the help text for ``-h``/``--help``.
    """
    csv2rdf = CSV2RDF()

    argv = sys.argv[1:]
    long_options = [
        "out=",
        "base=",
        "delim=",
        "propbase=",
        "class=",
        # These were previously fused into one bogus option "default=ident="
        # by a missing comma, so neither long form worked as documented.
        "default=",
        "ident=",
        "label=",
        "skip=",
        "defineclass",
        "help",
    ]
    # --col<N> and --prop<N> form an open-ended family, and getopt rejects
    # long options it has not been told about, so register each one that
    # actually appears on the command line.
    for arg in argv:
        if arg == "--":
            break
        name = arg.split("=", 1)[0]
        if re.fullmatch(r"--(?:col|prop)\d+", name):
            spec = name[2:] + "="
            if spec not in long_options:
                long_options.append(spec)

    pairs, files = getopt.getopt(argv, "hc:b:p:i:o:Cf:l:s:d:D:", long_options)
    opts = dict(pairs)

    if "-h" in opts or "--help" in opts:
        print(HELP)
        sys.exit(-1)

    if "-f" in opts:
        config = configparser.ConfigParser()
        with open(opts["-f"]) as config_file:
            config.read_file(config_file)
        # NOTE(review): ConfigParser's default interpolation gives "%" a
        # special meaning, so date formats in the file appear to need "%%".
        for k, v in config.items("csv2rdf"):
            if k == "out":
                csv2rdf.OUT = codecs.open(v, "w", "utf-8")
            elif k == "base":
                csv2rdf.BASE = rdflib.Namespace(v)
            elif k == "propbase":
                csv2rdf.PROPBASE = rdflib.Namespace(v)
            elif k == "class":
                csv2rdf.CLASS = rdflib.URIRef(v)
            elif k == "defineclass":
                # bool("false") is True; treat the usual negative spellings
                # (and an empty value) as False, anything else as True.
                csv2rdf.DEFINECLASS = v.strip().lower() not in (
                    "",
                    "0",
                    "false",
                    "no",
                    "off",
                )
            elif k == "ident":
                # NOTE(security): eval() of config text; trusted input only.
                csv2rdf.IDENT = eval(v)
            elif k == "label":
                # NOTE(security): eval() of config text; trusted input only.
                csv2rdf.LABEL = eval(v)
            elif k == "delim":
                csv2rdf.DELIM = v
            elif k == "skip":
                csv2rdf.SKIP = int(v)
            elif k == "default":
                csv2rdf.DEFAULT = column(v)
            elif k.startswith("col"):
                csv2rdf.COLUMNS[int(k[3:])] = column(v)
            elif k.startswith("prop"):
                csv2rdf.PROPS[int(k[4:])] = rdflib.URIRef(v)

    # (short option, long option, attribute, parser).  The short form is
    # applied first, so the long form wins when both are given.
    # NOTE(security): -l/--label and -i/--ident are passed to eval(); only
    # use them with trusted command lines.
    settings = (
        ("-o", "--out", "OUT", lambda path: codecs.open(path, "w", "utf-8")),
        ("-b", "--base", "BASE", rdflib.Namespace),
        ("-d", "--delim", "DELIM", lambda text: text),
        ("-D", "--default", "DEFAULT", column),
        ("-p", "--propbase", "PROPBASE", rdflib.Namespace),
        ("-l", "--label", "LABEL", eval),
        ("-i", "--ident", "IDENT", eval),
        ("-s", "--skip", "SKIP", int),
        ("-c", "--class", "CLASS", rdflib.URIRef),
    )
    for short, long_, attribute, parse in settings:
        for flag in (short, long_):
            if flag in opts:
                setattr(csv2rdf, attribute, parse(opts[flag]))

    for k, v in opts.items():
        # Match the numeric suffix explicitly: a bare prefix test would also
        # catch --propbase and crash on int("base").
        if re.fullmatch(r"--col\d+", k):
            csv2rdf.COLUMNS[int(k[5:])] = column(v)
        elif re.fullmatch(r"--prop\d+", k):
            csv2rdf.PROPS[int(k[6:])] = rdflib.URIRef(v)

    if csv2rdf.CLASS and ("-C" in opts or "--defineclass" in opts):
        csv2rdf.DEFINECLASS = True

    csv2rdf.convert(csv_reader(fileinput.input(files), delimiter=csv2rdf.DELIM))

prefixuri

prefixuri(x, prefix, class_: Optional[URIRef] = None)
Source code in rdflib/tools/csv2rdf.py
def prefixuri(x, prefix, class_: Optional[URIRef] = None):
    """Mint a URI for the cell value ``x`` and record it in ``uris``.

    With a ``prefix`` the value has its spaces replaced by underscores, is
    percent-encoded and appended to the prefix; without one, ``x`` itself
    is used as the URI.  The ``(uri, class_)`` pair is stored in the
    module-level ``uris`` mapping under ``x``.

    Returns:
        The ``rdflib.URIRef`` that was created.
    """
    if prefix:
        # quote() percent-encodes str input as UTF-8 itself; encoding first
        # made .replace(" ", "_") fail with a TypeError on bytes.
        r = rdflib.URIRef(prefix + quote(x.replace(" ", "_"), safe=""))
    else:
        r = rdflib.URIRef(x)
    uris[x] = (r, class_)
    return r

toProperty

toProperty(label: str)

Convert a string to camelCase with a lowercase initial letter.

FIRST_NM => firstNm

firstNm => firstNm

Source code in rdflib/tools/csv2rdf.py
def toProperty(label: str):  # noqa: N802
    """
    Convert a header label to a camelCase name with a lowercase initial.

    Words are separated by any non-alphanumeric character (underscores
    included) and by lower-to-upper case transitions; empty words are
    dropped, so a leading separator cannot produce a capitalised initial.

    FIRST_NM => firstNm

    firstNm => firstNm

    An empty or separator-only label yields the empty string.
    """
    # ``\W`` alone leaves underscores in place, which kept FIRST_NM as one
    # word; treat ``_`` as a separator as well.
    spaced = re.sub(r"[\W_]", " ", label)
    spaced = re.sub("([a-z])([A-Z])", "\\1 \\2", spaced)
    words = spaced.split()
    if not words:
        return ""
    return "".join([words[0].lower()] + [w.capitalize() for w in words[1:]])

toPropertyLabel

toPropertyLabel(label)
Source code in rdflib/tools/csv2rdf.py
def toPropertyLabel(label):  # noqa: N802
    """Lower-case the first character of ``label`` unless the second is upper case.

    A label whose second character is upper case (for example ``"URL"``)
    is returned unchanged.
    """
    second = label[1:2]
    if second.isupper():
        return label
    return label[:1].lower() + label[1:]