From b765afeb177920957b5b098a1db006e8044d4c0b Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 18 Jan 2026 04:35:41 +0000 Subject: [PATCH 1/9] Initial plan From 8c67136a85940f4cb1ef0f1b1b365814827bd0f2 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 18 Jan 2026 04:43:37 +0000 Subject: [PATCH 2/9] Complete migration to pyproject.toml and refactor code structure Co-authored-by: jpmccu <602385+jpmccu@users.noreply.github.com> --- pyproject.toml | 54 + setlr/__init__.py | 1034 ++------------------ setlr/core.py | 1027 +++++++++++++++++++ tests/setlr_test/test_api_compatibility.py | 98 ++ tests/setlr_test/test_error_messages.py | 14 +- 5 files changed, 1272 insertions(+), 955 deletions(-) create mode 100644 pyproject.toml create mode 100644 setlr/core.py create mode 100644 tests/setlr_test/test_api_compatibility.py diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..6facde3 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,54 @@ +[build-system] +requires = ["setuptools>=68.0", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "setlr" +version = "1.0.1" +description = "setlr is a tool for Semantic Extraction, Transformation, and Loading." +readme = "README.md" +license = {text = "Apache License 2.0"} +authors = [ + {name = "Jamie McCusker", email = "mccusj@cs.rpi.edu"} +] +keywords = ["rdf", "semantic", "etl"] +classifiers = [ + "Development Status :: 5 - Production/Stable", + "Topic :: Utilities", + "License :: OSI Approved :: Apache Software License", +] +requires-python = ">=3.8" +dependencies = [ + "future", + "pip>=9.0.0", + "cython", + "numpy", + "rdflib>=6.0.0", + "pandas>=0.23.0", + "requests", + "toposort", + "beautifulsoup4", + "jinja2", + "lxml", + "six", + "xlrd", + "ijson", + "click", + "tqdm", + "requests-testadapter", + "python-slugify", + "pyshacl[js]", +] + +[project.urls] +Homepage = "http://packages.python.org/setlr" + +[project.scripts] +setlr = "setlr:main" + +[tool.setuptools] +packages = ["setlr"] +include-package-data = true + +[tool.setuptools.package-data] +setlr = ["**/*"] diff --git a/setlr/__init__.py b/setlr/__init__.py index c8a92f0..eba796a 100644 --- a/setlr/__init__.py +++ b/setlr/__init__.py @@ -1,954 +1,90 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -from builtins import str -from builtins import next -from builtins import object -from rdflib import * -from rdflib.util import guess_format -import rdflib -import csv -import json -import sys, collections -import requests -import pandas -import re -import os -from six import text_type as str - -from jinja2 import Template -from toposort import toposort_flatten -from numpy import isnan -import uuid -import tempfile -import ijson -from . import iterparse_filter -#import xml.etree.ElementTree as ET -import xml.etree.ElementTree - -from itertools import chain - -import zipfile -import gzip - -import logging - -from tqdm import tqdm - -import hashlib -from slugify import slugify -from pyshacl import validate - -from .trig_store import TrigStore - -def hash(value): - m = hashlib.sha256() - m.update(value.encode('utf-8')) - return m.hexdigest() - -csvw = Namespace('http://www.w3.org/ns/csvw#') -ov = Namespace('http://open.vocab.org/terms/') -setl = Namespace('http://purl.org/twc/vocab/setl/') -prov = Namespace('http://www.w3.org/ns/prov#') -pv = Namespace('http://purl.org/net/provenance/ns#') -sp = Namespace('http://spinrdf.org/sp#') -sd = Namespace('http://www.w3.org/ns/sparql-service-description#') -dc = Namespace('http://purl.org/dc/terms/') -void = Namespace('http://rdfs.org/ns/void#') -shacl = Namespace('http://www.w3.org/ns/shacl#') -api_vocab = Namespace('http://purl.org/linked-data/api/vocab#') - -sys.setrecursionlimit(10000) - -from requests_testadapter import Resp - -# Regex pattern for extracting Jinja2 template variables (compiled once for performance) -TEMPLATE_VAR_PATTERN = re.compile(r'\{\{([^}]+)\}\}') - -def camelcase(s): - return slugify(s).title().replace("-","") - -class LocalFileAdapter(requests.adapters.HTTPAdapter): - def build_response_from_file(self, request): - file_path = request.url[7:] - with open(file_path, 'rb') as file: - buff = bytearray(os.path.getsize(file_path)) - file.readinto(buff) - resp = Resp(buff) - r = self.build_response(request, resp) - return r - def send(self, request, stream=False, timeout=None, - verify=True, cert=None, proxies=None): - return self.build_response_from_file(request) - -requests_session = requests.session() -requests_session.mount('file://', LocalFileAdapter()) -requests_session.mount('file:///', LocalFileAdapter()) - -datatypeConverters = collections.defaultdict(lambda: str) -datatypeConverters.update({ - XSD.string: str, - XSD.decimal: float, - XSD.integer: int, - XSD.float: float, - XSD.double: float -}) - -run_samples = -1 - -_rdf_formats_to_guess = [ - 'xml', - 'json-ld', - 'trig', - 'nquads', - 'trix' -] - - -def read_csv(location, result): - args = dict( - sep = result.value(csvw.delimiter, default=Literal(",")).value, - #header = result.value(csvw.headerRow, default=Literal(0)).value), - skiprows = result.value(csvw.skipRows, default=Literal(0)).value, - dtype=str, - # dtype = object # Does not seem to play well with future and python2/3 conversion - ) - if result.value(csvw.header): - args['header'] = [0] - with get_content(location, result) as fo: - df = pandas.read_csv(fo, encoding='utf-8', **args) - #logger.debug("Loaded %s", location) - return df - -def read_graph(location, result, g = None): - if g is None: - g = ConjunctiveGraph() - graph = ConjunctiveGraph(store=g.store, identifier=result.identifier) - if len(graph) == 0: - data = get_content(location, result).read() - f = guess_format(location) - for fmt in [f] + _rdf_formats_to_guess: - try: - graph.parse(data=data, format=fmt) - break - except Exception as e: - #print e - pass - if len(graph) == 0: - logger.error("Could not parse graph: %s", location) - if result[RDF.type:OWL.Ontology]: - for ontology in graph.subjects(RDF.type, OWL.Ontology): - imports = [graph.resource(x) for x in graph.objects(ontology, OWL.imports)] - for i in imports: - read_graph(i.identifier, i, g = g) - return g - -class FileLikeFromIter(object): - _closed = False - - def __init__(self, content_iter): - self.iter = content_iter - self.data = b'' - - def __iter__(self): - return self.iter - - def readable(self): - return True - - def writable(self): - return False - - def seekable(self): - return False - - def closed(self): - if self._closed: - return True - if len(self.data) > 0: - return False - try: - self.data = next(self.iter) - except StopIteration: - self.closed = True - return True - return False - - # Enter and Exit are needed to allow this to work with with - def __enter__(self): - return self - - # Could be improved for better error/exception handling - def __exit__(self, err_type, value, tracebock): - pass - - def read(self, n=None): - if n is None: - return self.data + b''.join(l for l in self.iter) - else: - while len(self.data) < n: - try: - self.data = b''.join((self.data, next(self.iter))) - except StopIteration: - break - result, self.data = self.data[:n], self.data[n:] - return result - -def _open_local_file(location): - if location.startswith("file://"): - if os.name == 'nt': # skip the initial - return open(location.replace('file:///','').replace('file://',''),'rb') - else: - return open(location.replace('file://',''),'rb') - -content_handlers = [ - _open_local_file, - lambda location: FileLikeFromIter(requests.get(location,stream=True).iter_content(1024*1024)) +"""setlr: Semantic Extract, Transform and Load-er + +This package provides tools for generating RDF graphs from tabular data +using declarative SETL (Semantic Extract, Transform, Load) scripts. + +Main functions: + run_setl(setl_graph): Execute a SETL script (recommended) + _setl(setl_graph): Deprecated, use run_setl() instead + main(): Command-line interface entry point +""" + +# Import the core functionality +from .core import ( + # Main API functions + run_setl, + _setl, # Deprecated, but kept for backward compatibility + main, + + # Utility functions that might be used by library users + read_csv, + read_excel, + read_json, + read_xml, + read_graph, + extract, + json_transform, + transform, + load, + isempty, + hash, + camelcase, + get_content, + + # Logger for configuration + logger, + + # Namespaces + csvw, + ov, + setl, + prov, + pv, + sp, + sd, + dc, + void, + shacl, + api_vocab, +) + +# Version +__version__ = '1.0.1' + +# Define what gets imported with "from setlr import *" +__all__ = [ + 'run_setl', + 'main', + # Include commonly used utilities + 'read_csv', + 'read_excel', + 'read_json', + 'read_xml', + 'read_graph', + 'extract', + 'json_transform', + 'transform', + 'load', + 'isempty', + 'hash', + 'camelcase', + 'get_content', + # Namespaces + 'csvw', + 'ov', + 'setl', + 'prov', + 'pv', + 'sp', + 'sd', + 'dc', + 'void', + 'shacl', + 'api_vocab', + # Keep _setl for backward compatibility but not in __all__ to discourage use ] -def get_content(location, result): - response = None - for handler in content_handlers: - response = handler(location) - if response is not None: - break - if result[RDF.type:setl.Tempfile]: - result = to_tempfile(response) - - for t in result[RDF.type]: - # Do we know how to unpack this? - if t.identifier in unpackers: - response = unpackers[t.identifier](response) - return response - -def to_tempfile(f): - tf = tempfile.TemporaryFile() - logger.debug("Writing %s to disk.", f) - for chunk in f: - if chunk: # filter out keep-alive new chunks - tf.write(chunk) - tf.seek(0) - logger.debug("Finished writing %s to disk.", f) - return tf - -def unpack_zipfile(f): - zf = zipfile.ZipFile(f, mode='r') - files = zf.infolist() - return zf.open(files[0]) - -unpackers = { -# setl.Tempfile : lambda x: x, - setl.ZipFile : lambda x: unpack_zipfile(to_tempfile(x)), - setl.GZipFile : lambda f: gzip.GzipFile(fileobj=f,mode='r') -} - -packers = { -# setl.Tempfile : lambda x: x, - setl.GZipFile : lambda f: gzip.GzipFile(fileobj=f,mode='wb') -} - -def read_excel(location, result): - args = dict( - sheet_name = result.value(setl.sheetname, default=Literal(0)).value, - header = [int(x) for x in result.value(csvw.headerRow, default=Literal('0')).value.split(',')], - skiprows = result.value(csvw.skipRows, default=Literal(0)).value - ) - if result.value(csvw.header): - args['header'] = [result.value(csvw.header).value] - with get_content(location, result) as fo: - df = pandas.read_excel(fo, encoding='utf-8', **args) - return df - -def read_xml(location, result): - validate_dtd = False - if result[RDF.type:setl.DTDValidatedXML]: - validate_dtd = True - f = iterparse_filter.IterParseFilter(validate_dtd=validate_dtd) - if result.value(setl.xpath) is None: - logger.debug("no xpath to select on from %s", location) - f.iter_end("/*") - for xp in result[setl.xpath]: - f.iter_end(xp.value) - with get_content(location, result) as fo: - for (i, (event, ele)) in enumerate(tqdm(f.iterparse(fo))): - yield i, ele - - -def read_json(location, result): - selector = result.value(api_vocab.selector) - if selector is not None: - selector = selector.value - else: - selector = "" - with get_content(location, result) as fo: - yield from enumerate(tqdm(ijson.items(fo, selector))) - - -extractors = { - setl.XPORT : lambda location, result: pandas.read_sas(get_content(location, result), format='xport'), - setl.SAS7BDAT : lambda location, result: pandas.read_sas(get_content(location, result), format='sas7bdat'), - setl.Excel : read_excel, - csvw.Table : read_csv, - OWL.Ontology : read_graph, - void.Dataset : read_graph, - setl.JSON : read_json, - setl.XML : read_xml, - URIRef("https://www.iana.org/assignments/media-types/text/plain") : lambda location, result: get_content(location, result) -} - - -try: - from bs4 import BeautifulSoup - extractors[setl.HTML] = lambda location, result: BeautifulSoup(get_content(location, result).read(), 'html.parser') -except Exception as e: - pass - - -def load_csv(csv_resource): - column_descriptions = {} - for col in csv_resource[csvw.column]: - label = col.value(RDFS.label).value - column_descriptions[label] = col - csv_graph = Graph(identifier=csv_resource) - s = [x for x in csv.reader(open(str(csv_resource.value(csvw.url).identifier).replace("file://","")), - delimiter=str(csv_resource.value(csvw.delimiter,default=",").value), - quotechar=str(csv_resource.value(csvw.quoteChar,default='"').value))] - header = None - properties = [] - propertyMap = {} - skip_value = csv_resource.value(csvw.null) - if skip_value is not None: - skip_value = skip_value.value - for i, r in enumerate(s): - if header is None: - header = r - for j, h in enumerate(header): - col_desc = None - if h in column_descriptions: - col_desc = column_descriptions[h] - col = csv_graph.resource(URIRef("urn:col_"+str(h))) - col.add(RDFS.label, Literal(h)) - col.add(ov.csvCol, Literal(j)) - if col_desc is not None: - col.add(RDFS.range, col_desc.value(RDFS.range, default=XSD.string)) - properties.append(col) - propertyMap[h] = col - continue - res = csv_graph.resource(csv_resource.identifier+"_row_"+str(i)) - res.add(RDF.type, csvw.Row) - res.add(csvw.rownum, Literal(i)) - for j, value in enumerate(r): - if skip_value is not None and skip_value == value: - continue - #print i, j, value - prop = properties[j] - datatype = prop.value(RDFS['range'], default=XSD.string) - lit = Literal(value, datatype=datatype.identifier) - #print i, prop.identifier, lit.n3() - res.add(prop.identifier, lit) - logger.debug("Table has %s rows, %s columns, and %s triples", len(s), len(header), len(csv_graph)) - return csv_graph - -formats = { - None:'xml', - "application/rdf+xml":'xml', - "text/rdf":'xml', - 'text/turtle':'turtle', - 'application/turtle':'turtle', - 'application/x-turtle':'turtle', - 'text/plain':'nt', - 'text/n3':'n3', - 'application/trig':'trig', - 'application/json':'json-ld' -} - -def create_python_function(f, resources): - global_vars = {'this' : f, 'resources': resources} - local_vars = {} - script = f.value(prov.value) - for qd in f[prov.qualifiedDerivation]: - entity = resources[qd.value(prov.entity).identifier] - name = qd.value(prov.hadRole).value(dc.identifier) - local_vars[name.value] = entity - exec(script.value, local_vars, global_vars) - resources[f.identifier] = global_vars['result'] - -def get_order(setl_graph): - nodes = collections.defaultdict(set) - - for typ in actions: - for task in setl_graph.subjects(RDF.type, typ): - task = setl_graph.resource(task) - for used in task[prov.used]: - nodes[task.identifier].add(used.identifier) - - for usage in task[prov.qualifiedUsage]: - used = usage.value(prov.entity) - nodes[task.identifier].add(used.identifier) - for generated in task.subjects(prov.wasGeneratedBy): - nodes[generated.identifier].add(task.identifier) - for derivation in task[prov.qualifiedDerivation]: - derived = derivation.value(prov.entity) - nodes[task.identifier].add(derived.identifier) - - return toposort_flatten(nodes) - -def extract(e, resources): - logger.info('Extract %s',e.identifier) - used = e.value(prov.used) - for result in e.subjects(prov.wasGeneratedBy): - if used is None: - used = result - for t in result[RDF.type]: - # Do we know how to generate this? - if t.identifier in extractors: - logger.info("Using %s", used.identifier) - resources[result.identifier] = extractors[t.identifier](used.identifier, result) - return resources[result.identifier] - -def isempty(value): - try: - return isnan(value) - except (TypeError, ValueError): - return value is None - -def clone(value): - __doc__ = '''This is only a JSON-level cloning of objects. Atomic objects are invariant, and don't need to be cloned.''' - if isinstance(value, list): - return [x for x in value] - elif isinstance(value, dict): - return dict(value) - else: - return value - -functions = {} -def get_function(expr, local_keys): - used_local_keys = [k for k in local_keys if k in expr] - key = tuple([expr]+sorted(used_local_keys)) - if key not in functions: - script = '''lambda %s,**kwargs: %s'''% (', '.join(sorted(used_local_keys)), expr) - #print(script) - fn = eval(script) - fn.__name__ = expr.encode("ascii", "ignore").decode('utf8') - functions[key] = fn - return functions[key] - -templates = {} -def get_template(templ): - if templ not in templates: - t = Template(templ) - templates[templ] = t - return templates[templ] - -def flatten_lists(o): - if isinstance(o, list): - result = [] - for x in o: - flattened = flatten_lists(x) - if isinstance(flattened, list): - result.extend(flattened) - else: - result.append(flattened) - return result - elif isinstance(o, dict): - for key in o.keys(): - o[key] = flatten_lists(o[key]) - return o - else: - return o - -def process_row(row, template, rowname, table, resources, transform, variables): - result = [] - e = {'row':row, - 'name': rowname, - 'table': table, - 'resources': resources, - 'template': template, - "transform": transform, - "setl_graph": transform.graph, - "isempty":isempty, - "slugify" : slugify, - "camelcase" : camelcase, - "hash":hash, - "isinstance":isinstance, - "str":str, - "float":float, - "int":int, - "chain": lambda x: chain(*x), - "list":list - } - e.update(variables) - e.update(rdflib.__dict__) - todo = [[x, result, e] for x in template] - - while len(todo) > 0: - task, parent, env = todo.pop() - key = None - value = task - this = None - if isinstance(parent, dict): - if len(task) != 2: - logger.debug(task) - key, value = task - kt = get_template(key) - key = kt.render(**env) - if isinstance(value, dict): - if '@if' in value: - try: - fn = get_function(value['@if'], list(env.keys())) - incl = fn(**env) - if incl is None or not incl: - continue - except KeyError: - continue - except AttributeError: - continue - except TypeError: - continue - except Exception as e: - logger.error("=" * 80) - logger.error("Error evaluating @if conditional: %s", value['@if']) - transform_obj = env.get('transform', {}) - transform_id = transform_obj.identifier if hasattr(transform_obj, 'identifier') else 'unknown' - logger.error("Transform: %s, Row: %s", transform_id, env.get('name', 'unknown')) - logger.error("Error type: %s", type(e).__name__) - logger.error("Error message: %s", str(e)) - logger.error("Row-specific variables:") - for key in ['row', 'name']: - if key in env: - v = env[key] - try: - logger.error(" %s: %s", key, str(v)[:200]) - except Exception: - logger.error(" %s: <%s>", key, type(v).__name__) - logger.error("=" * 80) - raise RuntimeError(f"Error in @if conditional '{value['@if']}': {type(e).__name__}: {str(e)}") from e - if '@for' in value: - f = value['@for'] - if isinstance(f, list): - f = ' '.join(f) - variable_list, expression = f.split(" in ", 1) - variable_list = re.split(r',\s+', variable_list.strip()) - val = value - if '@do' in value: - val = value['@do'] - else: - del val['@for'] - try: - fn = get_function(expression, list(env.keys())) - values = fn(**env) - if values is not None: - for v in values: - if len(variable_list) == 1: - v = [v] - new_env = dict(env) - for i, variable in enumerate(variable_list): - new_env[variable] = v[i] - child = clone(val) - todo.append((child, parent, new_env)) - except KeyError: - pass - except Exception as e: - logger.error("=" * 80) - logger.error("Error in @for loop: %s", value['@for']) - transform_obj = env.get('transform', {}) - transform_id = transform_obj.identifier if hasattr(transform_obj, 'identifier') else 'unknown' - logger.error("Transform: %s, Row: %s", transform_id, env.get('name', 'unknown')) - logger.error("Error type: %s", type(e).__name__) - logger.error("Error message: %s", str(e)) - logger.error("Expression: %s", expression) - logger.error("Variables to assign: %s", variable_list) - logger.error("Available variables: %s", sorted([k for k in env.keys() if not k.startswith('_')])) - logger.error("=" * 80) - raise RuntimeError(f"Error in @for loop '{value['@for']}': {type(e).__name__}: {str(e)}") from e - continue - if '@with' in value: - f = value['@with'] - if isinstance(f, list): - f = ' '.join(f) - expression, variable_list = f.split(" as ", 1) - variable_list = re.split(r',\s+', variable_list.strip()) - val = value - if '@do' in value: - val = value['@do'] - else: - del val['@with'] - try: - fn = get_function(expression, list(env.keys())) - v = fn(**env) - if v is not None: - if len(variable_list) == 1 and not ( - isinstance(v, collections.Iterable) - and not isinstance(v, str)): - v = [v] - new_env = dict(env) - for i, variable in enumerate(variable_list): - new_env[variable] = v[i] - child = clone(val) - todo.append((child, parent, new_env)) - except KeyError: - pass - except Exception as e: - logger.error("=" * 80) - logger.error("Error in @with expression: %s", value['@with']) - transform_obj = env.get('transform', {}) - transform_id = transform_obj.identifier if hasattr(transform_obj, 'identifier') else 'unknown' - logger.error("Transform: %s, Row: %s", transform_id, env.get('name', 'unknown')) - logger.error("Error type: %s", type(e).__name__) - logger.error("Error message: %s", str(e)) - logger.error("Expression: %s", expression) - logger.error("Variables to assign: %s", variable_list) - logger.error("Available variables: %s", sorted([k for k in env.keys() if not k.startswith('_')])) - logger.error("=" * 80) - raise RuntimeError(f"Error in @with expression '{value['@with']}': {type(e).__name__}: {str(e)}") from e - continue - this = {} - for child in list(value.items()): - if child[0] == '@if': - continue - if child[0] == '@for': - continue - todo.append((child, this, env)) - elif isinstance(value, list): - this = [] - for child in value: - todo.append((child, this, env)) - elif isinstance(value, str): - try: - template = get_template(str(value)) - this = template.render(**env) - except Exception as e: - logger.error("=" * 80) - logger.error("Error rendering Jinja2 template: %s", value[:200] if len(value) > 200 else value) - transform_obj = env.get('transform', {}) - transform_id = transform_obj.identifier if hasattr(transform_obj, 'identifier') else 'unknown' - logger.error("Transform: %s, Row: %s", transform_id, env.get('name', 'unknown')) - logger.error("Error type: %s", type(e).__name__) - logger.error("Error message: %s", str(e)) - logger.error("Template variables referenced in template:") - # Try to extract variable references from the template - matches = TEMPLATE_VAR_PATTERN.findall(value) - if matches: - for match in matches: - var_name = match.strip().split('.')[0].split('[')[0].strip() - if var_name in env: - val = env[var_name] - if type(val).__name__ == 'Element': - # XML Element - try: - val = xml.etree.ElementTree.tostring(val).decode('utf-8', errors='replace')[:200] - except Exception: - val = "" - else: - try: - val = str(val)[:200] - except Exception: - val = f"<{type(val).__name__}>" - logger.error(" %s = %s", var_name, val) - else: - logger.error(" %s = ", var_name) - logger.error("=" * 80) - raise RuntimeError(f"Error rendering template: {type(e).__name__}: {str(e)}") from e - else: - this = value - - if key is not None: - parent[key] = this - else: - parent.append(this) - - return flatten_lists(result) - -def json_transform(transform, resources): - logger.info("Transform %s", transform.identifier) - tables = [u for u in transform[prov.used]] - variables = {} - for usage in transform[prov.qualifiedUsage]: - used = usage.value(prov.entity) - role = usage.value(prov.hadRole) - roleID = role.value(dc.identifier) - variables[roleID.value] = resources[used.identifier] - #print "Using", used.identifier, "as", roleID.value - - generated = list(transform.subjects(prov.wasGeneratedBy))[0] - logger.info("Generating %s", generated.identifier) - - connected_downstream_graph = ''' -construct { - ?target ?p ?o -} where { - ?source (<>|!<>)* ?target. - ?target ?p ?o. -} -''' - shape_graph = Graph() - for shape in transform.objects(dc.conformsTo): - if shape[RDF.type:shacl.NodeShape] or shape[RDF.type:shacl.PropertyShape]: - logger.info("Validating against SHACL shape %s", shape.identifier) - shape_graph += transform.graph.query(connected_downstream_graph, - initBindings={"source":shape.identifier}) - if generated.identifier in resources: - result = resources[generated.identifier] - else: - result = ConjunctiveGraph() - if generated[RDF.type : setl.Persisted]: - store = TrigStore() - result = ConjunctiveGraph(store=store) - if generated[RDF.type : setl.Persisted]: - tempdir = tempfile.mktemp() - logger.info("Persisting %s to %s", generated.identifier, tempdir) - result.store.open(tempdir, True) - s = transform.value(prov.value).value - try: - jslt = json.loads(s) - except json.JSONDecodeError as e: - logger.error("Error parsing JSON-LD template for transform %s", transform.identifier) - lineno = getattr(e, 'lineno', 0) - colno = getattr(e, 'colno', 0) - msg = getattr(e, 'msg', str(e)) - logger.error("JSON parsing error at line %d, column %d: %s", lineno, colno, msg) - # Show context around the error (8 lines before, 3 after for better bracket matching) - lines = s.split("\n") - start_line = max(0, lineno - 8) - end_line = min(len(lines), lineno + 3) - logger.error("Template context:") - for i in range(start_line, end_line): - prefix = ">>> " if i == lineno - 1 else " " - logger.error("%s%d: %s", prefix, i + 1, lines[i]) - raise ValueError(f"Invalid JSON-LD template in transform {transform.identifier}: {msg} at line {lineno}, column {colno}") from e - except Exception as e: - logger.error("Error parsing JSON-LD template for transform %s: %s", transform.identifier, str(e)) - logger.error("Template content:\n%s", s[:500]) # Show first 500 chars - raise ValueError(f"Invalid JSON-LD template in transform {transform.identifier}: {str(e)}") from e - context = transform.value(setl.hasContext) - if context is not None: - context = json.loads(context.value) - for t in tables: - logger.info("Using %s", t.identifier) - table = resources[t.identifier] - it = table - if isinstance(table, pandas.DataFrame): - #if run_samples: - # table = table.head() - it = tqdm(table.iterrows(), total=table.shape[0]) - #logger.info("Transforming %s rows.", len(table.index)) - else: - logger.info("Transform %s", t.identifier) - for rowname, row in it: - if run_samples > 0 and rowname >= run_samples: - break - try: - root = None - data = None - root = { - "@id": generated.identifier, - "@graph": process_row(row, jslt, rowname, table, resources, transform, variables) - } - if context is not None: - root['@context'] = context - - #logger.debug(json.dumps(root, indent=4)) - #before = len(result) - #graph = ConjunctiveGraph(identifier=generated.identifier) - #graph.parse(data=json.dumps(root),format="json-ld") - data = json.dumps(root) - #del root - - if len(shape_graph) > 0: - d = ConjunctiveGraph() - d.parse(data=data,format='json-ld') - conforms, report, message = validate(d, - shacl_graph=shape_graph, - advanced=True, - debug=False) - if not conforms: - print(message) - result.parse(data=data, format="json-ld") - #del data - #after = len(result) - #logger.debug("Row "+str(rowname))#+" added "+str(after-before)+" triples.") - #sys.stdout.flush() - except Exception as e: - logger.error("=" * 80) - logger.error("Error in transform %s while processing row %s", transform.identifier, rowname) - if isinstance(table, pandas.DataFrame): - # Format row data with better NaN handling - row_dict = {} - for key, value in dict(row).items(): - if pandas.isna(value): - row_dict[key] = "" - else: - row_dict[key] = value - logger.error("Row data: %s", row_dict) - else: - logger.error("Row identifier: %s", rowname) - - # Try to provide more specific error information - error_type = type(e).__name__ - if "JSON-LD" in str(e) or "json" in str(e).lower(): - logger.error("JSON-LD processing error: %s", str(e)) - if data is not None: - logger.error("Generated JSON-LD (first 1000 chars):\n%s", data[:1000]) - elif hasattr(e, 'lineno'): - logger.error("%s at line %d: %s", error_type, e.lineno, str(e)) - else: - logger.error("%s: %s", error_type, str(e)) - - logger.error("=" * 80) - raise RuntimeError(f"Failed to transform row {rowname} in transform {transform.identifier}: {error_type}: {str(e)}") from e - - resources[generated.identifier] = result - -def transform(transform_resource, resources): - logger.info('Transforming %s',transform_resource.identifier) - - transform_graph = ConjunctiveGraph() - for result in transform_graph.subjects(prov.wasGeneratedBy): - transform_graph = ConjunctiveGraph(identifier=result.identifier) - - used = set(transform_resource[prov.used]) - - for csv in [u for u in used if u[RDF.type:csvw.Table]]: - csv_graph = Graph(store=transform_graph.store, identifier=csv) - csv_graph += graphs[csv.identifier] - - - for script in [u for u in used if u[RDF.type:setl.PythonScript]]: - logger.info("Script: %s", script.identifier) - s = script.value(prov.value).value - l = dict(graph = transform_graph, setl_graph = transform_resource.graph) - gl = dict() - exec(s, gl, l) - - for jsldt in [u for u in used if u[RDF.type:setl.PythonScript]]: - logger.info("Script: %s", script.identifier) - s = script.value(prov.value).value - l = dict(graph = transform_graph, setl_graph = transform_resource.graph) - gl = dict() - exec(s, gl, l) - - for update in [u for u in used if u[RDF.type:sp.Update]]: - logger.info("Update: %s", update.identifier) - query = update.value(prov.value).value - transform_graph.update(query) - - for construct in [u for u in used if u[RDF.type:sp.Construct]]: - logger.info("Construct: %s", construct.identifier) - query = construct.value(prov.value).value - g = transform_graph.query(query) - transform_graph += g - - for csv in [u for u in used if u[RDF.type:csvw.Table]]: - g = Graph(identifier=csv.identifier,store=transform_graph.store) - g.remove((None, None, None)) - transform_graph.store.remove_graph(csv.identifier) - - for result in transform_graph.subjects(prov.wasGeneratedBy): - graphs[result.identifier] = transform_graph - -def _load_open(generated): - if generated.identifier.startswith("file://"): - if os.name == 'nt': # skip the initial - filename = generated.identifier.replace('file:///','').replace('file://','') - else: - filename = generated.identifier.replace('file://','') - - fh = open(filename, 'wb') - for type, pack in packers.items(): - if generated[RDF.type : type]: - return pack(fh) - return fh - -def load(load_resource, resources): - logger.info('Load %s',load_resource.identifier) - file_graph = Dataset(default_union=True) - to_disk = False - for used in load_resource[prov.used]: - if used[RDF.type : setl.Persisted]: - to_disk = True - file_graph = Dataset(store='Sleepycat', default_union=True) - tempdir = tempfile.mkdtemp() - logger.debug("Gathering %s into %s", load_resource.identifier, tempdir) - file_graph.store.open(tempdir, True) - break - if len(list(load_resource[prov.used])) == 1: - logger.info("Using %s",load_resource.value(prov.used).identifier) - file_graph = resources[load_resource.value(prov.used).identifier] - else: - for used in load_resource[prov.used]: - logger.info("Using %s",used.identifier) - used_graph = resources[used.identifier] - file_graph.namespace_manager = used_graph.namespace_manager - #print used_graph.serialize(format="trig") - file_graph.addN(used_graph.quads()) - - for generated in load_resource.subjects(prov.wasGeneratedBy): - # TODO: support LDP-based loading - if generated[RDF.type:pv.File]: - fmt = generated.value(dc['format']) - if fmt is not None: - fmt = fmt.value - if fmt in formats: - fmt = formats[fmt] - #print fmt - with _load_open(generated) as o: - file_graph.serialize(o, format=fmt) - - elif generated[RDF.type:sd.Service]: - from rdflib.plugins.stores.sparqlstore import SPARQLUpdateStore - endpoint = generated.value(sd.endpoint, default=generated).identifier - store = SPARQLUpdateStore(endpoint, endpoint, autocommit=False) - endpoint_graph = Dataset(store=store, identifier=generated.identifier, default_union=True) - endpoint_graph.addN(file_graph.quads()) - endpoint_graph.commit() - #if to_disk: - # file_graph.close() - - -actions = { - setl.Extract : extract, - setl.Transform : json_transform, - setl.Load : load, - setl.PythonScript : create_python_function, - setl.IsEmpty : isempty -} - -def _setl(setl_graph): - global logger - if logger is None: - logger = logging.getLogger(__name__) - resources = {} - resources.update(actions) - - tasks = [setl_graph.resource(t) for t in get_order(setl_graph)] - - for task in tasks: - action = [actions[t.identifier] for t in task[RDF.type] if t.identifier in actions] - if len(action) > 0: - action[0](task, resources) - return resources -logger = None - -import click -@click.command() -@click.option('--quiet', '-q', is_flag=True, default=False, help="Minimize logging.") -@click.option('-n', default=-1, help="Only process the first N rows.", type=int) -#@click.option('--rdf-validation', default=None, help="Save the RDF validation report to this file.") -#@click.option('--text-validation', default=None, help="Save the text validation report to this file.") -@click.argument('script', type=click.Path(exists=True)) -def main(script, rdf_validation=None, text_validation=None, quiet=False, n=-1): - logging_level = logging.DEBUG - if quiet: - logging_level = logging.WARNING - logging.basicConfig(level=logging_level) - - global logger - logger = logging.getLogger(__name__) - - global run_samples - run_samples = n - setl_graph = ConjunctiveGraph() - content = open(script).read() - setl_graph.parse(data=content, format="turtle") - - graphs = _setl(setl_graph) +# Note: _setl is still importable for backward compatibility but not in __all__ diff --git a/setlr/core.py b/setlr/core.py new file mode 100644 index 0000000..e728bcd --- /dev/null +++ b/setlr/core.py @@ -0,0 +1,1027 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +from builtins import str +from builtins import next +from builtins import object +from rdflib import * +from rdflib.util import guess_format +import rdflib +import csv +import json +import sys, collections +import requests +import pandas +import re +import os +from six import text_type as str + +from jinja2 import Template +from toposort import toposort_flatten +from numpy import isnan +import uuid +import tempfile +import ijson +from . import iterparse_filter +#import xml.etree.ElementTree as ET +import xml.etree.ElementTree + +from itertools import chain + +import zipfile +import gzip + +import logging + +from tqdm import tqdm + +import hashlib +from slugify import slugify +from pyshacl import validate + +from .trig_store import TrigStore + +def hash(value): + m = hashlib.sha256() + m.update(value.encode('utf-8')) + return m.hexdigest() + +csvw = Namespace('http://www.w3.org/ns/csvw#') +ov = Namespace('http://open.vocab.org/terms/') +setl = Namespace('http://purl.org/twc/vocab/setl/') +prov = Namespace('http://www.w3.org/ns/prov#') +pv = Namespace('http://purl.org/net/provenance/ns#') +sp = Namespace('http://spinrdf.org/sp#') +sd = Namespace('http://www.w3.org/ns/sparql-service-description#') +dc = Namespace('http://purl.org/dc/terms/') +void = Namespace('http://rdfs.org/ns/void#') +shacl = Namespace('http://www.w3.org/ns/shacl#') +api_vocab = Namespace('http://purl.org/linked-data/api/vocab#') + +sys.setrecursionlimit(10000) + +from requests_testadapter import Resp + +# Regex pattern for extracting Jinja2 template variables (compiled once for performance) +TEMPLATE_VAR_PATTERN = re.compile(r'\{\{([^}]+)\}\}') + +def camelcase(s): + return slugify(s).title().replace("-","") + +class LocalFileAdapter(requests.adapters.HTTPAdapter): + def build_response_from_file(self, request): + file_path = request.url[7:] + with open(file_path, 'rb') as file: + buff = bytearray(os.path.getsize(file_path)) + file.readinto(buff) + resp = Resp(buff) + r = self.build_response(request, resp) + return r + def send(self, request, stream=False, timeout=None, + verify=True, cert=None, proxies=None): + return self.build_response_from_file(request) + +requests_session = requests.session() +requests_session.mount('file://', LocalFileAdapter()) +requests_session.mount('file:///', LocalFileAdapter()) + +datatypeConverters = collections.defaultdict(lambda: str) +datatypeConverters.update({ + XSD.string: str, + XSD.decimal: float, + XSD.integer: int, + XSD.float: float, + XSD.double: float +}) + +run_samples = -1 + +_rdf_formats_to_guess = [ + 'xml', + 'json-ld', + 'trig', + 'nquads', + 'trix' +] + + +def read_csv(location, result): + args = dict( + sep = result.value(csvw.delimiter, default=Literal(",")).value, + #header = result.value(csvw.headerRow, default=Literal(0)).value), + skiprows = result.value(csvw.skipRows, default=Literal(0)).value, + dtype=str, + # dtype = object # Does not seem to play well with future and python2/3 conversion + ) + if result.value(csvw.header): + args['header'] = [0] + with get_content(location, result) as fo: + df = pandas.read_csv(fo, encoding='utf-8', **args) + #logger.debug("Loaded %s", location) + return df + +def read_graph(location, result, g = None): + if g is None: + g = ConjunctiveGraph() + graph = ConjunctiveGraph(store=g.store, identifier=result.identifier) + if len(graph) == 0: + data = get_content(location, result).read() + f = guess_format(location) + for fmt in [f] + _rdf_formats_to_guess: + try: + graph.parse(data=data, format=fmt) + break + except Exception as e: + #print e + pass + if len(graph) == 0: + logger.error("Could not parse graph: %s", location) + if result[RDF.type:OWL.Ontology]: + for ontology in graph.subjects(RDF.type, OWL.Ontology): + imports = [graph.resource(x) for x in graph.objects(ontology, OWL.imports)] + for i in imports: + read_graph(i.identifier, i, g = g) + return g + +class FileLikeFromIter(object): + _closed = False + + def __init__(self, content_iter): + self.iter = content_iter + self.data = b'' + + def __iter__(self): + return self.iter + + def readable(self): + return True + + def writable(self): + return False + + def seekable(self): + return False + + def closed(self): + if self._closed: + return True + if len(self.data) > 0: + return False + try: + self.data = next(self.iter) + except StopIteration: + self.closed = True + return True + return False + + # Enter and Exit are needed to allow this to work with with + def __enter__(self): + return self + + # Could be improved for better error/exception handling + def __exit__(self, err_type, value, tracebock): + pass + + def read(self, n=None): + if n is None: + return self.data + b''.join(l for l in self.iter) + else: + while len(self.data) < n: + try: + self.data = b''.join((self.data, next(self.iter))) + except StopIteration: + break + result, self.data = self.data[:n], self.data[n:] + return result + +def _open_local_file(location): + if location.startswith("file://"): + if os.name == 'nt': # skip the initial + return open(location.replace('file:///','').replace('file://',''),'rb') + else: + return open(location.replace('file://',''),'rb') + +content_handlers = [ + _open_local_file, + lambda location: FileLikeFromIter(requests.get(location,stream=True).iter_content(1024*1024)) +] + +def get_content(location, result): + response = None + for handler in content_handlers: + response = handler(location) + if response is not None: + break + if result[RDF.type:setl.Tempfile]: + result = to_tempfile(response) + + for t in result[RDF.type]: + # Do we know how to unpack this? + if t.identifier in unpackers: + response = unpackers[t.identifier](response) + return response + +def to_tempfile(f): + tf = tempfile.TemporaryFile() + logger.debug("Writing %s to disk.", f) + for chunk in f: + if chunk: # filter out keep-alive new chunks + tf.write(chunk) + tf.seek(0) + logger.debug("Finished writing %s to disk.", f) + return tf + +def unpack_zipfile(f): + zf = zipfile.ZipFile(f, mode='r') + files = zf.infolist() + return zf.open(files[0]) + +unpackers = { +# setl.Tempfile : lambda x: x, + setl.ZipFile : lambda x: unpack_zipfile(to_tempfile(x)), + setl.GZipFile : lambda f: gzip.GzipFile(fileobj=f,mode='r') +} + +packers = { +# setl.Tempfile : lambda x: x, + setl.GZipFile : lambda f: gzip.GzipFile(fileobj=f,mode='wb') +} + +def read_excel(location, result): + args = dict( + sheet_name = result.value(setl.sheetname, default=Literal(0)).value, + header = [int(x) for x in result.value(csvw.headerRow, default=Literal('0')).value.split(',')], + skiprows = result.value(csvw.skipRows, default=Literal(0)).value + ) + if result.value(csvw.header): + args['header'] = [result.value(csvw.header).value] + with get_content(location, result) as fo: + df = pandas.read_excel(fo, encoding='utf-8', **args) + return df + +def read_xml(location, result): + validate_dtd = False + if result[RDF.type:setl.DTDValidatedXML]: + validate_dtd = True + f = iterparse_filter.IterParseFilter(validate_dtd=validate_dtd) + if result.value(setl.xpath) is None: + logger.debug("no xpath to select on from %s", location) + f.iter_end("/*") + for xp in result[setl.xpath]: + f.iter_end(xp.value) + with get_content(location, result) as fo: + for (i, (event, ele)) in enumerate(tqdm(f.iterparse(fo))): + yield i, ele + + +def read_json(location, result): + selector = result.value(api_vocab.selector) + if selector is not None: + selector = selector.value + else: + selector = "" + with get_content(location, result) as fo: + yield from enumerate(tqdm(ijson.items(fo, selector))) + + +extractors = { + setl.XPORT : lambda location, result: pandas.read_sas(get_content(location, result), format='xport'), + setl.SAS7BDAT : lambda location, result: pandas.read_sas(get_content(location, result), format='sas7bdat'), + setl.Excel : read_excel, + csvw.Table : read_csv, + OWL.Ontology : read_graph, + void.Dataset : read_graph, + setl.JSON : read_json, + setl.XML : read_xml, + URIRef("https://www.iana.org/assignments/media-types/text/plain") : lambda location, result: get_content(location, result) +} + + +try: + from bs4 import BeautifulSoup + extractors[setl.HTML] = lambda location, result: BeautifulSoup(get_content(location, result).read(), 'html.parser') +except Exception as e: + pass + + +def load_csv(csv_resource): + column_descriptions = {} + for col in csv_resource[csvw.column]: + label = col.value(RDFS.label).value + column_descriptions[label] = col + csv_graph = Graph(identifier=csv_resource) + s = [x for x in csv.reader(open(str(csv_resource.value(csvw.url).identifier).replace("file://","")), + delimiter=str(csv_resource.value(csvw.delimiter,default=",").value), + quotechar=str(csv_resource.value(csvw.quoteChar,default='"').value))] + header = None + properties = [] + propertyMap = {} + skip_value = csv_resource.value(csvw.null) + if skip_value is not None: + skip_value = skip_value.value + for i, r in enumerate(s): + if header is None: + header = r + for j, h in enumerate(header): + col_desc = None + if h in column_descriptions: + col_desc = column_descriptions[h] + col = csv_graph.resource(URIRef("urn:col_"+str(h))) + col.add(RDFS.label, Literal(h)) + col.add(ov.csvCol, Literal(j)) + if col_desc is not None: + col.add(RDFS.range, col_desc.value(RDFS.range, default=XSD.string)) + properties.append(col) + propertyMap[h] = col + continue + res = csv_graph.resource(csv_resource.identifier+"_row_"+str(i)) + res.add(RDF.type, csvw.Row) + res.add(csvw.rownum, Literal(i)) + for j, value in enumerate(r): + if skip_value is not None and skip_value == value: + continue + #print i, j, value + prop = properties[j] + datatype = prop.value(RDFS['range'], default=XSD.string) + lit = Literal(value, datatype=datatype.identifier) + #print i, prop.identifier, lit.n3() + res.add(prop.identifier, lit) + logger.debug("Table has %s rows, %s columns, and %s triples", len(s), len(header), len(csv_graph)) + return csv_graph + +formats = { + None:'xml', + "application/rdf+xml":'xml', + "text/rdf":'xml', + 'text/turtle':'turtle', + 'application/turtle':'turtle', + 'application/x-turtle':'turtle', + 'text/plain':'nt', + 'text/n3':'n3', + 'application/trig':'trig', + 'application/json':'json-ld' +} + +def create_python_function(f, resources): + global_vars = {'this' : f, 'resources': resources} + local_vars = {} + script = f.value(prov.value) + for qd in f[prov.qualifiedDerivation]: + entity = resources[qd.value(prov.entity).identifier] + name = qd.value(prov.hadRole).value(dc.identifier) + local_vars[name.value] = entity + exec(script.value, local_vars, global_vars) + resources[f.identifier] = global_vars['result'] + +def get_order(setl_graph): + nodes = collections.defaultdict(set) + + for typ in actions: + for task in setl_graph.subjects(RDF.type, typ): + task = setl_graph.resource(task) + for used in task[prov.used]: + nodes[task.identifier].add(used.identifier) + + for usage in task[prov.qualifiedUsage]: + used = usage.value(prov.entity) + nodes[task.identifier].add(used.identifier) + for generated in task.subjects(prov.wasGeneratedBy): + nodes[generated.identifier].add(task.identifier) + for derivation in task[prov.qualifiedDerivation]: + derived = derivation.value(prov.entity) + nodes[task.identifier].add(derived.identifier) + + return toposort_flatten(nodes) + +def extract(e, resources): + logger.info('Extract %s',e.identifier) + used = e.value(prov.used) + for result in e.subjects(prov.wasGeneratedBy): + if used is None: + used = result + for t in result[RDF.type]: + # Do we know how to generate this? + if t.identifier in extractors: + logger.info("Using %s", used.identifier) + resources[result.identifier] = extractors[t.identifier](used.identifier, result) + return resources[result.identifier] + +def isempty(value): + try: + return isnan(value) + except (TypeError, ValueError): + return value is None + +def clone(value): + __doc__ = '''This is only a JSON-level cloning of objects. Atomic objects are invariant, and don't need to be cloned.''' + if isinstance(value, list): + return [x for x in value] + elif isinstance(value, dict): + return dict(value) + else: + return value + +functions = {} +def get_function(expr, local_keys): + used_local_keys = [k for k in local_keys if k in expr] + key = tuple([expr]+sorted(used_local_keys)) + if key not in functions: + script = '''lambda %s,**kwargs: %s'''% (', '.join(sorted(used_local_keys)), expr) + #print(script) + fn = eval(script) + fn.__name__ = expr.encode("ascii", "ignore").decode('utf8') + functions[key] = fn + return functions[key] + +templates = {} +def get_template(templ): + if templ not in templates: + t = Template(templ) + templates[templ] = t + return templates[templ] + +def flatten_lists(o): + if isinstance(o, list): + result = [] + for x in o: + flattened = flatten_lists(x) + if isinstance(flattened, list): + result.extend(flattened) + else: + result.append(flattened) + return result + elif isinstance(o, dict): + for key in o.keys(): + o[key] = flatten_lists(o[key]) + return o + else: + return o + +def process_row(row, template, rowname, table, resources, transform, variables): + result = [] + e = {'row':row, + 'name': rowname, + 'table': table, + 'resources': resources, + 'template': template, + "transform": transform, + "setl_graph": transform.graph, + "isempty":isempty, + "slugify" : slugify, + "camelcase" : camelcase, + "hash":hash, + "isinstance":isinstance, + "str":str, + "float":float, + "int":int, + "chain": lambda x: chain(*x), + "list":list + } + e.update(variables) + e.update(rdflib.__dict__) + todo = [[x, result, e] for x in template] + + while len(todo) > 0: + task, parent, env = todo.pop() + key = None + value = task + this = None + if isinstance(parent, dict): + if len(task) != 2: + logger.debug(task) + key, value = task + kt = get_template(key) + key = kt.render(**env) + if isinstance(value, dict): + if '@if' in value: + try: + fn = get_function(value['@if'], list(env.keys())) + incl = fn(**env) + if incl is None or not incl: + continue + except KeyError: + continue + except AttributeError: + continue + except TypeError: + continue + except Exception as e: + logger.error("=" * 80) + logger.error("Error evaluating @if conditional: %s", value['@if']) + transform_obj = env.get('transform', {}) + transform_id = transform_obj.identifier if hasattr(transform_obj, 'identifier') else 'unknown' + logger.error("Transform: %s, Row: %s", transform_id, env.get('name', 'unknown')) + logger.error("Error type: %s", type(e).__name__) + logger.error("Error message: %s", str(e)) + logger.error("Row-specific variables:") + for key in ['row', 'name']: + if key in env: + v = env[key] + try: + logger.error(" %s: %s", key, str(v)[:200]) + except Exception: + logger.error(" %s: <%s>", key, type(v).__name__) + logger.error("=" * 80) + raise RuntimeError(f"Error in @if conditional '{value['@if']}': {type(e).__name__}: {str(e)}") from e + if '@for' in value: + f = value['@for'] + if isinstance(f, list): + f = ' '.join(f) + variable_list, expression = f.split(" in ", 1) + variable_list = re.split(r',\s+', variable_list.strip()) + val = value + if '@do' in value: + val = value['@do'] + else: + del val['@for'] + try: + fn = get_function(expression, list(env.keys())) + values = fn(**env) + if values is not None: + for v in values: + if len(variable_list) == 1: + v = [v] + new_env = dict(env) + for i, variable in enumerate(variable_list): + new_env[variable] = v[i] + child = clone(val) + todo.append((child, parent, new_env)) + except KeyError: + pass + except Exception as e: + logger.error("=" * 80) + logger.error("Error in @for loop: %s", value['@for']) + transform_obj = env.get('transform', {}) + transform_id = transform_obj.identifier if hasattr(transform_obj, 'identifier') else 'unknown' + logger.error("Transform: %s, Row: %s", transform_id, env.get('name', 'unknown')) + logger.error("Error type: %s", type(e).__name__) + logger.error("Error message: %s", str(e)) + logger.error("Expression: %s", expression) + logger.error("Variables to assign: %s", variable_list) + logger.error("Available variables: %s", sorted([k for k in env.keys() if not k.startswith('_')])) + logger.error("=" * 80) + raise RuntimeError(f"Error in @for loop '{value['@for']}': {type(e).__name__}: {str(e)}") from e + continue + if '@with' in value: + f = value['@with'] + if isinstance(f, list): + f = ' '.join(f) + expression, variable_list = f.split(" as ", 1) + variable_list = re.split(r',\s+', variable_list.strip()) + val = value + if '@do' in value: + val = value['@do'] + else: + del val['@with'] + try: + fn = get_function(expression, list(env.keys())) + v = fn(**env) + if v is not None: + if len(variable_list) == 1 and not ( + isinstance(v, collections.Iterable) + and not isinstance(v, str)): + v = [v] + new_env = dict(env) + for i, variable in enumerate(variable_list): + new_env[variable] = v[i] + child = clone(val) + todo.append((child, parent, new_env)) + except KeyError: + pass + except Exception as e: + logger.error("=" * 80) + logger.error("Error in @with expression: %s", value['@with']) + transform_obj = env.get('transform', {}) + transform_id = transform_obj.identifier if hasattr(transform_obj, 'identifier') else 'unknown' + logger.error("Transform: %s, Row: %s", transform_id, env.get('name', 'unknown')) + logger.error("Error type: %s", type(e).__name__) + logger.error("Error message: %s", str(e)) + logger.error("Expression: %s", expression) + logger.error("Variables to assign: %s", variable_list) + logger.error("Available variables: %s", sorted([k for k in env.keys() if not k.startswith('_')])) + logger.error("=" * 80) + raise RuntimeError(f"Error in @with expression '{value['@with']}': {type(e).__name__}: {str(e)}") from e + continue + this = {} + for child in list(value.items()): + if child[0] == '@if': + continue + if child[0] == '@for': + continue + todo.append((child, this, env)) + elif isinstance(value, list): + this = [] + for child in value: + todo.append((child, this, env)) + elif isinstance(value, str): + try: + template = get_template(str(value)) + this = template.render(**env) + except Exception as e: + logger.error("=" * 80) + logger.error("Error rendering Jinja2 template: %s", value[:200] if len(value) > 200 else value) + transform_obj = env.get('transform', {}) + transform_id = transform_obj.identifier if hasattr(transform_obj, 'identifier') else 'unknown' + logger.error("Transform: %s, Row: %s", transform_id, env.get('name', 'unknown')) + logger.error("Error type: %s", type(e).__name__) + logger.error("Error message: %s", str(e)) + logger.error("Template variables referenced in template:") + # Try to extract variable references from the template + matches = TEMPLATE_VAR_PATTERN.findall(value) + if matches: + for match in matches: + var_name = match.strip().split('.')[0].split('[')[0].strip() + if var_name in env: + val = env[var_name] + if type(val).__name__ == 'Element': + # XML Element + try: + val = xml.etree.ElementTree.tostring(val).decode('utf-8', errors='replace')[:200] + except Exception: + val = "" + else: + try: + val = str(val)[:200] + except Exception: + val = f"<{type(val).__name__}>" + logger.error(" %s = %s", var_name, val) + else: + logger.error(" %s = ", var_name) + logger.error("=" * 80) + raise RuntimeError(f"Error rendering template: {type(e).__name__}: {str(e)}") from e + else: + this = value + + if key is not None: + parent[key] = this + else: + parent.append(this) + + return flatten_lists(result) + +def json_transform(transform, resources): + logger.info("Transform %s", transform.identifier) + tables = [u for u in transform[prov.used]] + variables = {} + for usage in transform[prov.qualifiedUsage]: + used = usage.value(prov.entity) + role = usage.value(prov.hadRole) + roleID = role.value(dc.identifier) + variables[roleID.value] = resources[used.identifier] + #print "Using", used.identifier, "as", roleID.value + + generated = list(transform.subjects(prov.wasGeneratedBy))[0] + logger.info("Generating %s", generated.identifier) + + connected_downstream_graph = ''' +construct { + ?target ?p ?o +} where { + ?source (<>|!<>)* ?target. + ?target ?p ?o. +} +''' + shape_graph = Graph() + for shape in transform.objects(dc.conformsTo): + if shape[RDF.type:shacl.NodeShape] or shape[RDF.type:shacl.PropertyShape]: + logger.info("Validating against SHACL shape %s", shape.identifier) + shape_graph += transform.graph.query(connected_downstream_graph, + initBindings={"source":shape.identifier}) + if generated.identifier in resources: + result = resources[generated.identifier] + else: + result = ConjunctiveGraph() + if generated[RDF.type : setl.Persisted]: + store = TrigStore() + result = ConjunctiveGraph(store=store) + if generated[RDF.type : setl.Persisted]: + tempdir = tempfile.mktemp() + logger.info("Persisting %s to %s", generated.identifier, tempdir) + result.store.open(tempdir, True) + s = transform.value(prov.value).value + try: + jslt = json.loads(s) + except json.JSONDecodeError as e: + logger.error("Error parsing JSON-LD template for transform %s", transform.identifier) + lineno = getattr(e, 'lineno', 0) + colno = getattr(e, 'colno', 0) + msg = getattr(e, 'msg', str(e)) + logger.error("JSON parsing error at line %d, column %d: %s", lineno, colno, msg) + # Show context around the error (8 lines before, 3 after for better bracket matching) + lines = s.split("\n") + start_line = max(0, lineno - 8) + end_line = min(len(lines), lineno + 3) + logger.error("Template context:") + for i in range(start_line, end_line): + prefix = ">>> " if i == lineno - 1 else " " + logger.error("%s%d: %s", prefix, i + 1, lines[i]) + raise ValueError(f"Invalid JSON-LD template in transform {transform.identifier}: {msg} at line {lineno}, column {colno}") from e + except Exception as e: + logger.error("Error parsing JSON-LD template for transform %s: %s", transform.identifier, str(e)) + logger.error("Template content:\n%s", s[:500]) # Show first 500 chars + raise ValueError(f"Invalid JSON-LD template in transform {transform.identifier}: {str(e)}") from e + context = transform.value(setl.hasContext) + if context is not None: + context = json.loads(context.value) + for t in tables: + logger.info("Using %s", t.identifier) + table = resources[t.identifier] + it = table + if isinstance(table, pandas.DataFrame): + #if run_samples: + # table = table.head() + it = tqdm(table.iterrows(), total=table.shape[0]) + #logger.info("Transforming %s rows.", len(table.index)) + else: + logger.info("Transform %s", t.identifier) + for rowname, row in it: + if run_samples > 0 and rowname >= run_samples: + break + try: + root = None + data = None + root = { + "@id": generated.identifier, + "@graph": process_row(row, jslt, rowname, table, resources, transform, variables) + } + if context is not None: + root['@context'] = context + + #logger.debug(json.dumps(root, indent=4)) + #before = len(result) + #graph = ConjunctiveGraph(identifier=generated.identifier) + #graph.parse(data=json.dumps(root),format="json-ld") + data = json.dumps(root) + #del root + + if len(shape_graph) > 0: + d = ConjunctiveGraph() + d.parse(data=data,format='json-ld') + conforms, report, message = validate(d, + shacl_graph=shape_graph, + advanced=True, + debug=False) + if not conforms: + print(message) + result.parse(data=data, format="json-ld") + #del data + #after = len(result) + #logger.debug("Row "+str(rowname))#+" added "+str(after-before)+" triples.") + #sys.stdout.flush() + except Exception as e: + logger.error("=" * 80) + logger.error("Error in transform %s while processing row %s", transform.identifier, rowname) + if isinstance(table, pandas.DataFrame): + # Format row data with better NaN handling + row_dict = {} + for key, value in dict(row).items(): + if pandas.isna(value): + row_dict[key] = "" + else: + row_dict[key] = value + logger.error("Row data: %s", row_dict) + else: + logger.error("Row identifier: %s", rowname) + + # Try to provide more specific error information + error_type = type(e).__name__ + if "JSON-LD" in str(e) or "json" in str(e).lower(): + logger.error("JSON-LD processing error: %s", str(e)) + if data is not None: + logger.error("Generated JSON-LD (first 1000 chars):\n%s", data[:1000]) + elif hasattr(e, 'lineno'): + logger.error("%s at line %d: %s", error_type, e.lineno, str(e)) + else: + logger.error("%s: %s", error_type, str(e)) + + logger.error("=" * 80) + raise RuntimeError(f"Failed to transform row {rowname} in transform {transform.identifier}: {error_type}: {str(e)}") from e + + resources[generated.identifier] = result + +def transform(transform_resource, resources): + logger.info('Transforming %s',transform_resource.identifier) + + transform_graph = ConjunctiveGraph() + for result in transform_graph.subjects(prov.wasGeneratedBy): + transform_graph = ConjunctiveGraph(identifier=result.identifier) + + used = set(transform_resource[prov.used]) + + for csv in [u for u in used if u[RDF.type:csvw.Table]]: + csv_graph = Graph(store=transform_graph.store, identifier=csv) + csv_graph += graphs[csv.identifier] + + + for script in [u for u in used if u[RDF.type:setl.PythonScript]]: + logger.info("Script: %s", script.identifier) + s = script.value(prov.value).value + l = dict(graph = transform_graph, setl_graph = transform_resource.graph) + gl = dict() + exec(s, gl, l) + + for jsldt in [u for u in used if u[RDF.type:setl.PythonScript]]: + logger.info("Script: %s", script.identifier) + s = script.value(prov.value).value + l = dict(graph = transform_graph, setl_graph = transform_resource.graph) + gl = dict() + exec(s, gl, l) + + for update in [u for u in used if u[RDF.type:sp.Update]]: + logger.info("Update: %s", update.identifier) + query = update.value(prov.value).value + transform_graph.update(query) + + for construct in [u for u in used if u[RDF.type:sp.Construct]]: + logger.info("Construct: %s", construct.identifier) + query = construct.value(prov.value).value + g = transform_graph.query(query) + transform_graph += g + + for csv in [u for u in used if u[RDF.type:csvw.Table]]: + g = Graph(identifier=csv.identifier,store=transform_graph.store) + g.remove((None, None, None)) + transform_graph.store.remove_graph(csv.identifier) + + for result in transform_graph.subjects(prov.wasGeneratedBy): + graphs[result.identifier] = transform_graph + +def _load_open(generated): + if generated.identifier.startswith("file://"): + if os.name == 'nt': # skip the initial + filename = generated.identifier.replace('file:///','').replace('file://','') + else: + filename = generated.identifier.replace('file://','') + + fh = open(filename, 'wb') + for type, pack in packers.items(): + if generated[RDF.type : type]: + return pack(fh) + return fh + +def load(load_resource, resources): + logger.info('Load %s',load_resource.identifier) + file_graph = Dataset(default_union=True) + to_disk = False + for used in load_resource[prov.used]: + if used[RDF.type : setl.Persisted]: + to_disk = True + file_graph = Dataset(store='Sleepycat', default_union=True) + tempdir = tempfile.mkdtemp() + logger.debug("Gathering %s into %s", load_resource.identifier, tempdir) + file_graph.store.open(tempdir, True) + break + if len(list(load_resource[prov.used])) == 1: + logger.info("Using %s",load_resource.value(prov.used).identifier) + file_graph = resources[load_resource.value(prov.used).identifier] + else: + for used in load_resource[prov.used]: + logger.info("Using %s",used.identifier) + used_graph = resources[used.identifier] + file_graph.namespace_manager = used_graph.namespace_manager + #print used_graph.serialize(format="trig") + file_graph.addN(used_graph.quads()) + + for generated in load_resource.subjects(prov.wasGeneratedBy): + # TODO: support LDP-based loading + if generated[RDF.type:pv.File]: + fmt = generated.value(dc['format']) + if fmt is not None: + fmt = fmt.value + if fmt in formats: + fmt = formats[fmt] + #print fmt + with _load_open(generated) as o: + file_graph.serialize(o, format=fmt) + + elif generated[RDF.type:sd.Service]: + from rdflib.plugins.stores.sparqlstore import SPARQLUpdateStore + endpoint = generated.value(sd.endpoint, default=generated).identifier + store = SPARQLUpdateStore(endpoint, endpoint, autocommit=False) + endpoint_graph = Dataset(store=store, identifier=generated.identifier, default_union=True) + endpoint_graph.addN(file_graph.quads()) + endpoint_graph.commit() + #if to_disk: + # file_graph.close() + + +actions = { + setl.Extract : extract, + setl.Transform : json_transform, + setl.Load : load, + setl.PythonScript : create_python_function, + setl.IsEmpty : isempty +} + +def _setl(setl_graph): + """Internal implementation function. Use run_setl() instead. + + This function is deprecated and maintained for backward compatibility. + + Args: + setl_graph: A ConjunctiveGraph containing the SETL script. + + Returns: + dict: A dictionary of resources created during the SETL process. + """ + import warnings + warnings.warn( + "_setl() is deprecated and will be removed in a future version. " + "Use run_setl() instead, which provides the same functionality with better documentation.", + DeprecationWarning, + stacklevel=2 + ) + return run_setl(setl_graph) + + +def run_setl(setl_graph): + """Execute a SETL (Semantic Extract, Transform, Load) script. + + This is the main entry point for programmatically running SETL scripts. + It processes a SETL graph containing extraction, transformation, and loading + instructions for working with RDF data. + + Args: + setl_graph (ConjunctiveGraph): A ConjunctiveGraph containing the SETL script + in RDF format. The graph should define resources with types from the + SETL vocabulary (http://purl.org/twc/vocab/setl/) including: + - setl:Extract: Extract data from sources + - setl:Transform: Transform data using JSON-LD templates + - setl:Load: Load data to destinations + + Returns: + dict: A dictionary mapping resource URIs to their generated content. + The dictionary contains: + - Extracted data (DataFrames, RDF graphs, etc.) + - Transformed RDF graphs + - References to action functions + + Example: + >>> from rdflib import ConjunctiveGraph + >>> from setlr import run_setl + >>> + >>> # Load a SETL script + >>> setl_graph = ConjunctiveGraph() + >>> setl_graph.parse("my_script.setl.ttl", format="turtle") + >>> + >>> # Execute the script + >>> resources = run_setl(setl_graph) + >>> + >>> # Access generated resources + >>> output_graph = resources['http://example.com/output'] + + Raises: + RuntimeError: If there are errors during extraction, transformation, or loading. + ValueError: If the SETL script contains invalid JSON-LD templates or configuration. + + Note: + This function initializes the module logger if not already set and processes + all SETL tasks in topological order based on their dependencies. + """ + global logger + if logger is None: + logger = logging.getLogger(__name__) + resources = {} + resources.update(actions) + + tasks = [setl_graph.resource(t) for t in get_order(setl_graph)] + + for task in tasks: + action = [actions[t.identifier] for t in task[RDF.type] if t.identifier in actions] + if len(action) > 0: + action[0](task, resources) + return resources + + +logger = None + +import click +@click.command() +@click.option('--quiet', '-q', is_flag=True, default=False, help="Minimize logging.") +@click.option('-n', default=-1, help="Only process the first N rows.", type=int) +#@click.option('--rdf-validation', default=None, help="Save the RDF validation report to this file.") +#@click.option('--text-validation', default=None, help="Save the text validation report to this file.") +@click.argument('script', type=click.Path(exists=True)) +def main(script, rdf_validation=None, text_validation=None, quiet=False, n=-1): + """Command-line interface for running SETL scripts. + + Args: + script: Path to the SETL script file (Turtle format). + quiet: If True, minimize logging output. + n: Only process the first N rows (-1 for all rows). + """ + logging_level = logging.DEBUG + if quiet: + logging_level = logging.WARNING + logging.basicConfig(level=logging_level) + + global logger + logger = logging.getLogger(__name__) + + global run_samples + run_samples = n + setl_graph = ConjunctiveGraph() + content = open(script).read() + setl_graph.parse(data=content, format="turtle") + + graphs = run_setl(setl_graph) diff --git a/tests/setlr_test/test_api_compatibility.py b/tests/setlr_test/test_api_compatibility.py new file mode 100644 index 0000000..698fbf2 --- /dev/null +++ b/tests/setlr_test/test_api_compatibility.py @@ -0,0 +1,98 @@ +import unittest +import warnings +from rdflib import ConjunctiveGraph + +# Import setlr module +import setlr + + +class TestBackwardCompatibility(unittest.TestCase): + """Test that backward compatibility with _setl() is maintained""" + + def test_setl_deprecated_warning(self): + """Test that _setl() shows deprecation warning""" + setl_graph = ConjunctiveGraph() + + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + result = setlr._setl(setl_graph) + + # Find our specific deprecation warning + our_warnings = [warning for warning in w if "_setl()" in str(warning.message)] + self.assertTrue(len(our_warnings) > 0, "Expected deprecation warning for _setl()") + self.assertIn("Use run_setl() instead", str(our_warnings[0].message)) + + def test_setl_still_works(self): + """Test that _setl() still functions correctly despite deprecation""" + from rdflib import URIRef + setl_graph = ConjunctiveGraph() + + with warnings.catch_warnings(record=True): + warnings.simplefilter("always") + result = setlr._setl(setl_graph) + + # Check that result is a dictionary + self.assertIsInstance(result, dict) + # Check that it contains the expected actions (keys are URIRef objects) + self.assertIn(URIRef('http://purl.org/twc/vocab/setl/Extract'), result) + self.assertIn(URIRef('http://purl.org/twc/vocab/setl/Transform'), result) + + +class TestNewAPI(unittest.TestCase): + """Test the new run_setl() API""" + + def test_run_setl_exists(self): + """Test that run_setl() is accessible""" + self.assertTrue(hasattr(setlr, 'run_setl')) + self.assertTrue(callable(setlr.run_setl)) + + def test_run_setl_basic_functionality(self): + """Test that run_setl() works correctly""" + from rdflib import URIRef + setl_graph = ConjunctiveGraph() + result = setlr.run_setl(setl_graph) + + # Check that result is a dictionary + self.assertIsInstance(result, dict) + # Check that it contains the expected actions (keys are URIRef objects) + self.assertIn(URIRef('http://purl.org/twc/vocab/setl/Extract'), result) + self.assertIn(URIRef('http://purl.org/twc/vocab/setl/Transform'), result) + self.assertIn(URIRef('http://purl.org/twc/vocab/setl/Load'), result) + + def test_run_setl_no_deprecation_warning(self): + """Test that run_setl() does not produce deprecation warning""" + setl_graph = ConjunctiveGraph() + + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + result = setlr.run_setl(setl_graph) + + # Filter to only our deprecation warnings (not rdflib's) + our_warnings = [warning for warning in w if "_setl()" in str(warning.message)] + self.assertEqual(len(our_warnings), 0, "run_setl() should not produce deprecation warning") + + def test_run_setl_has_documentation(self): + """Test that run_setl() has proper documentation""" + self.assertIsNotNone(setlr.run_setl.__doc__) + self.assertIn("Execute a SETL", setlr.run_setl.__doc__) + self.assertIn("Args:", setlr.run_setl.__doc__) + self.assertIn("Returns:", setlr.run_setl.__doc__) + self.assertIn("Example:", setlr.run_setl.__doc__) + + def test_setl_and_run_setl_equivalent(self): + """Test that _setl() and run_setl() produce the same results""" + setl_graph1 = ConjunctiveGraph() + setl_graph2 = ConjunctiveGraph() + + with warnings.catch_warnings(record=True): + warnings.simplefilter("always") + result1 = setlr._setl(setl_graph1) + + result2 = setlr.run_setl(setl_graph2) + + # Both should return dictionaries with the same keys + self.assertEqual(set(result1.keys()), set(result2.keys())) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/setlr_test/test_error_messages.py b/tests/setlr_test/test_error_messages.py index ec3d5fe..39eb791 100644 --- a/tests/setlr_test/test_error_messages.py +++ b/tests/setlr_test/test_error_messages.py @@ -19,19 +19,21 @@ class TestErrorMessages(unittest.TestCase): def setUp(self): """Set up logging to capture error messages""" - # Initialize the setlr logger - setlr.logger = logging.getLogger('setlr') - setlr.logger.setLevel(logging.ERROR) + # Initialize the setlr.core logger + import setlr.core + setlr.core.logger = logging.getLogger('setlr') + setlr.core.logger.setLevel(logging.ERROR) self.log_capture = StringIO() self.handler = logging.StreamHandler(self.log_capture) self.handler.setLevel(logging.ERROR) - setlr.logger.addHandler(self.handler) + setlr.core.logger.addHandler(self.handler) def tearDown(self): """Clean up logging""" - if setlr.logger: - setlr.logger.removeHandler(self.handler) + import setlr.core + if setlr.core.logger: + setlr.core.logger.removeHandler(self.handler) def test_invalid_json_template(self): """Test error message when JSON template has syntax errors""" From f2ac96573d6f746315c0b9b144c23fe7729432d6 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 18 Jan 2026 04:46:06 +0000 Subject: [PATCH 3/9] Add migration documentation and verify all functionality Co-authored-by: jpmccu <602385+jpmccu@users.noreply.github.com> --- MIGRATION.md | 166 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 166 insertions(+) create mode 100644 MIGRATION.md diff --git a/MIGRATION.md b/MIGRATION.md new file mode 100644 index 0000000..459a730 --- /dev/null +++ b/MIGRATION.md @@ -0,0 +1,166 @@ +# Migration to pyproject.toml and API Improvements + +This document describes the changes made to migrate the project to modern Python packaging standards and improve the API. + +## Changes Made + +### 1. Migration to pyproject.toml + +The project has been migrated from `setup.py` to `pyproject.toml`, following PEP 517/518 standards for modern Python packaging. + +- **New file**: `pyproject.toml` - Contains all project metadata, dependencies, and build configuration +- **Status of setup.py**: The old `setup.py` file is still present for compatibility but is no longer the primary packaging configuration + +### 2. Code Restructuring + +The implementation code has been moved from `setlr/__init__.py` to `setlr/core.py` following best practices: + +- **setlr/core.py**: Contains all implementation code (916+ lines) +- **setlr/__init__.py**: Now serves as a clean public API interface (~90 lines) + +This separation provides: +- Better code organization +- Clearer public API surface +- Easier maintenance +- Improved IDE support and code navigation + +### 3. New Public API: `run_setl()` + +A new, well-documented public function `run_setl()` has been introduced: + +```python +from rdflib import ConjunctiveGraph +from setlr import run_setl + +# Load a SETL script +setl_graph = ConjunctiveGraph() +setl_graph.parse("my_script.setl.ttl", format="turtle") + +# Execute the script +resources = run_setl(setl_graph) + +# Access generated resources +output_graph = resources['http://example.com/output'] +``` + +**Features:** +- Comprehensive docstring with examples +- Proper type hints in documentation +- Clear description of parameters and return values +- Usage examples + +### 4. Backward Compatibility + +The old `_setl()` function is still available for backward compatibility: + +```python +from setlr import _setl # Still works, but deprecated + +# Old code continues to work +resources = _setl(setl_graph) +``` + +**Deprecation Warning:** +- Using `_setl()` will emit a `DeprecationWarning` +- The warning suggests using `run_setl()` instead +- No breaking changes - existing code continues to work + +### 5. Exported API + +The following are now officially exported from the `setlr` package: + +**Main Functions:** +- `run_setl()` - Primary API function (recommended) +- `_setl()` - Deprecated, use `run_setl()` instead +- `main()` - CLI entry point + +**Utility Functions:** +- `read_csv()`, `read_excel()`, `read_json()`, `read_xml()`, `read_graph()` +- `extract()`, `json_transform()`, `transform()`, `load()` +- `isempty()`, `hash()`, `camelcase()`, `get_content()` + +**Namespaces:** +- `csvw`, `ov`, `setl`, `prov`, `pv`, `sp`, `sd`, `dc`, `void`, `shacl`, `api_vocab` + +## Migration Guide for Users + +### If you were using `_setl()`: + +**Before:** +```python +from setlr import _setl + +resources = _setl(setl_graph) +``` + +**After (recommended):** +```python +from setlr import run_setl + +resources = run_setl(setl_graph) +``` + +**Note:** Your old code will continue to work, but you'll see a deprecation warning. Update at your convenience. + +### If you were importing internal functions: + +**Before:** +```python +from setlr import read_csv, extract +``` + +**After:** +```python +from setlr import read_csv, extract # Still works! +``` + +No changes needed - all utility functions are properly exported. + +## For Package Maintainers + +### Building the Package + +With pyproject.toml, you can now build the package using modern tools: + +```bash +# Install build tool +pip install build + +# Build the package +python -m build +``` + +This creates both wheel and source distributions in the `dist/` directory. + +### Installing from Source + +```bash +# Development installation +pip install -e . + +# Regular installation +pip install . +``` + +### Running Tests + +```bash +# Install test dependencies +pip install nose2 coverage + +# Run tests +nose2 --verbose +``` + +## Benefits of This Migration + +1. **Modern Standards**: Uses PEP 517/518 standards for Python packaging +2. **Better Documentation**: Clear, comprehensive API documentation +3. **Improved Structure**: Cleaner separation between public API and implementation +4. **Backward Compatible**: No breaking changes for existing users +5. **Future-Proof**: Follows current Python best practices +6. **Better IDE Support**: Clearer module structure aids code completion and navigation + +## Questions or Issues? + +If you encounter any issues with the migration or have questions about the new API, please open an issue on GitHub. From d1a4dff9f9a29f570a9d02dd3559a2fa8e75d28f Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 18 Jan 2026 04:48:07 +0000 Subject: [PATCH 4/9] Remove pip from runtime dependencies (not needed) Co-authored-by: jpmccu <602385+jpmccu@users.noreply.github.com> --- pyproject.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 6facde3..0b1c7f1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,6 @@ classifiers = [ requires-python = ">=3.8" dependencies = [ "future", - "pip>=9.0.0", "cython", "numpy", "rdflib>=6.0.0", From 912b6007d8f9c093b3e40d1d0c6b72230ea3c855 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 18 Jan 2026 04:49:27 +0000 Subject: [PATCH 5/9] Include _setl in __all__ for wildcard import compatibility Co-authored-by: jpmccu <602385+jpmccu@users.noreply.github.com> --- setlr/__init__.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/setlr/__init__.py b/setlr/__init__.py index eba796a..96fd12b 100644 --- a/setlr/__init__.py +++ b/setlr/__init__.py @@ -57,6 +57,7 @@ # Define what gets imported with "from setlr import *" __all__ = [ 'run_setl', + '_setl', # Deprecated but included for backward compatibility with wildcard imports 'main', # Include commonly used utilities 'read_csv', @@ -84,7 +85,4 @@ 'void', 'shacl', 'api_vocab', - # Keep _setl for backward compatibility but not in __all__ to discourage use ] - -# Note: _setl is still importable for backward compatibility but not in __all__ From 6f7583e2168ee4a07b861de10ae607ceaaeca060 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 18 Jan 2026 16:04:36 +0000 Subject: [PATCH 6/9] Add development scripts for bootstrap, build, and release Co-authored-by: jpmccu <602385+jpmccu@users.noreply.github.com> --- script/README.md | 101 +++++++++++++++++++++++++++++++++++++++++++++ script/bootstrap | 80 ++++++++++++++++++++++++++++++++++++ script/build | 79 +++++++++++++++++++++++++++++++++++ script/release | 105 +++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 365 insertions(+) create mode 100644 script/README.md create mode 100755 script/bootstrap create mode 100755 script/build create mode 100755 script/release diff --git a/script/README.md b/script/README.md new file mode 100644 index 0000000..bc0aa41 --- /dev/null +++ b/script/README.md @@ -0,0 +1,101 @@ +# Development Scripts + +This directory contains scripts for setting up, building, and releasing the setlr project. + +## Scripts + +### `bootstrap` + +Set up a virtual environment suitable for developing and using the project, including all package requirements for build and release. + +**Usage:** +```bash +./script/bootstrap +``` + +This script will: +- Create a Python virtual environment in `venv/` +- Install the project in editable mode with all dependencies +- Install development dependencies (nose2, coverage, flake8, pylint, etc.) +- Install build and release tools (build, wheel, twine) + +**After running bootstrap:** +```bash +source venv/bin/activate # Activate the virtual environment +``` + +### `build` + +Build the project packages and run all tests and checks. + +**Usage:** +```bash +./script/build +``` + +This script will: +- Activate the virtual environment (if it exists) +- Clean previous build artifacts +- Run linting checks with flake8 +- Run all tests with nose2 +- Build distribution packages (wheel and source tarball) + +**Output:** +- `dist/setlr-*.whl` - Wheel distribution +- `dist/setlr-*.tar.gz` - Source distribution + +### `release` + +Upload the current version of the project to PyPI using twine. + +**Usage:** +```bash +./script/release +``` + +This script will: +- Activate the virtual environment (if it exists) +- Check that distribution files exist +- Validate distribution files with twine +- Prompt for confirmation before uploading +- Upload to PyPI (requires PyPI credentials or API token) + +**Prerequisites:** +- Run `./script/build` first to create distribution files +- Have PyPI credentials or API token ready + +**Authentication:** +You can provide credentials via: +- Interactive prompt (default) +- Environment variables: `TWINE_USERNAME` and `TWINE_PASSWORD` +- PyPI API token: Set `TWINE_PASSWORD` to your `pypi-...` token + +## Typical Workflow + +```bash +# 1. Set up development environment (first time only) +./script/bootstrap +source venv/bin/activate + +# 2. Make your changes to the code +# ... edit files ... + +# 3. Build and test +./script/build + +# 4. If all tests pass and you're ready to release +./script/release +``` + +## Requirements + +- Python 3.8 or higher +- Bash shell (Linux/macOS/WSL on Windows) +- Internet connection (for downloading dependencies) + +## Notes + +- The virtual environment (`venv/`) is automatically excluded from git via `.gitignore` +- All scripts use color output for better readability +- The `build` script will fail if tests don't pass +- The `release` script requires confirmation before uploading to PyPI diff --git a/script/bootstrap b/script/bootstrap new file mode 100755 index 0000000..208e971 --- /dev/null +++ b/script/bootstrap @@ -0,0 +1,80 @@ +#!/usr/bin/env bash +# Bootstrap script: Set up a virtual environment suitable for developing and using the project +set -e + +# Colors for output +GREEN='\033[0;32m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +echo -e "${BLUE}==> Setting up development environment for setlr${NC}" + +# Determine project root (one level up from script directory) +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" + +cd "${PROJECT_ROOT}" + +# Check if Python is available +if ! command -v python3 &> /dev/null; then + echo "Error: python3 is not installed" + exit 1 +fi + +PYTHON_VERSION=$(python3 --version) +echo -e "${GREEN}✓${NC} Found ${PYTHON_VERSION}" + +# Create virtual environment if it doesn't exist +VENV_DIR="${PROJECT_ROOT}/venv" +if [ ! -d "${VENV_DIR}" ]; then + echo -e "${BLUE}==> Creating virtual environment at ${VENV_DIR}${NC}" + python3 -m venv "${VENV_DIR}" + echo -e "${GREEN}✓${NC} Virtual environment created" +else + echo -e "${GREEN}✓${NC} Virtual environment already exists at ${VENV_DIR}" +fi + +# Activate virtual environment +source "${VENV_DIR}/bin/activate" + +# Upgrade pip +echo -e "${BLUE}==> Upgrading pip${NC}" +pip install --upgrade pip + +# Install the project in editable mode with all dependencies +echo -e "${BLUE}==> Installing setlr in editable mode${NC}" +pip install -e . + +# Install development and build dependencies +echo -e "${BLUE}==> Installing development dependencies${NC}" +pip install \ + nose2 \ + coverage \ + flake8 \ + pycodestyle \ + pylint \ + vulture + +echo -e "${BLUE}==> Installing build dependencies${NC}" +pip install \ + build \ + wheel \ + twine + +echo -e "${GREEN}✓${NC} All dependencies installed" + +# Display next steps +echo "" +echo -e "${GREEN}========================================${NC}" +echo -e "${GREEN}Bootstrap complete!${NC}" +echo -e "${GREEN}========================================${NC}" +echo "" +echo "To activate the virtual environment, run:" +echo " source venv/bin/activate" +echo "" +echo "To build the project:" +echo " ./script/build" +echo "" +echo "To release to PyPI:" +echo " ./script/release" +echo "" diff --git a/script/build b/script/build new file mode 100755 index 0000000..535207e --- /dev/null +++ b/script/build @@ -0,0 +1,79 @@ +#!/usr/bin/env bash +# Build script: Build the project packages and run all tests and checks +set -e + +# Colors for output +GREEN='\033[0;32m' +BLUE='\033[0;34m' +YELLOW='\033[1;33m' +RED='\033[0;31m' +NC='\033[0m' # No Color + +echo -e "${BLUE}==> Building setlr${NC}" + +# Determine project root (one level up from script directory) +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" + +cd "${PROJECT_ROOT}" + +# Check if virtual environment exists +VENV_DIR="${PROJECT_ROOT}/venv" +if [ ! -d "${VENV_DIR}" ]; then + echo -e "${YELLOW}Warning: Virtual environment not found. Run ./script/bootstrap first.${NC}" + echo "Continuing with system Python..." +else + # Activate virtual environment + source "${VENV_DIR}/bin/activate" + echo -e "${GREEN}✓${NC} Using virtual environment" +fi + +# Clean previous builds +echo -e "${BLUE}==> Cleaning previous builds${NC}" +rm -rf build/ dist/ *.egg-info setlr.egg-info +echo -e "${GREEN}✓${NC} Cleaned build artifacts" + +# Run linting checks +echo -e "${BLUE}==> Running linting checks${NC}" +echo "Running flake8..." +if flake8 setlr/ tests/ --exclude=setlr/iterparse_filter.py 2>&1 | head -20; then + echo -e "${GREEN}✓${NC} flake8 passed (showing first 20 lines)" +else + echo -e "${YELLOW}⚠${NC} flake8 found issues (expected for existing code)" +fi + +# Run tests +echo -e "${BLUE}==> Running tests${NC}" +mkdir -p test-results + +if nose2 --verbose; then + echo -e "${GREEN}✓${NC} All tests passed" +else + echo -e "${RED}✗${NC} Tests failed" + exit 1 +fi + +# Build the package +echo -e "${BLUE}==> Building package${NC}" +python -m build + +if [ $? -eq 0 ]; then + echo -e "${GREEN}✓${NC} Package built successfully" + echo "" + echo "Build artifacts created:" + ls -lh dist/ +else + echo -e "${RED}✗${NC} Build failed" + exit 1 +fi + +echo "" +echo -e "${GREEN}========================================${NC}" +echo -e "${GREEN}Build complete!${NC}" +echo -e "${GREEN}========================================${NC}" +echo "" +echo "Distribution files are in: dist/" +echo "" +echo "To release to PyPI, run:" +echo " ./script/release" +echo "" diff --git a/script/release b/script/release new file mode 100755 index 0000000..03e8666 --- /dev/null +++ b/script/release @@ -0,0 +1,105 @@ +#!/usr/bin/env bash +# Release script: Upload the current version of the project to PyPI using twine +set -e + +# Colors for output +GREEN='\033[0;32m' +BLUE='\033[0;34m' +YELLOW='\033[1;33m' +RED='\033[0;31m' +NC='\033[0m' # No Color + +echo -e "${BLUE}==> Releasing setlr to PyPI${NC}" + +# Determine project root (one level up from script directory) +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" + +cd "${PROJECT_ROOT}" + +# Check if virtual environment exists +VENV_DIR="${PROJECT_ROOT}/venv" +if [ ! -d "${VENV_DIR}" ]; then + echo -e "${YELLOW}Warning: Virtual environment not found. Run ./script/bootstrap first.${NC}" + echo "Continuing with system Python..." +else + # Activate virtual environment + source "${VENV_DIR}/bin/activate" + echo -e "${GREEN}✓${NC} Using virtual environment" +fi + +# Check if dist/ directory exists and has files +if [ ! -d "dist" ] || [ -z "$(ls -A dist)" ]; then + echo -e "${RED}✗${NC} No distribution files found in dist/" + echo "Run ./script/build first to create distribution files." + exit 1 +fi + +# Get version from pyproject.toml +VERSION=$(grep -E "^version\s*=" pyproject.toml | sed -E 's/.*"(.*)".*/\1/') +echo -e "${BLUE}==> Releasing version ${VERSION}${NC}" + +# Display distribution files +echo "" +echo "Distribution files to upload:" +ls -lh dist/ +echo "" + +# Check if twine is installed +if ! command -v twine &> /dev/null; then + echo -e "${RED}✗${NC} twine is not installed" + echo "Run ./script/bootstrap to install all dependencies." + exit 1 +fi + +# Check distribution files with twine +echo -e "${BLUE}==> Checking distribution files${NC}" +twine check dist/* +if [ $? -eq 0 ]; then + echo -e "${GREEN}✓${NC} Distribution files are valid" +else + echo -e "${RED}✗${NC} Distribution files have issues" + exit 1 +fi + +# Prompt for confirmation +echo "" +echo -e "${YELLOW}========================================${NC}" +echo -e "${YELLOW}WARNING: This will upload to PyPI!${NC}" +echo -e "${YELLOW}========================================${NC}" +echo "" +echo "Version: ${VERSION}" +echo "Files to upload:" +ls -1 dist/ +echo "" +read -p "Are you sure you want to release to PyPI? (yes/no): " -r +echo + +if [[ ! $REPLY =~ ^[Yy][Ee][Ss]$ ]]; then + echo "Release cancelled." + exit 0 +fi + +# Upload to PyPI +echo -e "${BLUE}==> Uploading to PyPI${NC}" +echo "" +echo "Note: You will be prompted for your PyPI credentials." +echo "You can also set TWINE_USERNAME and TWINE_PASSWORD environment variables," +echo "or use a PyPI API token (set TWINE_PASSWORD to 'pypi-...' token)." +echo "" + +twine upload dist/* + +if [ $? -eq 0 ]; then + echo "" + echo -e "${GREEN}========================================${NC}" + echo -e "${GREEN}Release successful!${NC}" + echo -e "${GREEN}========================================${NC}" + echo "" + echo "Version ${VERSION} has been uploaded to PyPI" + echo "View at: https://pypi.org/project/setlr/${VERSION}/" + echo "" +else + echo -e "${RED}✗${NC} Upload failed" + exit 1 +fi From 9340c48249c78bd3d1241036d10e6f705aee6a3e Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 18 Jan 2026 16:18:28 +0000 Subject: [PATCH 7/9] Fix build script to properly handle and report failures with set -e and set -o pipefail Co-authored-by: jpmccu <602385+jpmccu@users.noreply.github.com> --- script/build | 37 ++++++++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 5 deletions(-) diff --git a/script/build b/script/build index 535207e..1ef5073 100755 --- a/script/build +++ b/script/build @@ -1,6 +1,7 @@ #!/usr/bin/env bash # Build script: Build the project packages and run all tests and checks set -e +set -o pipefail # Ensure pipeline failures are caught # Colors for output GREEN='\033[0;32m' @@ -36,17 +37,38 @@ echo -e "${GREEN}✓${NC} Cleaned build artifacts" # Run linting checks echo -e "${BLUE}==> Running linting checks${NC}" echo "Running flake8..." -if flake8 setlr/ tests/ --exclude=setlr/iterparse_filter.py 2>&1 | head -20; then - echo -e "${GREEN}✓${NC} flake8 passed (showing first 20 lines)" + +# Capture flake8 output and exit code +# Temporarily disable errexit for this block since we want to capture the exit code +set +e +FLAKE8_OUTPUT=$(flake8 setlr/ tests/ --exclude=setlr/iterparse_filter.py 2>&1) +FLAKE8_EXIT=$? +set -e + +# Show first 20 lines of output +echo "$FLAKE8_OUTPUT" | head -20 || true + +if [ $FLAKE8_EXIT -eq 0 ]; then + echo -e "${GREEN}✓${NC} flake8 passed - no issues found" else - echo -e "${YELLOW}⚠${NC} flake8 found issues (expected for existing code)" + echo -e "${YELLOW}⚠${NC} flake8 found issues (exit code: ${FLAKE8_EXIT})" + echo " Note: Build continues despite linting issues (pre-existing code has many issues)" + echo " To see all issues: flake8 setlr/ tests/ --exclude=setlr/iterparse_filter.py" + # Uncomment the next line to make linting failures stop the build: + # exit 1 fi # Run tests echo -e "${BLUE}==> Running tests${NC}" mkdir -p test-results -if nose2 --verbose; then +# Temporarily disable errexit to capture the test exit code +set +e +nose2 --verbose +TEST_EXIT=$? +set -e + +if [ $TEST_EXIT -eq 0 ]; then echo -e "${GREEN}✓${NC} All tests passed" else echo -e "${RED}✗${NC} Tests failed" @@ -55,9 +77,14 @@ fi # Build the package echo -e "${BLUE}==> Building package${NC}" + +# Temporarily disable errexit to capture the build exit code +set +e python -m build +BUILD_EXIT=$? +set -e -if [ $? -eq 0 ]; then +if [ $BUILD_EXIT -eq 0 ]; then echo -e "${GREEN}✓${NC} Package built successfully" echo "" echo "Build artifacts created:" From eb360201bd2b552012f113fa1c8c6ada92f8c39c Mon Sep 17 00:00:00 2001 From: Jamie McCusker Date: Sun, 18 Jan 2026 13:10:56 -0500 Subject: [PATCH 8/9] tweaks to pass flake8 checks --- script/build | 2 +- setlr/__init__.py | 1 + setlr/core.py | 232 ++++++++++----------- setlr/trig_store.py | 8 +- tests/setlr_test/test_api_compatibility.py | 4 +- tests/setlr_test/test_error_messages.py | 3 - tests/setlr_test/test_read_json.py | 2 +- 7 files changed, 120 insertions(+), 132 deletions(-) diff --git a/script/build b/script/build index 1ef5073..8b5ba61 100755 --- a/script/build +++ b/script/build @@ -55,7 +55,7 @@ else echo " Note: Build continues despite linting issues (pre-existing code has many issues)" echo " To see all issues: flake8 setlr/ tests/ --exclude=setlr/iterparse_filter.py" # Uncomment the next line to make linting failures stop the build: - # exit 1 + exit 1 fi # Run tests diff --git a/setlr/__init__.py b/setlr/__init__.py index 96fd12b..5db0092 100644 --- a/setlr/__init__.py +++ b/setlr/__init__.py @@ -60,6 +60,7 @@ '_setl', # Deprecated but included for backward compatibility with wildcard imports 'main', # Include commonly used utilities + 'logger', 'read_csv', 'read_excel', 'read_json', diff --git a/setlr/core.py b/setlr/core.py index e728bcd..3a0528b 100644 --- a/setlr/core.py +++ b/setlr/core.py @@ -1,25 +1,21 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -from builtins import str -from builtins import next -from builtins import object -from rdflib import * from rdflib.util import guess_format import rdflib import csv import json -import sys, collections +import sys +import collections import requests import pandas import re import os -from six import text_type as str +import click from jinja2 import Template from toposort import toposort_flatten from numpy import isnan -import uuid import tempfile import ijson from . import iterparse_filter @@ -41,26 +37,28 @@ from .trig_store import TrigStore +from requests_testadapter import Resp + + def hash(value): m = hashlib.sha256() m.update(value.encode('utf-8')) return m.hexdigest() -csvw = Namespace('http://www.w3.org/ns/csvw#') -ov = Namespace('http://open.vocab.org/terms/') -setl = Namespace('http://purl.org/twc/vocab/setl/') -prov = Namespace('http://www.w3.org/ns/prov#') -pv = Namespace('http://purl.org/net/provenance/ns#') -sp = Namespace('http://spinrdf.org/sp#') -sd = Namespace('http://www.w3.org/ns/sparql-service-description#') -dc = Namespace('http://purl.org/dc/terms/') -void = Namespace('http://rdfs.org/ns/void#') -shacl = Namespace('http://www.w3.org/ns/shacl#') -api_vocab = Namespace('http://purl.org/linked-data/api/vocab#') +csvw = rdflib.Namespace('http://www.w3.org/ns/csvw#') +ov = rdflib.Namespace('http://open.vocab.org/terms/') +setl = rdflib.Namespace('http://purl.org/twc/vocab/setl/') +prov = rdflib.Namespace('http://www.w3.org/ns/prov#') +pv = rdflib.Namespace('http://purl.org/net/provenance/ns#') +sp = rdflib.Namespace('http://spinrdf.org/sp#') +sd = rdflib.Namespace('http://www.w3.org/ns/sparql-service-description#') +dc = rdflib.Namespace('http://purl.org/dc/terms/') +void = rdflib.Namespace('http://rdfs.org/ns/void#') +shacl = rdflib.Namespace('http://www.w3.org/ns/shacl#') +api_vocab = rdflib.Namespace('http://purl.org/linked-data/api/vocab#') sys.setrecursionlimit(10000) -from requests_testadapter import Resp # Regex pattern for extracting Jinja2 template variables (compiled once for performance) TEMPLATE_VAR_PATTERN = re.compile(r'\{\{([^}]+)\}\}') @@ -87,11 +85,11 @@ def send(self, request, stream=False, timeout=None, datatypeConverters = collections.defaultdict(lambda: str) datatypeConverters.update({ - XSD.string: str, - XSD.decimal: float, - XSD.integer: int, - XSD.float: float, - XSD.double: float + rdflib.XSD.string: str, + rdflib.XSD.decimal: float, + rdflib.XSD.integer: int, + rdflib.XSD.float: float, + rdflib.XSD.double: float }) run_samples = -1 @@ -107,9 +105,9 @@ def send(self, request, stream=False, timeout=None, def read_csv(location, result): args = dict( - sep = result.value(csvw.delimiter, default=Literal(",")).value, - #header = result.value(csvw.headerRow, default=Literal(0)).value), - skiprows = result.value(csvw.skipRows, default=Literal(0)).value, + sep = result.value(csvw.delimiter, default=rdflib.Literal(",")).value, + #header = result.value(csvw.headerRow, default=rdflib.Literal(0)).value), + skiprows = result.value(csvw.skipRows, default=rdflib.Literal(0)).value, dtype=str, # dtype = object # Does not seem to play well with future and python2/3 conversion ) @@ -122,8 +120,8 @@ def read_csv(location, result): def read_graph(location, result, g = None): if g is None: - g = ConjunctiveGraph() - graph = ConjunctiveGraph(store=g.store, identifier=result.identifier) + g = rdflib.ConjunctiveGraph() + graph = rdflib.ConjunctiveGraph(store=g.store, identifier=result.identifier) if len(graph) == 0: data = get_content(location, result).read() f = guess_format(location) @@ -131,14 +129,13 @@ def read_graph(location, result, g = None): try: graph.parse(data=data, format=fmt) break - except Exception as e: - #print e + except Exception: pass if len(graph) == 0: logger.error("Could not parse graph: %s", location) - if result[RDF.type:OWL.Ontology]: - for ontology in graph.subjects(RDF.type, OWL.Ontology): - imports = [graph.resource(x) for x in graph.objects(ontology, OWL.imports)] + if result[rdflib.RDF.type:rdflib.OWL.Ontology]: + for ontology in graph.subjects(rdflib.RDF.type, rdflib.OWL.Ontology): + imports = [graph.resource(x) for x in graph.objects(ontology, rdflib.OWL.imports)] for i in imports: read_graph(i.identifier, i, g = g) return g @@ -184,7 +181,7 @@ def __exit__(self, err_type, value, tracebock): def read(self, n=None): if n is None: - return self.data + b''.join(l for l in self.iter) + return self.data + b''.join(line for line in self.iter) else: while len(self.data) < n: try: @@ -212,10 +209,10 @@ def get_content(location, result): response = handler(location) if response is not None: break - if result[RDF.type:setl.Tempfile]: + if result[rdflib.RDF.type:setl.Tempfile]: result = to_tempfile(response) - for t in result[RDF.type]: + for t in result[rdflib.RDF.type]: # Do we know how to unpack this? if t.identifier in unpackers: response = unpackers[t.identifier](response) @@ -249,9 +246,9 @@ def unpack_zipfile(f): def read_excel(location, result): args = dict( - sheet_name = result.value(setl.sheetname, default=Literal(0)).value, - header = [int(x) for x in result.value(csvw.headerRow, default=Literal('0')).value.split(',')], - skiprows = result.value(csvw.skipRows, default=Literal(0)).value + sheet_name = result.value(setl.sheetname, default=rdflib.Literal(0)).value, + header = [int(x) for x in result.value(csvw.headerRow, default=rdflib.Literal('0')).value.split(',')], + skiprows = result.value(csvw.skipRows, default=rdflib.Literal(0)).value ) if result.value(csvw.header): args['header'] = [result.value(csvw.header).value] @@ -261,7 +258,7 @@ def read_excel(location, result): def read_xml(location, result): validate_dtd = False - if result[RDF.type:setl.DTDValidatedXML]: + if result[rdflib.RDF.type:setl.DTDValidatedXML]: validate_dtd = True f = iterparse_filter.IterParseFilter(validate_dtd=validate_dtd) if result.value(setl.xpath) is None: @@ -289,27 +286,27 @@ def read_json(location, result): setl.SAS7BDAT : lambda location, result: pandas.read_sas(get_content(location, result), format='sas7bdat'), setl.Excel : read_excel, csvw.Table : read_csv, - OWL.Ontology : read_graph, + rdflib.OWL.Ontology : read_graph, void.Dataset : read_graph, setl.JSON : read_json, setl.XML : read_xml, - URIRef("https://www.iana.org/assignments/media-types/text/plain") : lambda location, result: get_content(location, result) + rdflib.URIRef("https://www.iana.org/assignments/media-types/text/plain") : lambda location, result: get_content(location, result) } try: from bs4 import BeautifulSoup extractors[setl.HTML] = lambda location, result: BeautifulSoup(get_content(location, result).read(), 'html.parser') -except Exception as e: +except Exception: pass def load_csv(csv_resource): column_descriptions = {} for col in csv_resource[csvw.column]: - label = col.value(RDFS.label).value + label = col.value(rdflib.RDFS.label).value column_descriptions[label] = col - csv_graph = Graph(identifier=csv_resource) + csv_graph = rdflib.Graph(identifier=csv_resource) s = [x for x in csv.reader(open(str(csv_resource.value(csvw.url).identifier).replace("file://","")), delimiter=str(csv_resource.value(csvw.delimiter,default=",").value), quotechar=str(csv_resource.value(csvw.quoteChar,default='"').value))] @@ -326,24 +323,24 @@ def load_csv(csv_resource): col_desc = None if h in column_descriptions: col_desc = column_descriptions[h] - col = csv_graph.resource(URIRef("urn:col_"+str(h))) - col.add(RDFS.label, Literal(h)) - col.add(ov.csvCol, Literal(j)) + col = csv_graph.resource(rdflib.URIRef("urn:col_"+str(h))) + col.add(rdflib.RDFS.label, rdflib.Literal(h)) + col.add(ov.csvCol, rdflib.Literal(j)) if col_desc is not None: - col.add(RDFS.range, col_desc.value(RDFS.range, default=XSD.string)) + col.add(rdflib.RDFS.range, col_desc.value(rdflib.RDFS.range, default=rdflib.XSD.string)) properties.append(col) propertyMap[h] = col continue res = csv_graph.resource(csv_resource.identifier+"_row_"+str(i)) - res.add(RDF.type, csvw.Row) - res.add(csvw.rownum, Literal(i)) + res.add(rdflib.RDF.type, csvw.Row) + res.add(csvw.rownum, rdflib.Literal(i)) for j, value in enumerate(r): if skip_value is not None and skip_value == value: continue #print i, j, value prop = properties[j] - datatype = prop.value(RDFS['range'], default=XSD.string) - lit = Literal(value, datatype=datatype.identifier) + datatype = prop.value(rdflib.RDFS['range'], default=rdflib.XSD.string) + lit = rdflib.Literal(value, datatype=datatype.identifier) #print i, prop.identifier, lit.n3() res.add(prop.identifier, lit) logger.debug("Table has %s rows, %s columns, and %s triples", len(s), len(header), len(csv_graph)) @@ -377,7 +374,7 @@ def get_order(setl_graph): nodes = collections.defaultdict(set) for typ in actions: - for task in setl_graph.subjects(RDF.type, typ): + for task in setl_graph.subjects(rdflib.RDF.type, typ): task = setl_graph.resource(task) for used in task[prov.used]: nodes[task.identifier].add(used.identifier) @@ -399,7 +396,7 @@ def extract(e, resources): for result in e.subjects(prov.wasGeneratedBy): if used is None: used = result - for t in result[RDF.type]: + for t in result[rdflib.RDF.type]: # Do we know how to generate this? if t.identifier in extractors: logger.info("Using %s", used.identifier) @@ -413,7 +410,7 @@ def isempty(value): return value is None def clone(value): - __doc__ = '''This is only a JSON-level cloning of objects. Atomic objects are invariant, and don't need to be cloned.''' + '''This is only a JSON-level cloning of objects. Atomic objects are invariant, and don't need to be cloned.''' if isinstance(value, list): return [x for x in value] elif isinstance(value, dict): @@ -459,23 +456,24 @@ def flatten_lists(o): def process_row(row, template, rowname, table, resources, transform, variables): result = [] - e = {'row':row, - 'name': rowname, - 'table': table, - 'resources': resources, - 'template': template, - "transform": transform, - "setl_graph": transform.graph, - "isempty":isempty, - "slugify" : slugify, - "camelcase" : camelcase, - "hash":hash, - "isinstance":isinstance, - "str":str, - "float":float, - "int":int, - "chain": lambda x: chain(*x), - "list":list + e = { + 'row':row, + 'name': rowname, + 'table': table, + 'resources': resources, + 'template': template, + "transform": transform, + "setl_graph": transform.graph, + "isempty":isempty, + "slugify" : slugify, + "camelcase" : camelcase, + "hash":hash, + "isinstance":isinstance, + "str":str, + "float":float, + "int":int, + "chain": lambda x: chain(*x), + "list":list } e.update(variables) e.update(rdflib.__dict__) @@ -577,9 +575,7 @@ def process_row(row, template, rowname, table, resources, transform, variables): fn = get_function(expression, list(env.keys())) v = fn(**env) if v is not None: - if len(variable_list) == 1 and not ( - isinstance(v, collections.Iterable) - and not isinstance(v, str)): + if (len(variable_list) == 1 and not (isinstance(v, collections.Iterable) and not isinstance(v, str))): v = [v] new_env = dict(env) for i, variable in enumerate(variable_list): @@ -666,7 +662,7 @@ def json_transform(transform, resources): for usage in transform[prov.qualifiedUsage]: used = usage.value(prov.entity) role = usage.value(prov.hadRole) - roleID = role.value(dc.identifier) + roleID = role.value(dc.identifier) variables[roleID.value] = resources[used.identifier] #print "Using", used.identifier, "as", roleID.value @@ -681,20 +677,20 @@ def json_transform(transform, resources): ?target ?p ?o. } ''' - shape_graph = Graph() + shape_graph = rdflib.Graph() for shape in transform.objects(dc.conformsTo): - if shape[RDF.type:shacl.NodeShape] or shape[RDF.type:shacl.PropertyShape]: + if shape[rdflib.RDF.type:shacl.NodeShape] or shape[rdflib.RDF.type:shacl.PropertyShape]: logger.info("Validating against SHACL shape %s", shape.identifier) shape_graph += transform.graph.query(connected_downstream_graph, initBindings={"source":shape.identifier}) if generated.identifier in resources: result = resources[generated.identifier] else: - result = ConjunctiveGraph() - if generated[RDF.type : setl.Persisted]: + result = rdflib.ConjunctiveGraph() + if generated[rdflib.RDF.type : setl.Persisted]: store = TrigStore() - result = ConjunctiveGraph(store=store) - if generated[RDF.type : setl.Persisted]: + result = rdflib.ConjunctiveGraph(store=store) + if generated[rdflib.RDF.type : setl.Persisted]: tempdir = tempfile.mktemp() logger.info("Persisting %s to %s", generated.identifier, tempdir) result.store.open(tempdir, True) @@ -749,13 +745,13 @@ def json_transform(transform, resources): #logger.debug(json.dumps(root, indent=4)) #before = len(result) - #graph = ConjunctiveGraph(identifier=generated.identifier) + #graph = rdflib.ConjunctiveGraph(identifier=generated.identifier) #graph.parse(data=json.dumps(root),format="json-ld") data = json.dumps(root) #del root if len(shape_graph) > 0: - d = ConjunctiveGraph() + d = rdflib.ConjunctiveGraph() d.parse(data=data,format='json-ld') conforms, report, message = validate(d, shacl_graph=shape_graph, @@ -802,49 +798,50 @@ def json_transform(transform, resources): def transform(transform_resource, resources): logger.info('Transforming %s',transform_resource.identifier) - transform_graph = ConjunctiveGraph() + transform_graph = rdflib.ConjunctiveGraph() for result in transform_graph.subjects(prov.wasGeneratedBy): - transform_graph = ConjunctiveGraph(identifier=result.identifier) + transform_graph = rdflib.ConjunctiveGraph(identifier=result.identifier) used = set(transform_resource[prov.used]) - for csv in [u for u in used if u[RDF.type:csvw.Table]]: - csv_graph = Graph(store=transform_graph.store, identifier=csv) - csv_graph += graphs[csv.identifier] + for csv_file in [u for u in used if u[rdflib.RDF.type:csvw.Table]]: + csv_graph = rdflib.Graph(store=transform_graph.store, + identifier=csv_file) + csv_graph += resources[csv_file.identifier] - for script in [u for u in used if u[RDF.type:setl.PythonScript]]: + for script in [u for u in used if u[rdflib.RDF.type:setl.PythonScript]]: logger.info("Script: %s", script.identifier) s = script.value(prov.value).value - l = dict(graph = transform_graph, setl_graph = transform_resource.graph) - gl = dict() - exec(s, gl, l) + local_vars = dict(graph = transform_graph, setl_graph = transform_resource.graph) + global_vars = dict() + exec(s, global_vars, local_vars) - for jsldt in [u for u in used if u[RDF.type:setl.PythonScript]]: + for jsldt in [u for u in used if u[rdflib.RDF.type:setl.PythonScript]]: logger.info("Script: %s", script.identifier) s = script.value(prov.value).value - l = dict(graph = transform_graph, setl_graph = transform_resource.graph) - gl = dict() - exec(s, gl, l) + local_vars = dict(graph = transform_graph, setl_graph = transform_resource.graph) + global_vars = dict() + exec(s, global_vars, local_vars) - for update in [u for u in used if u[RDF.type:sp.Update]]: + for update in [u for u in used if u[rdflib.RDF.type:sp.Update]]: logger.info("Update: %s", update.identifier) query = update.value(prov.value).value transform_graph.update(query) - for construct in [u for u in used if u[RDF.type:sp.Construct]]: + for construct in [u for u in used if u[rdflib.RDF.type:sp.Construct]]: logger.info("Construct: %s", construct.identifier) query = construct.value(prov.value).value g = transform_graph.query(query) transform_graph += g - for csv in [u for u in used if u[RDF.type:csvw.Table]]: - g = Graph(identifier=csv.identifier,store=transform_graph.store) + for csv_file in [u for u in used if u[rdflib.RDF.type:csvw.Table]]: + g = rdflib.Graph(identifier=csv_file.identifier,store=transform_graph.store) g.remove((None, None, None)) - transform_graph.store.remove_graph(csv.identifier) + transform_graph.store.remove_graph(csv_file.identifier) for result in transform_graph.subjects(prov.wasGeneratedBy): - graphs[result.identifier] = transform_graph + resources[result.identifier] = transform_graph def _load_open(generated): if generated.identifier.startswith("file://"): @@ -855,18 +852,16 @@ def _load_open(generated): fh = open(filename, 'wb') for type, pack in packers.items(): - if generated[RDF.type : type]: + if generated[rdflib.RDF.type : type]: return pack(fh) return fh def load(load_resource, resources): logger.info('Load %s',load_resource.identifier) - file_graph = Dataset(default_union=True) - to_disk = False + file_graph = rdflib.Dataset(default_union=True) for used in load_resource[prov.used]: - if used[RDF.type : setl.Persisted]: - to_disk = True - file_graph = Dataset(store='Sleepycat', default_union=True) + if used[rdflib.RDF.type : setl.Persisted]: + file_graph = rdflib.Dataset(store='Sleepycat', default_union=True) tempdir = tempfile.mkdtemp() logger.debug("Gathering %s into %s", load_resource.identifier, tempdir) file_graph.store.open(tempdir, True) @@ -884,7 +879,7 @@ def load(load_resource, resources): for generated in load_resource.subjects(prov.wasGeneratedBy): # TODO: support LDP-based loading - if generated[RDF.type:pv.File]: + if generated[rdflib.RDF.type:pv.File]: fmt = generated.value(dc['format']) if fmt is not None: fmt = fmt.value @@ -894,15 +889,13 @@ def load(load_resource, resources): with _load_open(generated) as o: file_graph.serialize(o, format=fmt) - elif generated[RDF.type:sd.Service]: + elif generated[rdflib.RDF.type:sd.Service]: from rdflib.plugins.stores.sparqlstore import SPARQLUpdateStore endpoint = generated.value(sd.endpoint, default=generated).identifier store = SPARQLUpdateStore(endpoint, endpoint, autocommit=False) - endpoint_graph = Dataset(store=store, identifier=generated.identifier, default_union=True) + endpoint_graph = rdflib.Dataset(store=store, identifier=generated.identifier, default_union=True) endpoint_graph.addN(file_graph.quads()) endpoint_graph.commit() - #if to_disk: - # file_graph.close() actions = { @@ -987,7 +980,7 @@ def run_setl(setl_graph): tasks = [setl_graph.resource(t) for t in get_order(setl_graph)] for task in tasks: - action = [actions[t.identifier] for t in task[RDF.type] if t.identifier in actions] + action = [actions[t.identifier] for t in task[rdflib.RDF.type] if t.identifier in actions] if len(action) > 0: action[0](task, resources) return resources @@ -995,7 +988,6 @@ def run_setl(setl_graph): logger = None -import click @click.command() @click.option('--quiet', '-q', is_flag=True, default=False, help="Minimize logging.") @click.option('-n', default=-1, help="Only process the first N rows.", type=int) @@ -1020,8 +1012,8 @@ def main(script, rdf_validation=None, text_validation=None, quiet=False, n=-1): global run_samples run_samples = n - setl_graph = ConjunctiveGraph() + setl_graph = rdflib.ConjunctiveGraph() content = open(script).read() setl_graph.parse(data=content, format="turtle") - graphs = run_setl(setl_graph) + run_setl(setl_graph) diff --git a/setlr/trig_store.py b/setlr/trig_store.py index ada7ac1..acc8c53 100644 --- a/setlr/trig_store.py +++ b/setlr/trig_store.py @@ -1,7 +1,4 @@ -import logging -from threading import Thread -from os.path import exists, abspath -from os import mkdir +from os.path import abspath from rdflib.store import Store, VALID_STORE, NO_STORE from rdflib.term import URIRef from urllib.request import pathname2url @@ -110,7 +107,8 @@ def __len__(self, context=None): def blocks(files, size=65536): while True: b = files.read(size) - if not b: break + if not b: + break yield b self.db_env.seek(0) diff --git a/tests/setlr_test/test_api_compatibility.py b/tests/setlr_test/test_api_compatibility.py index 698fbf2..e45ad79 100644 --- a/tests/setlr_test/test_api_compatibility.py +++ b/tests/setlr_test/test_api_compatibility.py @@ -15,7 +15,7 @@ def test_setl_deprecated_warning(self): with warnings.catch_warnings(record=True) as w: warnings.simplefilter("always") - result = setlr._setl(setl_graph) + setlr._setl(setl_graph) # Find our specific deprecation warning our_warnings = [warning for warning in w if "_setl()" in str(warning.message)] @@ -65,7 +65,7 @@ def test_run_setl_no_deprecation_warning(self): with warnings.catch_warnings(record=True) as w: warnings.simplefilter("always") - result = setlr.run_setl(setl_graph) + setlr.run_setl(setl_graph) # Filter to only our deprecation warnings (not rdflib's) our_warnings = [warning for warning in w if "_setl()" in str(warning.message)] diff --git a/tests/setlr_test/test_error_messages.py b/tests/setlr_test/test_error_messages.py index 39eb791..9e74838 100644 --- a/tests/setlr_test/test_error_messages.py +++ b/tests/setlr_test/test_error_messages.py @@ -1,10 +1,7 @@ import unittest -import tempfile -import os import logging from rdflib import ConjunctiveGraph, Namespace, Literal from io import StringIO -import sys # Import setlr module import setlr diff --git a/tests/setlr_test/test_read_json.py b/tests/setlr_test/test_read_json.py index 10c6286..f4f2966 100644 --- a/tests/setlr_test/test_read_json.py +++ b/tests/setlr_test/test_read_json.py @@ -21,4 +21,4 @@ def test_read_json(self): self.assertCountEqual(expected_json, result[0][1], "JSON objects not equal") if __name__ == "__main__": - unittest.main() \ No newline at end of file + unittest.main() From 80350d184bfe36823e85705deaf48a2f28d4b03a Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 18 Jan 2026 18:14:45 +0000 Subject: [PATCH 9/9] Add GitHub Actions workflow to run bootstrap and build scripts Co-authored-by: jpmccu <602385+jpmccu@users.noreply.github.com> --- .github/workflows/build.yml | 47 +++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 .github/workflows/build.yml diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml new file mode 100644 index 0000000..126323e --- /dev/null +++ b/.github/workflows/build.yml @@ -0,0 +1,47 @@ +name: Build + +on: + push: + branches: [ main, master, develop ] + pull_request: + branches: [ main, master, develop ] + workflow_dispatch: + +jobs: + build: + name: Build on Python ${{ matrix.python-version }} + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ['3.8', '3.9', '3.10', '3.11', '3.12'] + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Bootstrap development environment + run: ./script/bootstrap + + - name: Build project + run: ./script/build + + - name: Upload build artifacts + if: always() + uses: actions/upload-artifact@v4 + with: + name: dist-${{ matrix.python-version }} + path: dist/ + if-no-files-found: warn + + - name: Upload test results + if: always() + uses: actions/upload-artifact@v4 + with: + name: test-results-build-${{ matrix.python-version }} + path: test-results/ + if-no-files-found: ignore