diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..1cff235 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,72 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [Unreleased] + +## [1.0.2] - 2026-01-18 + +### Changed +- Migrated from `setup.py` to `pyproject.toml` following PEP 517/518 standards for modern Python packaging +- Restructured codebase: moved implementation from `setlr/__init__.py` to `setlr/core.py` (~1020 lines) +- `setlr/__init__.py` now serves as a clean public API interface (~90 lines) + +### Added +- New public API function `run_setl()` with comprehensive documentation and type hints +- Proper deprecation warning for `_setl()` function (still available for backward compatibility) +- Improved error messages for NaN/missing values (now displays `` instead of `nan`) +- Extended JSON error context from 4 to 8 lines before error for better debugging +- Comprehensive API documentation with usage examples +- Development scripts for bootstrap, build, and release +- GitHub Actions workflows for automated testing and linting +- Migration documentation (MIGRATION.md) + +### Fixed +- Improved error reporting for missing data scenarios +- Better context display for JSON syntax errors in templates +- Python version compatibility for JSON error handling + +## [1.0.1] - 2024-08-09 + +### Changed +- Moved version information from `_version.py` directly into `setup.py` +- Modified `setup.py` to support `--version` flag + +### Fixed +- Fixed SHACL constraint in ontology example (changed `sh:minCount` from 1 to 0 for `rdfs:subClassOf`) + +## [1.0.0] - 2024-04-29 + +### Added +- Initial stable release of setlr +- Core SETL (Semantic Extract, Transform, Load) functionality +- Support for generating RDF graphs from tabular data +- CLI 
tool via `setlr` command +- Data source readers: CSV, Excel, JSON, XML, and RDF graphs +- Template-based transformation using Jinja2 +- Named graph support via ConjunctiveGraph +- RDF namespaces: csvw, ov, setl, prov, pv, sp, sd, dc, void, shacl +- Utility functions: `extract()`, `transform()`, `load()`, `hash()`, `camelcase()` +- SHACL validation support with pyshacl[js] +- Python 3.8+ support +- Comprehensive test suite + +### Dependencies +- rdflib >= 6.0.0 +- pandas >= 0.23.0 +- jinja2 +- click (CLI support) +- tqdm (progress bars) +- pyshacl[js] (validation) +- beautifulsoup4, lxml (XML/HTML parsing) +- requests (HTTP support) +- toposort (dependency ordering) +- Other utility libraries: numpy, xlrd, ijson, python-slugify + +[Unreleased]: https://github.com/tetherless-world/setlr/compare/v1.0.2...HEAD +[1.0.2]: https://github.com/tetherless-world/setlr/compare/v1.0.1...v1.0.2 +[1.0.1]: https://github.com/tetherless-world/setlr/compare/v1.0.0...v1.0.1 +[1.0.0]: https://github.com/tetherless-world/setlr/releases/tag/v1.0.0 diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..7e3dbc8 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,33 @@ +# Include important files +include README.md +include LICENSE +include CHANGELOG.md +include MIGRATION.md +include pyproject.toml +include setup.py +include setup.cfg + +# Include example files +recursive-include example *.csv *.ttl *.setl.ttl + +# Exclude unwanted files and directories +global-exclude __pycache__ +global-exclude *.py[cod] +global-exclude *.so +global-exclude .DS_Store +global-exclude *.egg-info +recursive-exclude * __pycache__ +recursive-exclude * *.py[cod] + +# Exclude test files +prune tests +prune .github +prune .circleci +prune script +prune docs/_build + +# Exclude development files +exclude .gitignore +exclude .pylintrc +exclude unittest.cfg +exclude IMPROVEMENT_SUMMARY.md diff --git a/README.md b/README.md index 7146f04..87b1fce 100644 --- a/README.md +++ b/README.md @@ -1,18 +1,174 @@ -# 
setlr: The Semantic Extract, Transform and Load-er +# setlr: Semantic Extract, Transform and Load [![Unit Tests](https://github.com/tetherless-world/setlr/actions/workflows/test.yml/badge.svg)](https://github.com/tetherless-world/setlr/actions/workflows/test.yml) [![Lint](https://github.com/tetherless-world/setlr/actions/workflows/lint.yml/badge.svg)](https://github.com/tetherless-world/setlr/actions/workflows/lint.yml) -setlr is a tool for generating RDF graphs, including named graphs, from almost any kind of tabular data. +**SETLr** is a powerful Python tool for generating RDF graphs from tabular data using declarative SETL (Semantic Extract, Transform, Load) scripts. -# Installation +## Features -Simply check out the code, optionally create a python virtual environment, and install it using pip: +✨ **Multiple Data Sources**: CSV, Excel, JSON, XML, RDF, SAS files +🔄 **Flexible Transformations**: JSON-LD templates with Jinja2, Python functions, SPARQL +⚡ **High Performance**: Streaming XML parsing, pandas DataFrames, progress tracking +🐍 **Python Integration**: Use as library or CLI tool +✅ **Validation**: Built-in SHACL validation +📝 **Well Documented**: Comprehensive guides and API reference + +## Quick Start + +### Installation ```bash pip install setlr ``` -# Learning how to SETL +### Simple Example + +Create `data.csv`: +```csv +ID,Name,Email +1,Alice,alice@example.com +2,Bob,bob@example.com +``` + +Create `transform.setl.ttl`: +```turtle +@prefix setl: <http://purl.org/twc/vocab/setl/> . +@prefix prov: <http://www.w3.org/ns/prov#> . +@prefix csvw: <http://www.w3.org/ns/csvw#> . +@prefix void: <http://rdfs.org/ns/void#> . +@prefix : <http://example.com/> . + +:table a csvw:Table, setl:Table ; + prov:wasGeneratedBy [ a setl:Extract ; prov:used <data.csv> ] . + +:output a void:Dataset ; + prov:wasGeneratedBy [ + a setl:Transform, setl:JSLDT ; + prov:used :table ; + prov:value '''[{ + "@id": "http://example.com/person/{{row.ID}}", + "@type": "http://xmlns.com/foaf/0.1/Person", + "http://xmlns.com/foaf/0.1/name": "{{row.Name}}", + "http://xmlns.com/foaf/0.1/mbox": "mailto:{{row.Email}}" + }]''' + ] . 
+``` + +Run SETLr: +```bash +setlr transform.setl.ttl +``` + +### Using from Python + +```python +from rdflib import Graph, URIRef +import setlr + +# Load SETL script +setl_graph = Graph() +setl_graph.parse("transform.setl.ttl", format="turtle") + +# Execute ETL pipeline +resources = setlr.run_setl(setl_graph) + +# Access generated RDF +output = resources[URIRef('http://example.com/output')] +print(f"Generated {len(output)} RDF triples") +``` + +## Documentation + +📚 **[Complete Documentation](docs/README.md)** - Full guides and references + +**Quick Links:** +- [Tutorial](docs/tutorial.md) - Step-by-step guide to SETLr +- [JSLDT Template Language](docs/jsldt.md) - Transform syntax reference +- [Python API](docs/python-api.md) - Using SETLr from Python +- [Quick Start](docs/quickstart.md) - Get started in 5 minutes +- [Examples](docs/examples.md) - Real-world examples + +**Advanced Topics:** +- [Streaming XML with XPath](docs/streaming-xml.md) - Efficient large file processing +- [Python Functions](docs/python-functions.md) - Custom Python transforms +- [SPARQL Support](docs/sparql.md) - Query and update endpoints +- [SHACL Validation](docs/shacl.md) - Validate your RDF output + +## Key Concepts + +SETLr uses RDF (with PROV-O vocabulary) to describe ETL workflows: + +1. **Extract**: Load data from sources (CSV, Excel, JSON, XML, RDF, SAS) +2. **Transform**: Apply templates or Python scripts to generate RDF +3. 
**Load**: Save to files or SPARQL endpoints + +## Supported Formats + +**Input:** +- Tabular: CSV, TSV, Excel (XLS/XLSX), SAS (XPORT/SAS7BDAT) +- Structured: JSON (with ijson selectors), XML (with XPath streaming) +- Semantic: RDF (Turtle, JSON-LD, RDF/XML, etc.), OWL Ontologies + +**Output:** +- RDF: Turtle, TriG, N-Triples, N3, RDF/XML, JSON-LD +- Destinations: Files, SPARQL Update endpoints + +## Examples + +See the [examples/](example/) directory for complete working examples: + +- `social.setl.ttl` - Basic CSV to RDF with conditionals and loops +- `ontology.setl.ttl` - OWL ontology transformation with SHACL shapes + +## Development + +```bash +# Clone repository +git clone https://github.com/tetherless-world/setlr.git +cd setlr + +# Bootstrap (creates venv and installs dependencies) +./script/bootstrap + +# Activate virtual environment +source venv/bin/activate + +# Run tests +./script/build + +# Run linter +flake8 setlr/ +``` + +## Contributing + +Contributions are welcome! Please: +1. Fork the repository +2. Create a feature branch +3. Add tests for new functionality +4. Ensure all tests pass +5. Submit a pull request + +## License + +Apache License 2.0 - see [LICENSE](LICENSE) file for details. + +## Citation + +If you use SETLr in your research, please cite: + +```bibtex +@software{setlr, + title = {SETLr: Semantic Extract, Transform and Load}, + author = {McCusker, Jamie}, + year = {2024}, + url = {https://github.com/tetherless-world/setlr} +} +``` + +## Support -To learn how to use setlr please visit [the tutorial wiki page](https://github.com/tetherless-world/setlr/wiki/SETLr-Basics-Tutorial). 
+- 📖 [Documentation](docs/README.md) +- 🐛 [Issue Tracker](https://github.com/tetherless-world/setlr/issues) +- 💬 [Discussions](https://github.com/tetherless-world/setlr/discussions) diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 0000000..015d036 --- /dev/null +++ b/docs/README.md @@ -0,0 +1,59 @@ +# SETLr Documentation + +Welcome to the SETLr (Semantic Extract, Transform and Load-er) documentation! + +## Table of Contents + +1. [Quick Start](quickstart.md) +2. [Installation](installation.md) +3. [Tutorial](tutorial.md) +4. [JSLDT Template Language](jsldt.md) +5. [Python API](python-api.md) +6. [Advanced Features](advanced.md) + - [Streaming XML with XPath](streaming-xml.md) + - [Python Functions in Transforms](python-functions.md) + - [SPARQL Support](sparql.md) + - [SHACL Validation](shacl.md) +7. [Examples](examples.md) +8. [CLI Reference](cli.md) + +## What is SETLr? + +SETLr is a powerful tool for generating RDF graphs from tabular data sources. It uses declarative SETL (Semantic Extract, Transform, Load) scripts to: + +- **Extract** data from CSV, Excel, JSON, XML, and RDF sources +- **Transform** data using JSON-LD templates with Jinja2 templating +- **Load** results to files or SPARQL endpoints + +## Key Features + +- 📊 **Multiple Data Formats**: CSV, Excel, JSON, XML, RDF, SAS files +- 🔄 **Powerful Transformations**: JSON-LD templates with @if, @for, @with control structures +- 🐍 **Python Integration**: Call from Python code or use custom Python functions +- ⚡ **Streaming**: Efficient XML parsing for large files with XPath filtering +- ✅ **Validation**: Built-in SHACL validation support +- 🎯 **SPARQL**: Execute SPARQL queries and load to endpoints + +## Quick Example + +```python +from rdflib import Graph, URIRef +import setlr + +# Load your SETL script +setl_graph = Graph() +setl_graph.parse("my_script.setl.ttl", format="turtle") + +# Execute the ETL pipeline +resources = setlr.run_setl(setl_graph) + +# Access generated RDF +output_graph = 
resources[URIRef('http://example.com/output')] +``` + +## Learn More + +- New to SETLr? Start with the [Quick Start Guide](quickstart.md) +- Want to learn the basics? Follow the [Tutorial](tutorial.md) +- Need to write transforms? Check the [JSLDT Template Language](jsldt.md) +- Using Python? See the [Python API Documentation](python-api.md) diff --git a/docs/cli.md b/docs/cli.md new file mode 100644 index 0000000..c6a22e9 --- /dev/null +++ b/docs/cli.md @@ -0,0 +1,320 @@ +# Command-Line Interface (CLI) Reference + +Complete reference for the `setlr` command-line tool. + +## Synopsis + +```bash +setlr [OPTIONS] SCRIPT +``` + +## Description + +Execute a SETL script to perform Extract, Transform, and Load operations on data sources. + +## Arguments + +### SCRIPT + +Path to the SETL script file (Turtle format). + +```bash +setlr my_transform.setl.ttl +``` + +## Options + +### `--rdf-validation FILE` + +Validate output RDF against SHACL shapes. + +```bash +setlr transform.setl.ttl --rdf-validation shapes.ttl +``` + +**Details:** +- `FILE` should contain SHACL shapes in Turtle format +- Validation runs after transform but before load +- Non-conforming output generates warnings + +### `--text-validation FILE` + +Validate output against text-based validation rules. + +```bash +setlr transform.setl.ttl --text-validation rules.txt +``` + +### `--quiet, -q` + +Suppress progress bars and informational output. + +```bash +setlr transform.setl.ttl --quiet +``` + +Useful for: +- Running in scripts/automation +- Cleaner log output +- CI/CD pipelines + +### `-n, --samples N` + +Process only the first N rows of each table (for testing). + +```bash +setlr transform.setl.ttl -n 10 +``` + +Process first 10 rows only: +- Faster execution for testing +- Verify template logic +- Debug issues with specific rows + +Use `-n -1` to process all rows (default). + +### `--help` + +Show help message and exit. 
+ +```bash +setlr --help +``` + +## Exit Codes + +| Code | Meaning | +|------|---------| +| 0 | Success | +| 1 | Error (invalid script, transform failure, etc.) | + +## Examples + +### Basic Usage + +```bash +# Run SETL script +setlr social.setl.ttl +``` + +### Test with Sample Data + +```bash +# Process only first 5 rows +setlr large_dataset.setl.ttl -n 5 +``` + +### Quiet Mode for Scripts + +```bash +#!/bin/bash +# automation script +if setlr --quiet transform.setl.ttl; then + echo "Transform successful" +else + echo "Transform failed" + exit 1 +fi +``` + +### With SHACL Validation + +```bash +# Validate output against shapes +setlr transform.setl.ttl --rdf-validation shapes.ttl +``` + +## Input Files + +### SETL Script Format + +SETL scripts must be valid RDF in Turtle format: + +```turtle +@prefix setl: . +@prefix prov: . + +# Extract, Transform, Load definitions... +``` + +### Data Files + +Data files are referenced in the SETL script: + +```turtle +:table a setl:Table ; + prov:wasGeneratedBy [ + a setl:Extract ; + prov:used ; # Relative or absolute path + ] . +``` + +Paths can be: +- **Relative**: `` (relative to SETL script) +- **Absolute**: `` +- **File URL**: `` +- **HTTP URL**: `` + +## Output Files + +Output files are defined in Load activities: + +```turtle + a pv:File ; + dcterms:format "text/turtle" ; + prov:wasGeneratedBy [ + a setl:Load ; + prov:used :graph ; + ] . +``` + +## Environment Variables + +### `SETLR_LOG_LEVEL` + +Set logging level: + +```bash +export SETLR_LOG_LEVEL=DEBUG +setlr transform.setl.ttl +``` + +Valid levels: DEBUG, INFO, WARNING, ERROR, CRITICAL + +## Logging + +SETLr logs to stderr with the following levels: + +- **INFO**: Progress messages, row counts +- **WARNING**: Non-fatal issues (empty results, etc.) 
+- **ERROR**: Transform failures, template errors + +### Example Log Output + +``` +INFO:setlr:Extracting data from data.csv +100%|██████████| 1000/1000 [00:02<00:00, 456.78it/s] +INFO:setlr:Transforming table with 1000 rows +INFO:setlr:Generated 5000 triples +INFO:setlr:Loading to output.ttl +``` + +## Error Messages + +SETLr provides detailed error messages for common issues: + +### Template Error + +``` +ERROR:setlr:Error rendering template: 'NoneType' object has no attribute 'split' +ERROR:setlr:Row data: {'ID': '3', 'Name': 'Alice', 'Friends': ''} +ERROR:setlr:Template context: +ERROR:setlr:>>> 5: "@for": "f in row.Friends.split(';')", +``` + +### File Not Found + +``` +ERROR:setlr:Cannot read file: data.csv (No such file or directory) +``` + +### Invalid RDF + +``` +ERROR:setlr:Failed to parse JSON-LD: Expecting property name enclosed in double quotes +``` + +## Performance Tips + +### 1. Use Quiet Mode + +```bash +setlr --quiet script.setl.ttl # Faster without progress bars +``` + +### 2. Test with Samples + +```bash +setlr -n 100 script.setl.ttl # Test with 100 rows first +``` + +### 3. Use Persisted Datasets + +For large outputs, use `setl:Persisted` in your script: + +```turtle +:largeGraph a void:Dataset, setl:Persisted ; + prov:wasGeneratedBy [ ... ] . +``` + +### 4. Profile Performance + +```bash +time setlr script.setl.ttl # Measure execution time +``` + +## Integration Examples + +### Shell Script + +```bash +#!/bin/bash +set -e # Exit on error + +echo "Running ETL pipeline..." 
+ +# Extract and transform +setlr --quiet extract.setl.ttl + +# Validate +if setlr --quiet --rdf-validation shapes.ttl transform.setl.ttl; then + echo "✓ Validation passed" +else + echo "✗ Validation failed" + exit 1 +fi + +echo "Pipeline complete" +``` + +### Makefile + +```makefile +.PHONY: all clean test + +all: output.ttl + +output.ttl: transform.setl.ttl data.csv + setlr transform.setl.ttl + +test: + setlr -n 10 transform.setl.ttl + +clean: + rm -f output.ttl +``` + +### Python Subprocess + +```python +import subprocess +import sys + +try: + result = subprocess.run( + ['setlr', 'transform.setl.ttl'], + check=True, + capture_output=True, + text=True + ) + print("Success:", result.stdout) +except subprocess.CalledProcessError as e: + print("Error:", e.stderr, file=sys.stderr) + sys.exit(1) +``` + +## See Also + +- [Python API](python-api.md) - Using setlr as a library +- [Tutorial](tutorial.md) - Writing SETL scripts +- [Examples](examples.md) - Complete examples diff --git a/docs/examples.md b/docs/examples.md new file mode 100644 index 0000000..b5f1b95 --- /dev/null +++ b/docs/examples.md @@ -0,0 +1,457 @@ +# Examples + +Complete working examples demonstrating SETLr features. + +## Example 1: Basic CSV to RDF + +Transform a simple CSV file into FOAF RDF. + +### Input: people.csv + +```csv +ID,Name,Email,Age +1,Alice Smith,alice@example.com,30 +2,Bob Jones,bob@example.com,25 +3,Carol White,carol@example.com,35 +``` + +### SETL Script: people.setl.ttl + +```turtle +@prefix setl: . +@prefix prov: . +@prefix csvw: . +@prefix void: . +@prefix dcterms: . +@prefix pv: . +@prefix : . + +:table a csvw:Table, setl:Table ; + csvw:delimiter "," ; + prov:wasGeneratedBy [ + a setl:Extract ; + prov:used ; + ] . 
+ +:graph a void:Dataset ; + prov:wasGeneratedBy [ + a setl:Transform, setl:JSLDT ; + prov:used :table ; + setl:hasContext '''{ + "foaf": "http://xmlns.com/foaf/0.1/" + }''' ; + prov:value '''[{ + "@id": "http://example.com/person/{{row.ID}}", + "@type": "foaf:Person", + "foaf:name": "{{row.Name}}", + "foaf:mbox": "mailto:{{row.Email}}", + "foaf:age": "{{row.Age}}" + }]''' ; + ] . + + a pv:File ; + dcterms:format "text/turtle" ; + prov:wasGeneratedBy [ + a setl:Load ; + prov:used :graph ; + ] . +``` + +### Run + +```bash +setlr people.setl.ttl +``` + +### Output: people.ttl + +```turtle +@prefix foaf: . + + a foaf:Person ; + foaf:age "30" ; + foaf:mbox "mailto:alice@example.com" ; + foaf:name "Alice Smith" . + + a foaf:Person ; + foaf:age "25" ; + foaf:mbox "mailto:bob@example.com" ; + foaf:name "Bob Jones" . + + a foaf:Person ; + foaf:age "35" ; + foaf:mbox "mailto:carol@example.com" ; + foaf:name "Carol White" . +``` + +## Example 2: Conditionals and Iteration + +Handle optional fields and delimited values. + +### Input: social.csv + +```csv +ID,Name,MarriedTo,Friends +Alice,Alice Smith,Bob,Bob; Carol +Bob,Bob Smith,Alice,Alice; Carol; Dave +Carol,Carol White,,Alice; Bob +Dave,Dave Jones,,Bob +``` + +### SETL Script: social.setl.ttl + +```turtle +@prefix setl: . +@prefix prov: . +@prefix csvw: . +@prefix void: . +@prefix dcterms: . +@prefix pv: . +@prefix : . + +:table a csvw:Table, setl:Table ; + prov:wasGeneratedBy [ + a setl:Extract ; + prov:used ; + ] . 
+ +:graph a void:Dataset ; + prov:wasGeneratedBy [ + a setl:Transform, setl:JSLDT ; + prov:used :table ; + setl:hasContext '''{ + "foaf": "http://xmlns.com/foaf/0.1/", + "schema": "http://schema.org/" + }''' ; + prov:value '''[{ + "@id": "http://example.com/person/{{row.ID}}", + "@type": "foaf:Person", + "foaf:name": "{{row.Name}}", + "schema:spouse": [{ + "@if": "not isempty(row.MarriedTo)", + "@id": "http://example.com/person/{{row.MarriedTo}}" + }], + "foaf:knows": [{ + "@if": "not isempty(row.Friends)", + "@for": "friend in row.Friends.split('; ')", + "@do": { "@id": "http://example.com/person/{{friend}}" } + }] + }]''' ; + ] . + + a pv:File ; + dcterms:format "text/turtle" ; + prov:wasGeneratedBy [ + a setl:Load ; + prov:used :graph ; + ] . +``` + +**Key Features:** +- `@if` checks for empty MarriedTo field +- `@for` loops over semicolon-separated friends +- Only generates triples when data exists + +## Example 3: XML to RDF with XPath + +Extract book data from XML with XPath filtering. + +### Input: books.xml + +```xml + + + + Gambardella, Matthew + XML Developer's Guide + Computer + 44.95 + + + Ralls, Kim + Midnight Rain + Fantasy + 5.95 + + +``` + +### SETL Script: books.setl.ttl + +```turtle +@prefix setl: . +@prefix prov: . +@prefix void: . +@prefix : . + +:table a setl:Table ; + setl:xpath "//book" ; + prov:wasGeneratedBy [ + a setl:Extract ; + prov:used ; + ] . + +:graph a void:Dataset ; + prov:wasGeneratedBy [ + a setl:Transform, setl:JSLDT ; + prov:used :table ; + prov:value '''[{ + "@id": "http://example.com/book/{{row['@id']}}", + "@type": "http://schema.org/Book", + "http://schema.org/author": "{{row.author}}", + "http://schema.org/name": "{{row.title}}", + "http://schema.org/genre": "{{row.genre}}", + "http://schema.org/price": "{{row.price}}" + }]''' ; + ] . 
+``` + +**Key Features:** +- `setl:xpath` filters to only `<book>` elements +- XML attributes accessed with `row['@id']` +- Efficient streaming parse for large XML files + +## Example 4: Python Function Transform + +Use custom Python code for complex processing. + +### Input: sales.csv + +```csv +Product,Quantity,Price +Widget,10,15.99 +Gadget,5,29.99 +Doohickey,3,9.99 +``` + +### SETL Script: sales.setl.ttl + +```turtle +@prefix setl: <http://purl.org/twc/vocab/setl/> . +@prefix prov: <http://www.w3.org/ns/prov#> . +@prefix void: <http://rdfs.org/ns/void#> . +@prefix : <http://example.com/> . + +:table a setl:Table ; + prov:wasGeneratedBy [ + a setl:Extract ; + prov:used <sales.csv> ; + ] . + +:graph a void:Dataset ; + prov:wasGeneratedBy [ + a setl:PythonScript ; + prov:used :table ; + prov:value ''' +from rdflib import Namespace, Literal +from rdflib.namespace import RDF + +ex = Namespace("http://example.com/") +schema = Namespace("http://schema.org/") + +# Calculate totals +for index, row in table.iterrows(): + total = float(row['Quantity']) * float(row['Price']) + + # Create product + product = ex[f"product/{index}"] + result.add((product, RDF.type, schema.Product)) + result.add((product, schema.name, Literal(row['Product']))) + result.add((product, ex.quantity, Literal(row['Quantity']))) + result.add((product, ex.price, Literal(row['Price']))) + result.add((product, ex.total, Literal(f"{total:.2f}"))) + +# Add summary +summary = ex.SalesSummary +result.add((summary, RDF.type, ex.Summary)) +result.add((summary, ex.totalRevenue, Literal(f"{(table['Quantity'].astype(float) * table['Price'].astype(float)).sum():.2f}"))) +''' ; + ] . +``` + +**Key Features:** +- Full Python code for complex calculations +- Access pandas DataFrame methods +- Direct RDF triple generation + +## Example 5: Combining Multiple Tables + +Join data from multiple sources. + +### Input Files + +employees.csv: +```csv +EmpID,Name,DeptID +1,Alice,10 +2,Bob,20 +3,Carol,10 +``` + +departments.csv: +```csv +DeptID,DeptName +10,Engineering +20,Sales +``` + +### SETL Script: combined.setl.ttl + +```turtle +@prefix setl: <http://purl.org/twc/vocab/setl/> . +@prefix prov: <http://www.w3.org/ns/prov#> . 
+@prefix dcterms: <http://purl.org/dc/terms/> . +@prefix void: <http://rdfs.org/ns/void#> . +@prefix : <http://example.com/> . + +:employees a setl:Table ; + prov:wasGeneratedBy [ + a setl:Extract ; + prov:used <employees.csv> ; + ] . + +:departments a setl:Table ; + prov:wasGeneratedBy [ + a setl:Extract ; + prov:used <departments.csv> ; + ] . + +:graph a void:Dataset ; + prov:wasGeneratedBy [ + a setl:PythonScript ; + prov:used :employees ; + prov:qualifiedUsage [ + a prov:Usage ; + prov:entity :departments ; + prov:hadRole [ dcterms:identifier "depts" ] ; + ] ; + prov:value ''' +from rdflib import Namespace, Literal, URIRef +from rdflib.namespace import RDF +import pandas as pd + +ex = Namespace("http://example.com/") + +# Get departments table +depts = resources[str(URIRef("http://example.com/departments"))] + +# Join tables +merged = pd.merge(table, depts, on='DeptID') + +# Generate RDF +for index, row in merged.iterrows(): + emp = ex[f"employee/{row['EmpID']}"] + result.add((emp, RDF.type, ex.Employee)) + result.add((emp, ex.name, Literal(row['Name']))) + result.add((emp, ex.department, Literal(row['DeptName']))) +''' ; + ] . +``` + +**Key Features:** +- Multiple extract activities +- `prov:qualifiedUsage` for secondary table +- pandas merge for joining data + +## Example 6: Using from Python + +Complete Python script for ETL. 
+ +```python +from rdflib import Graph, Namespace, Literal, URIRef +from rdflib.namespace import RDF, PROV +import setlr +import tempfile +import os + +# Define namespaces +setl = Namespace('http://purl.org/twc/vocab/setl/') +void = Namespace('http://rdfs.org/ns/void#') +csvw = Namespace('http://www.w3.org/ns/csvw#') +ex = Namespace('http://example.com/') + +# Create sample CSV +with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as f: + f.write('ID,Name,Value\n') + f.write('1,Item A,100\n') + f.write('2,Item B,200\n') + f.write('3,Item C,150\n') + csv_file = f.name + +try: + # Build SETL graph + setl_graph = Graph() + setl_graph.bind('setl', setl) + setl_graph.bind('prov', PROV) + setl_graph.bind('void', void) + setl_graph.bind('csvw', csvw) + setl_graph.bind('ex', ex) + + # Extract + table = ex.table + setl_graph.add((table, RDF.type, setl.Table)) + setl_graph.add((table, RDF.type, csvw.Table)) + + extract = setl_graph.resource(setl_graph.skolemize()) + extract.add(RDF.type, setl.Extract) + extract.add(PROV.used, URIRef('file://' + csv_file)) + setl_graph.add((table, PROV.wasGeneratedBy, extract.identifier)) + + # Transform + output = ex.output + setl_graph.add((output, RDF.type, void.Dataset)) + + transform = setl_graph.resource(setl_graph.skolemize()) + transform.add(RDF.type, setl.Transform) + transform.add(RDF.type, setl.JSLDT) + transform.add(PROV.used, table) + + template = '''[{ + "@id": "http://example.com/item/{{row.ID}}", + "@type": "http://example.com/Item", + "http://example.com/name": "{{row.Name}}", + "http://example.com/value": "{{row.Value}}" + }]''' + transform.add(PROV.value, Literal(template)) + setl_graph.add((output, PROV.wasGeneratedBy, transform.identifier)) + + # Execute + print("Executing SETL script...") + resources = setlr.run_setl(setl_graph) + + # Access results + table_df = resources[table] + print(f"\nLoaded table with {len(table_df)} rows:") + print(table_df) + + output_graph = resources[output] + 
print(f"\nGenerated {len(output_graph)} RDF triples") + + # Query the graph + item_type = URIRef('http://example.com/Item') + items = list(output_graph.subjects(RDF.type, item_type)) + print(f"\nFound {len(items)} items:") + for item in items: + print(f" - {item}") + + # Save to file + output_graph.serialize('output.ttl', format='turtle') + print("\nSaved to output.ttl") + +finally: + os.unlink(csv_file) +``` + +## More Examples + +Browse the [example/](../example/) directory for additional examples: + +- `social.setl.ttl` - Social network with conditionals and loops +- `ontology.setl.ttl` - OWL ontology transformation + +## See Also + +- [Tutorial](tutorial.md) - Step-by-step learning +- [JSLDT Reference](jsldt.md) - Template language details +- [Python API](python-api.md) - Programmatic usage +- [Advanced Features](advanced.md) - More capabilities diff --git a/docs/installation.md b/docs/installation.md new file mode 100644 index 0000000..efa6884 --- /dev/null +++ b/docs/installation.md @@ -0,0 +1,214 @@ +# Installation Guide + +How to install and set up SETLr. + +## Requirements + +- **Python**: 3.8 or higher +- **Operating System**: Linux, macOS, or Windows +- **Disk Space**: ~100 MB (including dependencies) + +## Installation Methods + +### 1. Install from PyPI (Recommended) + +```bash +pip install setlr +``` + +This installs the latest stable release from the Python Package Index. + +### 2. Install from Source + +For the latest development version: + +```bash +# Clone repository +git clone https://github.com/tetherless-world/setlr.git +cd setlr + +# Install +pip install . +``` + +### 3. Development Installation + +For contributing or development: + +```bash +# Clone repository +git clone https://github.com/tetherless-world/setlr.git +cd setlr + +# Bootstrap (creates venv, installs dependencies) +./script/bootstrap + +# Activate virtual environment +source venv/bin/activate + +# Install in editable mode +pip install -e . 
+``` + +## Verify Installation + +Check that setlr is installed: + +```bash +# Check CLI tool +setlr --help + +# Check Python module +python -c "import setlr; print(setlr.__version__)" +``` + +Expected output: +``` +Usage: setlr [OPTIONS] SCRIPT +... + +1.0.2 +``` + +## Dependencies + +SETLr automatically installs these dependencies: + +### Core Dependencies + +- **rdflib** (>=6.0.0) - RDF processing +- **pandas** (>=0.23.0) - DataFrame operations +- **jinja2** - Template rendering +- **click** - CLI interface +- **tqdm** - Progress bars + +### Data Format Support + +- **beautifulsoup4**, **lxml** - XML/HTML parsing +- **xlrd** - Excel files +- **ijson** - Streaming JSON + +### Additional Features + +- **pyshacl[js]** - SHACL validation +- **requests** - HTTP data sources +- **toposort** - Dependency ordering +- **python-slugify** - String slugification + +## Virtual Environment (Recommended) + +Using a virtual environment isolates setlr from system Python: + +```bash +# Create virtual environment +python3 -m venv setlr-env + +# Activate (Linux/macOS) +source setlr-env/bin/activate + +# Activate (Windows) +setlr-env\\Scripts\\activate + +# Install setlr +pip install setlr + +# When done +deactivate +``` + +## Troubleshooting + +### Issue: `ModuleNotFoundError: No module named 'rdflib'` + +**Solution**: Dependencies weren't installed. 
Try: + +```bash +pip install --upgrade pip +pip install setlr --force-reinstall +``` + +### Issue: `setlr: command not found` + +**Solution**: pip's bin directory not in PATH: + +```bash +# Find where pip installs scripts +python -m site --user-base + +# Add to PATH (Linux/macOS) +export PATH="$HOME/.local/bin:$PATH" + +# Or use full path +python -m setlr script.setl.ttl +``` + +### Issue: Permission denied on Linux + +**Solution**: Install for user only: + +```bash +pip install --user setlr +``` + +### Issue: SSL Certificate Error + +**Solution**: Update certificates or use --trusted-host: + +```bash +pip install --trusted-host pypi.org --trusted-host files.pythonhosted.org setlr +``` + +## Upgrading + +Upgrade to the latest version: + +```bash +pip install --upgrade setlr +``` + +Check current version: + +```bash +pip show setlr +``` + +## Uninstalling + +Remove setlr: + +```bash +pip uninstall setlr +``` + +## Docker + +Use setlr in Docker: + +```dockerfile +FROM python:3.11-slim + +# Install setlr +RUN pip install setlr + +# Copy your scripts +COPY transform.setl.ttl data.csv /app/ + +WORKDIR /app + +# Run setlr +CMD ["setlr", "transform.setl.ttl"] +``` + +Build and run: + +```bash +docker build -t my-setlr-app . +docker run my-setlr-app +``` + +## Next Steps + +- Follow the [Quick Start Guide](quickstart.md) +- Read the [Tutorial](tutorial.md) +- See [Examples](examples.md) +- Check the [CLI Reference](cli.md) diff --git a/docs/jsldt.md b/docs/jsldt.md new file mode 100644 index 0000000..30472c4 --- /dev/null +++ b/docs/jsldt.md @@ -0,0 +1,491 @@ +# JSLDT Template Language Reference + +Complete reference for the JSON-LD Template (JSLDT) language used in SETLr transforms. + +## Overview + +JSLDT is a template language for generating RDF from tabular data. 
It combines: +- **JSON-LD** for RDF structure +- **Jinja2** for dynamic values +- **Control structures** (`@if`, `@for`, `@with`) for logic + +## Basic Template + +```turtle + a void:Dataset ; + prov:wasGeneratedBy [ + a setl:Transform, setl:JSLDT ; + prov:used :myTable ; + prov:value '''[{ + "@id": "http://example.com/item/{{row.ID}}", + "@type": "http://example.com/Item", + "http://example.com/name": "{{row.Name}}" + }]''' ; + ] . +``` + +The template is applied to each row in the table, generating separate JSON-LD documents that are merged into one RDF graph. + +## Available Variables + +Inside JSLDT templates: + +| Variable | Type | Description | +|----------|------|-------------| +| `row` | pandas.Series | Current row being processed | +| `table` | pandas.DataFrame | Full source table | +| `name` | int/str | Row index | +| `template` | str | Full JSON template | +| `transform` | rdflib.Resource | Current transform resource | +| `setl_graph` | rdflib.Graph | SETL script graph | +| `resources` | dict | All generated SETL resources | +| `re` | module | Python regex module | + +### Built-in Functions + +| Function | Description | Example | +|----------|-------------|---------| +| `isempty(value)` | Check if value is NaN/None | `not isempty(row.Email)` | +| `hash(value)` | SHA-256 hash | `hash(row.ID)` | + +## Context + +Define JSON-LD context with `setl:hasContext`: + +```turtle +setl:hasContext '''{ + "foaf": "http://xmlns.com/foaf/0.1/", + "schema": "http://schema.org/", + "@vocab": "http://example.com/vocab/" +}''' ; +``` + +Or inline in the template: + +```json +[{ + "@context": { + "foaf": "http://xmlns.com/foaf/0.1/" + }, + "@id": "...", + ... +}] +``` + +## Jinja2 Templating + +All strings (keys and values) are processed as Jinja2 templates. 
+ +### Basic Substitution + +```json +{ + "@id": "http://example.com/person/{{row.ID}}", + "http://example.com/name": "{{row.Name}}", + "http://example.com/email": "{{row.Email}}" +} +``` + +### Expressions + +```json +{ + "@id": "http://example.com/person/{{row.FirstName}}-{{row.LastName}}", + "http://example.com/fullName": "{{row.FirstName}} {{row.LastName}}", + "http://example.com/ageInMonths": "{{row.Age * 12}}" +} +``` + +### Filters + +Jinja2 filters are available: + +```json +{ + "http://example.com/name": "{{row.Name | upper}}", + "http://example.com/email": "{{row.Email | lower}}", + "http://example.com/title": "{{row.Title | title}}" +} +``` + +### Python Methods + +Access pandas Series/DataFrame methods: + +```json +{ + "@id": "http://example.com/{{row.Name.replace(' ', '_')}}", + "http://example.com/items": "{{row.Items.split(';')[0]}}" +} +``` + +## Control Structures + +### @if - Conditional Elements + +Include elements only when condition is true: + +```json +[{ + "@id": "http://example.com/person/{{row.ID}}", + "@type": "foaf:Person", + "foaf:name": "{{row.Name}}", + "foaf:mbox": [{ + "@if": "not isempty(row.Email)", + "@id": "mailto:{{row.Email}}" + }] +}] +``` + +**Key Points:** +- Wrap in array `[{...}]` to ensure valid JSON-LD +- Condition is Python expression +- Element is omitted if condition is false +- Empty arrays are valid JSON-LD + +**Common Patterns:** + +```json +// Check for non-empty value +"@if": "not isempty(row.Field)" + +// Check string value +"@if": "row.Status == 'active'" + +// Check numeric value +"@if": "row.Age >= 18" + +// Complex condition +"@if": "not isempty(row.Email) and row.Email.endswith('@example.com')" +``` + +### @for - Iteration + +Repeat elements for each item in an iterable: + +```json +[{ + "@id": "http://example.com/person/{{row.ID}}", + "foaf:knows": [{ + "@if": "not isempty(row.Friends)", + "@for": "friend in row.Friends.split('; ')", + "@do": { + "@id": "http://example.com/person/{{friend}}" + } + }] +}] 
+``` + +**Key Points:** +- `@for` defines loop variable and iterable +- `@do` specifies what to repeat +- Loop variable is scoped to `@do` block +- Can combine with `@if` for filtering + +**Common Patterns:** + +```json +// Split delimited string +"@for": "item in row.Items.split('; ')" + +// Iterate list +"@for": "tag in row.Tags" + +// Enumerate with index +"@for": "i, item in enumerate(row.Items.split(','))" + +// Multiple variables (from dict/tuple) +"@for": "key, value in row.items()" +``` + +### @for with Multiple Variables + +```json +[{ + "@for": "p, o in row.items()", + "@do": { + "@if": "not isempty(o)", + "@id": "http://example.com/{{name}}", + "http://example.com/{{p}}": "{{o}}" + } +}] +``` + +This iterates over all columns in the row. + +### @with - Variable Binding + +Assign values to variables: + +```json +[{ + "@id": "http://example.com/person/{{row.ID}}", + "@with": { + "fullName": "{{row.FirstName}} {{row.LastName}}", + "year": "{{row.BirthDate.split('-')[0]}}" + }, + "@do": { + "foaf:name": "{{fullName}}", + "schema:birthYear": "{{year}}" + } +}] +``` + +**Benefits:** +- Avoid repeating complex expressions +- Make templates more readable +- Pre-process values + +## Advanced Patterns + +### Nested Structures + +```json +[{ + "@id": "http://example.com/person/{{row.ID}}", + "@type": "foaf:Person", + "foaf:name": "{{row.Name}}", + "schema:address": { + "@type": "schema:PostalAddress", + "schema:streetAddress": "{{row.Street}}", + "schema:addressLocality": "{{row.City}}", + "schema:addressRegion": "{{row.State}}", + "schema:postalCode": "{{row.Zip}}" + } +}] +``` + +### Arrays of Values + +```json +[{ + "@id": "http://example.com/person/{{row.ID}}", + "foaf:name": "{{row.Name}}", + "foaf:knows": [ + { "@id": "http://example.com/person/Alice" }, + { "@id": "http://example.com/person/Bob" } + ] +}] +``` + +### Typed Literals + +```json +[{ + "@id": "http://example.com/person/{{row.ID}}", + "foaf:age": { + "@value": "{{row.Age}}", + "@type": 
"http://www.w3.org/2001/XMLSchema#integer" + }, + "schema:birthDate": { + "@value": "{{row.BirthDate}}", + "@type": "http://www.w3.org/2001/XMLSchema#date" + } +}] +``` + +### Language Tags + +```json +[{ + "@id": "http://example.com/book/{{row.ID}}", + "dcterms:title": [ + { + "@value": "{{row.TitleEN}}", + "@language": "en" + }, + { + "@value": "{{row.TitleFR}}", + "@language": "fr" + } + ] +}] +``` + +### Named Graphs + +Generate quads (triples with graph context): + +```json +[{ + "@id": "http://example.com/graph/{{row.ID}}", + "@graph": [{ + "@id": "http://example.com/person/{{row.ID}}", + "@type": "foaf:Person", + "foaf:name": "{{row.Name}}" + }] +}] +``` + +## Secondary Resources + +Use additional tables or graphs in transforms via `prov:qualifiedUsage`: + +```turtle + a void:Dataset ; + prov:wasGeneratedBy [ + a setl:Transform, setl:JSLDT ; + prov:used :mainTable ; + prov:qualifiedUsage [ + a prov:Usage ; + prov:entity :lookupTable ; + prov:hadRole [ dcterms:identifier "lookup" ] ; + ] ; + prov:value '''...''' ; + ] . +``` + +Access in template via `resources`: + +```json +[{ + "@for": "lrow in resources['http://example.com/lookupTable'].itertuples()", + "@do": { + "@id": "http://example.com/{{lrow.ID}}", + "http://example.com/value": "{{lrow.Value}}" + } +}] +``` + +## Optimization + +### Persisted Datasets + +For large outputs, persist to disk instead of memory: + +```turtle + a void:Dataset, setl:Persisted ; + prov:wasGeneratedBy [ + a setl:Transform, setl:JSLDT ; + prov:used :largeTable ; + prov:value '''...''' ; + ] . +``` + +This uses a TrigStore backend that writes triples to disk as they're generated. 
+ +## Debugging + +### Test with Sample Rows + +Process only first N rows: + +```python +import setlr +setlr.core.run_samples = 10 # Process 10 rows only +``` + +### Print Variables + +Add debug output: + +```json +[{ + "@id": "http://example.com/{{row.ID}}", + "@type": "{{row.Type if 'Type' in row.index else 'Unknown'}}" +}] +``` + +Or use Python's logging in template: + +```python +# In transform +prov:value ''' +<% import logging %> +<% logging.info("Processing row: " + str(row.to_dict())) %> +[{...}] +''' ; +``` + +### Check Row Data + +Examine what's in each row: + +```python +# View sample data +print(table.head()) +print(table.columns) +print(table.dtypes) +``` + +## Error Messages + +SETLr provides detailed error context when templates fail: + +``` +ERROR:setlr:Error rendering template: 'NoneType' object has no attribute 'split' +ERROR:setlr:Row data: {'ID': '3', 'Name': 'Alice', 'Friends': ''} +ERROR:setlr:Template context: +ERROR:setlr: 3: "@id": "http://example.com/{{row.ID}}", +ERROR:setlr: 4: "foaf:knows": [{ +ERROR:setlr:>>> 5: "@for": "f in row.Friends.split(';')", +ERROR:setlr: 6: "@do": { "@id": "http://example.com/{{f}}" } +ERROR:setlr: 7: }] +``` + +## Best Practices + +### 1. Always Check for Empty Values + +```json +// Good +"foaf:mbox": [{ + "@if": "not isempty(row.Email)", + "@id": "mailto:{{row.Email}}" +}] + +// Bad - will fail on empty cells +"foaf:mbox": "mailto:{{row.Email}}" +``` + +### 2. Use Meaningful Variable Names + +```json +// Good +"@for": "category in row.Categories.split(';')", +"@do": { "@id": "http://example.com/category/{{category}}" } + +// Less clear +"@for": "c in row.Categories.split(';')", +"@do": { "@id": "http://example.com/category/{{c}}" } +``` + +### 3. 
Keep Templates Readable + +```json +// Good - split complex logic +"@with": { + "fullName": "{{row.First}} {{row.Last}}", + "email": "{{row.Email.lower() if not isempty(row.Email) else ''}}" +}, +"@do": { + "foaf:name": "{{fullName}}", + "foaf:mbox": "mailto:{{email}}" +} + +// Harder to read +"foaf:name": "{{row.First}} {{row.Last}}", +"foaf:mbox": "mailto:{{row.Email.lower() if not isempty(row.Email) else ''}}" +``` + +### 4. Use Consistent Prefixes + +Define all prefixes in context: + +```json +{ + "foaf": "http://xmlns.com/foaf/0.1/", + "schema": "http://schema.org/", + "dc": "http://purl.org/dc/terms/" +} +``` + +## Examples + +See [examples documentation](examples.md) for complete working examples. + +## See Also + +- [Tutorial](tutorial.md) - Step-by-step JSLDT guide +- [Python API](python-api.md) - Building JSLDT from Python +- [Advanced Features](advanced.md) - More transform options diff --git a/docs/python-api.md b/docs/python-api.md new file mode 100644 index 0000000..7025086 --- /dev/null +++ b/docs/python-api.md @@ -0,0 +1,287 @@ +# Python API Reference + +Complete guide to using SETLr programmatically from Python. + +## Main Entry Point + +### `run_setl(setl_graph)` + +Execute a SETL script and return all generated resources. 
+ +**Parameters:** +- `setl_graph` (rdflib.Graph): An RDF graph containing the SETL script description + +**Returns:** +- `dict`: Dictionary mapping resource URIs (as URIRef objects) to their generated content: + - Tables → pandas DataFrame + - RDF Graphs → rdflib.Graph + - Functions → Python functions + +**Example:** + +```python +from rdflib import Graph, URIRef +import setlr + +# Load SETL script +setl_graph = Graph() +setl_graph.parse("transform.setl.ttl", format="turtle") + +# Execute +resources = setlr.run_setl(setl_graph) + +# Access resources by URI +table_uri = URIRef('http://example.com/myTable') +if table_uri in resources: + df = resources[table_uri] + print(f"Loaded table with {len(df)} rows") + +output_uri = URIRef('http://example.com/output') +if output_uri in resources: + graph = resources[output_uri] + print(f"Generated {len(graph)} triples") +``` + +## Complete Python Example + +Here's a complete example building a SETL script programmatically: + +```python +from rdflib import Graph, Namespace, Literal, URIRef +from rdflib.namespace import RDF, PROV +import setlr +import tempfile + +# Define namespaces +setl = Namespace('http://purl.org/twc/vocab/setl/') +void = Namespace('http://rdfs.org/ns/void#') +csvw = Namespace('http://www.w3.org/ns/csvw#') +dcterms = Namespace('http://purl.org/dc/terms/') +ex = Namespace('http://example.com/') + +# Create CSV file +with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as f: + f.write('Name,Age\n') + f.write('Alice,30\n') + f.write('Bob,25\n') + csv_file = f.name + +# Build SETL graph +setl_graph = Graph() +setl_graph.bind('setl', setl) +setl_graph.bind('prov', PROV) +setl_graph.bind('void', void) +setl_graph.bind('csvw', csvw) + +# Extract: Define table +table = ex.myTable +setl_graph.add((table, RDF.type, setl.Table)) +setl_graph.add((table, RDF.type, csvw.Table)) +setl_graph.add((table, csvw.delimiter, Literal(','))) + +extract = setl_graph.resource(setl_graph.skolemize()) 
+extract.add(RDF.type, setl.Extract) +extract.add(PROV.used, URIRef('file://' + csv_file)) +setl_graph.add((table, PROV.wasGeneratedBy, extract.identifier)) + +# Transform: Define JSON-LD template +output = ex.output +setl_graph.add((output, RDF.type, void.Dataset)) + +transform = setl_graph.resource(setl_graph.skolemize()) +transform.add(RDF.type, setl.Transform) +transform.add(RDF.type, setl.JSLDT) +transform.add(PROV.used, table) + +template = '''[{ + "@id": "http://example.com/person/{{row.Name}}", + "@type": "http://xmlns.com/foaf/0.1/Person", + "http://xmlns.com/foaf/0.1/name": "{{row.Name}}", + "http://xmlns.com/foaf/0.1/age": "{{row.Age}}" +}]''' +transform.add(PROV.value, Literal(template)) +setl_graph.add((output, PROV.wasGeneratedBy, transform.identifier)) + +# Execute +resources = setlr.run_setl(setl_graph) + +# Access results +output_graph = resources[output] +print(f"Generated {len(output_graph)} RDF triples") + +# Query the graph +from rdflib import URIRef as U +foaf_name = U('http://xmlns.com/foaf/0.1/name') +for s, p, o in output_graph.triples((None, foaf_name, None)): + print(f"{s} has name: {o}") +``` + +## Utility Functions + +SETLr exports several utility functions that can be used independently: + +### Data Reading Functions + +```python +from rdflib import Graph +import setlr + +# Read CSV +csv_graph = Graph() +df = setlr.read_csv('data.csv', csv_graph) + +# Read Excel +excel_graph = Graph() +df = setlr.read_excel('data.xlsx', excel_graph) + +# Read JSON +json_graph = Graph() +data = setlr.read_json('data.json', json_graph) + +# Read XML +xml_graph = Graph() +data = setlr.read_xml('data.xml', xml_graph) + +# Read RDF graph +rdf_graph = Graph() +graph = setlr.read_graph('data.ttl', rdf_graph) +``` + +### Helper Functions + +```python +import setlr + +# Check if value is empty/NaN +if setlr.isempty(value): + print("Value is empty") + +# Generate hash +hash_value = setlr.hash("some text") # SHA-256 hash + +# Convert to camelCase +name = 
setlr.camelcase("hello-world") # Returns "HelloWorld" + +# Get content from URL or file +content = setlr.get_content('http://example.com/data.csv', result_graph) +``` + +## Working with Multiple Tables + +You can process multiple tables in a single script: + +```python +from rdflib import Graph, Namespace, Literal, URIRef +from rdflib.namespace import RDF, PROV +import setlr + +setl = Namespace('http://purl.org/twc/vocab/setl/') +ex = Namespace('http://example.com/') + +setl_graph = Graph() +setl_graph.bind('setl', setl) +setl_graph.bind('prov', PROV) + +# Extract table 1 +table1 = ex.employees +setl_graph.add((table1, RDF.type, setl.Table)) +extract1 = setl_graph.resource(setl_graph.skolemize()) +extract1.add(RDF.type, setl.Extract) +extract1.add(PROV.used, URIRef('file:///path/to/employees.csv')) +setl_graph.add((table1, PROV.wasGeneratedBy, extract1.identifier)) + +# Extract table 2 +table2 = ex.departments +setl_graph.add((table2, RDF.type, setl.Table)) +extract2 = setl_graph.resource(setl_graph.skolemize()) +extract2.add(RDF.type, setl.Extract) +extract2.add(PROV.used, URIRef('file:///path/to/departments.csv')) +setl_graph.add((table2, PROV.wasGeneratedBy, extract2.identifier)) + +# Transform using both tables +# (use prov:qualifiedUsage to reference secondary tables) + +# Execute +resources = setlr.run_setl(setl_graph) + +# Access both tables +employees_df = resources[table1] +departments_df = resources[table2] +``` + +## Configuration + +### Logging + +SETLr uses Python's logging module: + +```python +import logging +import setlr + +# Set log level +setlr.logger.setLevel(logging.DEBUG) + +# Add custom handler +handler = logging.StreamHandler() +handler.setLevel(logging.INFO) +setlr.logger.addHandler(handler) +``` + +### Processing Options + +Control which rows are processed: + +```python +# Process only first N rows (for testing) +setlr.core.run_samples = 10 # Process only first 10 rows + +# Process all rows +setlr.core.run_samples = -1 # Default: process 
all +``` + +## Error Handling + +SETLr provides detailed error messages when templates fail: + +```python +from rdflib import Graph +import setlr + +try: + setl_graph = Graph() + setl_graph.parse("script.setl.ttl", format="turtle") + resources = setlr.run_setl(setl_graph) +except Exception as e: + print(f"SETL execution failed: {e}") + # Error includes: + # - Row data with markers + # - Template context (8 lines before error) + # - Line number in template + # - Python stack trace +``` + +## Deprecated API + +### `_setl(setl_graph)` [DEPRECATED] + +**Note:** Use `run_setl()` instead. This function is kept for backward compatibility but will emit a DeprecationWarning. + +```python +import setlr +import warnings + +# Old way (deprecated) +with warnings.catch_warnings(): + warnings.simplefilter("ignore", DeprecationWarning) + resources = setlr._setl(setl_graph) + +# New way (recommended) +resources = setlr.run_setl(setl_graph) +``` + +## Next Steps + +- Learn about [JSLDT Template Language](jsldt.md) +- Explore [Advanced Features](advanced.md) +- See [Examples](examples.md) diff --git a/docs/python-functions.md b/docs/python-functions.md new file mode 100644 index 0000000..abb05c0 --- /dev/null +++ b/docs/python-functions.md @@ -0,0 +1,255 @@ +# Python Scripts in Transforms + +SETLr allows you to execute custom Python code within transforms using `setl:PythonScript`. + +## Overview + +Python scripts in SETLr can: +- Perform complex data processing within transforms +- Manipulate RDF graphs +- Access the transform context +- Execute custom logic + +⚠️ **Note**: This is an advanced feature. For most use cases, [JSLDT templates](jsldt.md) are recommended. + +⚠️ **Security Warning**: Python scripts execute with full system access. Only run trusted SETL scripts. + +## Using Python Scripts + +Python scripts are used **within** JSLDT transforms to manipulate graphs: + +```turtle +@prefix setl: . +@prefix prov: . +@prefix void: . +@prefix csvw: . +@prefix : . 
+ +# Extract data +:dataTable a csvw:Table, setl:Table ; + prov:wasGeneratedBy [ + a setl:Extract ; + prov:used ; + ] . + +# Transform with JSLDT that uses a Python script +:processedGraph a void:Dataset ; + prov:wasGeneratedBy [ + a setl:Transform, setl:JSLDT ; + prov:used :dataTable ; + prov:used [ + a setl:PythonScript ; + prov:value ''' +# Variables available: graph, setl_graph +print(f"Processing transform with {len(graph)} triples") +''' + ] ; + prov:value '''[{ + "@id": "http://example.com/{{row.ID}}", + "@type": "http://example.com/Item", + "http://example.com/name": "{{row.Name}}" + }]''' ; + ] . +``` + +## Available Variables + +Inside Python scripts within transforms: + +| Variable | Type | Description | +|----------|------|-------------| +| `graph` | rdflib.Graph | The transform output graph | +| `setl_graph` | rdflib.Graph | The SETL script description graph | + +## Example: Count Triples by Type + +```turtle +:validatedGraph a void:Dataset ; + prov:wasGeneratedBy [ + a setl:Transform, setl:JSLDT ; + prov:used :dataTable ; + prov:used [ + a setl:PythonScript ; + prov:value ''' +from rdflib.namespace import RDF + +# Count triples by type +types = {} +for s, p, o in graph.triples((None, RDF.type, None)): + t = str(o) + types[t] = types.get(t, 0) + 1 + +print("Triple counts by type:") +for t, count in sorted(types.items()): + print(f" {t}: {count}") +''' + ] ; + prov:value '''[{ + "@id": "http://example.com/{{row.ID}}", + "@type": "http://example.com/Item" + }]''' ; + ] . 
+``` + +## Example: Add Computed Triples + +```turtle +:enrichedGraph a void:Dataset ; + prov:wasGeneratedBy [ + a setl:Transform, setl:JSLDT ; + prov:used :salesTable ; + prov:used [ + a setl:PythonScript ; + prov:value ''' +from rdflib import Namespace, Literal +from rdflib.namespace import RDF + +ex = Namespace("http://example.com/") + +# Add summary statistics +total_value = 0 +count = 0 + +for s, p, o in graph.triples((None, ex.value, None)): + try: + total_value += float(o) + count += 1 + except ValueError: + pass + +if count > 0: + summary = ex.Summary + graph.add((summary, RDF.type, ex.Statistics)) + graph.add((summary, ex.total, Literal(total_value))) + graph.add((summary, ex.average, Literal(total_value / count))) + graph.add((summary, ex["count"], Literal(count))) +''' + ] ; + prov:value '''[{ + "@id": "http://example.com/sale/{{row.ID}}", + "@type": "http://example.com/Sale", + "http://example.com/value": "{{row.Value}}" + }]''' ; + ] . +``` + +## Best Practices + +### 1. Prefer JSLDT Templates + +For most transformations, use JSLDT templates instead of Python: + +```turtle +# Good: Simple and declarative +prov:value '''[{ + "@id": "http://example.com/{{row.ID}}", + "@type": "foaf:Person", + "foaf:name": "{{row.Name}}" +}]''' +``` + +### 2. Use Python for Post-Processing + +Use Python scripts for: +- Computing aggregates after template processing +- Adding summary statistics +- Validating generated RDF +- Logging and debugging + +### 3. Keep Scripts Focused + +```python +# Good: Single purpose +for s, p, o in graph.triples((None, RDF.type, ex.Item)): + count += 1 +print(f"Generated {count} items") + +# Avoid: Complex multi-purpose scripts +# (use multiple transforms instead) +``` + +### 4. 
Handle Errors Gracefully + +```python +# Good: Error handling +try: + value = float(row['Value']) + # Process value +except (ValueError, KeyError) as e: + print(f"Warning: {e}") + +# Avoid: Unhandled exceptions that crash the transform +``` + +## Common Patterns + +### Validate Generated RDF + +```python +# Check for required properties +from rdflib import Namespace, RDF +ex = Namespace("http://example.com/") + +for item in graph.subjects(RDF.type, ex.Item): + has_name = (item, ex.name, None) in graph + if not has_name: + print(f"Warning: {item} missing name property") +``` + +### Add Cross-References + +```python +from rdflib import Namespace, RDF # Link related entities +ex = Namespace("http://example.com/") + +items = list(graph.subjects(RDF.type, ex.Item)) +for i, item1 in enumerate(items): + for item2 in items[i+1:]: + # Add relationship based on some logic + graph.add((item1, ex.related, item2)) +``` + +### Compute Derived Properties + +```python +# Calculate totals, averages, etc. +from rdflib import Literal, Namespace + +ex = Namespace("http://example.com/") +total = sum(float(o) for s, p, o in graph.triples((None, ex.price, None))) + +summary = ex.PriceSummary +graph.add((summary, ex.totalPrice, Literal(total))) +``` + +## Debugging + +Enable debug logging: + +```python +import logging +import setlr + +setlr.logger.setLevel(logging.DEBUG) +``` + +Add print statements in your script: + +```python +print(f"Graph has {len(graph)} triples") +print(f"Types: {set(o for s, p, o in graph.triples((None, RDF.type, None)))}") +``` + +## Limitations + +- Python scripts run **after** JSLDT template processing +- Cannot modify the input table +- Cannot access row data directly (use JSLDT templates for that) +- Scripts execute in the transform context + +## See Also + +- [JSLDT Template Language](jsldt.md) - Recommended transformation approach +- [Python API](python-api.md) - Using setlr from Python +- [Tutorial](tutorial.md) - Step-by-step guide +- [Examples](examples.md) - Complete examples diff --git 
a/docs/quickstart.md b/docs/quickstart.md new file mode 100644 index 0000000..14b460a --- /dev/null +++ b/docs/quickstart.md @@ -0,0 +1,116 @@ +# Quick Start Guide + +Get up and running with SETLr in 5 minutes! + +## Installation + +```bash +pip install setlr +``` + +## Your First SETL Script + +### 1. Create Sample Data + +Save this as `people.csv`: + +```csv +ID,Name,Email +1,Alice Smith,alice@example.com +2,Bob Jones,bob@example.com +``` + +### 2. Create a SETL Script + +Save this as `people.setl.ttl`: + +```turtle +@prefix setl: . +@prefix prov: . +@prefix csvw: . +@prefix void: . +@prefix dcterms: . +@prefix pv: . +@prefix : . + +# Extract: Load the CSV file +:peopleTable a csvw:Table, setl:Table ; + csvw:delimiter "," ; + prov:wasGeneratedBy [ + a setl:Extract ; + prov:used ; + ] . + +# Transform: Convert to RDF using JSON-LD template +:peopleGraph a void:Dataset ; + prov:wasGeneratedBy [ + a setl:Transform, setl:JSLDT ; + prov:used :peopleTable ; + setl:hasContext '''{ + "foaf": "http://xmlns.com/foaf/0.1/" + }''' ; + prov:value '''[{ + "@id": "http://example.com/person/{{row.ID}}", + "@type": "foaf:Person", + "foaf:name": "{{row.Name}}", + "foaf:mbox": "mailto:{{row.Email}}" + }]''' ; + ] . + +# Load: Save to file + a pv:File ; + dcterms:format "text/turtle" ; + prov:wasGeneratedBy [ + a setl:Load ; + prov:used :peopleGraph ; + ] . +``` + +### 3. Run SETLr + +```bash +setlr people.setl.ttl +``` + +This creates `people.ttl` with RDF output: + +```turtle +@prefix foaf: . + + a foaf:Person ; + foaf:name "Alice Smith" ; + foaf:mbox "mailto:alice@example.com" . + + a foaf:Person ; + foaf:name "Bob Jones" ; + foaf:mbox "mailto:bob@example.com" . 
+``` + +## Using from Python + +```python +from rdflib import Graph, URIRef +import setlr + +# Load SETL script +setl_graph = Graph() +setl_graph.parse("people.setl.ttl", format="turtle") + +# Execute +resources = setlr.run_setl(setl_graph) + +# Access generated RDF +people_graph = resources[URIRef('http://example.com/peopleGraph')] +print(f"Generated {len(people_graph)} triples") + +# Query the graph +for person in people_graph.subjects(predicate=URIRef('http://xmlns.com/foaf/0.1/name')): + print(f"Person: {person}") +``` + +## Next Steps + +- Learn more about [JSLDT Template Language](jsldt.md) +- Explore [Advanced Features](advanced.md) +- See more [Examples](examples.md) +- Read the [Full Tutorial](tutorial.md) diff --git a/docs/streaming-xml.md b/docs/streaming-xml.md new file mode 100644 index 0000000..c77070e --- /dev/null +++ b/docs/streaming-xml.md @@ -0,0 +1,239 @@ +# Streaming XML with XPath + +SETLr supports efficient streaming parsing of large XML files using XPath filtering. + +## Overview + +For large XML files, loading the entire document into memory can be problematic. SETLr's streaming XML parser uses `iterparse` to process XML elements incrementally, combined with XPath expressions to filter only the elements you need. + +## Basic XML Extraction + +```turtle +@prefix setl: . +@prefix prov: . +@prefix : . + +:xmlTable a setl:Table ; + prov:wasGeneratedBy [ + a setl:Extract ; + prov:used ; + ] . +``` + +This extracts all elements from the XML file into a pandas DataFrame. + +## XPath Filtering + +Use `setl:xpath` to select specific elements: + +```turtle +:bookTable a setl:Table ; + setl:xpath "//book" ; # Select only elements + prov:wasGeneratedBy [ + a setl:Extract ; + prov:used ; + ] . 
+``` + +### Example XML File + +```xml +<?xml version="1.0"?> +<catalog> + <book id="bk101"> + <author>Gambardella, Matthew</author> + <title>XML Developer's Guide</title> + <genre>Computer</genre> + <price>44.95</price> + </book> + <book id="bk102"> + <author>Ralls, Kim</author> + <title>Midnight Rain</title> + <genre>Fantasy</genre> + <price>5.95</price> + </book> + <magazine> + <title>Tech Weekly</title> + <price>9.99</price> + </magazine> +</catalog> +``` + +With `setl:xpath "//book"`, only the `<book>` elements are extracted, not the `<magazine>`. + +## Advanced XPath Patterns + +### Select by Child Element Value + +```turtle +:expensiveBooks a setl:Table ; + setl:xpath "//book[price > 10]" ; + prov:wasGeneratedBy [ + a setl:Extract ; + prov:used ; + ] . +``` + +### Select Nested Elements + +```turtle +:chapters a setl:Table ; + setl:xpath "//book/chapter" ; + prov:wasGeneratedBy [ + a setl:Extract ; + prov:used ; + ] . +``` + +### Combine Conditions + +```turtle +:computerBooks a setl:Table ; + setl:xpath "//book[genre='Computer']" ; + prov:wasGeneratedBy [ + a setl:Extract ; + prov:used ; + ] . +``` + +## DTD Validation + +For XML files with DTD declarations, you can enable validation: + +```turtle +:validatedTable a setl:Table, setl:DTDValidatedXML ; + setl:xpath "//record" ; + prov:wasGeneratedBy [ + a setl:Extract ; + prov:used ; + ] . +``` + +## Performance Considerations + +### Memory Efficiency + +Streaming XML parsing is particularly useful for: +- **Large files** (> 100 MB) +- **Many elements** (thousands of records) +- **Limited memory** environments + +The parser only keeps the current element in memory, not the entire document. + +### Progress Tracking + +SETLr shows a progress bar when parsing XML: + +``` +Processing XML: 45%|████▌ | 1234/2750 [00:12<00:15, 98.2 elements/s] +``` + +## Complete Example + +### SETL Script (`books.setl.ttl`) + +```turtle +@prefix setl: . +@prefix prov: . +@prefix csvw: . +@prefix void: . +@prefix : . + +# Extract: Parse XML with XPath +:booksTable a setl:Table, csvw:Table ; + setl:xpath "//book" ; + prov:wasGeneratedBy [ + a setl:Extract ; + prov:used ; + ] . 
+ +# Transform: Convert to RDF +:booksGraph a void:Dataset ; + prov:wasGeneratedBy [ + a setl:Transform, setl:JSLDT ; + prov:used :booksTable ; + prov:value '''[{ + "@id": "http://example.com/book/{{row['@id']}}", + "@type": "http://schema.org/Book", + "http://schema.org/author": "{{row.author}}", + "http://schema.org/name": "{{row.title}}", + "http://schema.org/genre": "{{row.genre}}" + }]''' ; + ] . +``` + +### Run from Python + +```python +from rdflib import Graph, URIRef +import setlr + +# Load SETL script +setl_graph = Graph() +setl_graph.parse("books.setl.ttl", format="turtle") + +# Execute (streaming XML parse happens here) +resources = setlr.run_setl(setl_graph) + +# Access parsed data +books_df = resources[URIRef('http://example.com/booksTable')] +print(f"Extracted {len(books_df)} books") +print(books_df.head()) + +# Access generated RDF +books_graph = resources[URIRef('http://example.com/booksGraph')] +print(f"Generated {len(books_graph)} triples") +``` + +## XML Attributes + +XML attributes are accessible in the DataFrame with `@` prefix: + +```xml + + My Book + +``` + +Access in template: +``` +"{{row['@id']}}" # → "bk101" +"{{row['@isbn']}}" # → "1234567890" +"{{row.title}}" # → "My Book" +``` + +## Nested Elements + +For nested XML structures: + +```xml + + + John Doe + 2024 + + Example + +``` + +Use nested XPath: +```turtle +:metadata a setl:Table ; + setl:xpath "//book/metadata" ; + prov:wasGeneratedBy [ + a setl:Extract ; + prov:used ; + ] . 
+``` + +## Limitations + +- XPath 1.0 syntax only (not full XPath 2.0) +- Element text content and attributes only (no CDATA sections) +- Cannot access parent or sibling elements after extraction + +## See Also + +- [JSLDT Template Language](jsldt.md) - For transforming extracted data +- [Python API](python-api.md) - Using XML extraction from Python +- [Examples](examples.md) - More XML examples diff --git a/docs/tutorial.md b/docs/tutorial.md new file mode 100644 index 0000000..049325b --- /dev/null +++ b/docs/tutorial.md @@ -0,0 +1,288 @@ +# SETLr Tutorial + +Learn the fundamentals of SETLr by building a complete ETL pipeline from CSV to RDF. + +## Overview + +SETLr uses declarative SETL (Semantic Extract, Transform, and Load) workflows described in RDF to transform tabular data into semantic RDF graphs. This tutorial teaches you the core concepts step-by-step. + +## Sample Data + +Create a file named `social.csv` with this content: + +```csv +ID,Name,MarriedTo,Knows,DOB +Alice,Alice Smith,Bob,Bob; Charles,1/12/1983 +Bob,Bob Smith,Alice,Alice; Charles,3/23/1985 +Charles,Charles Brown,,Alice; Bob,12/15/1955 +Dave,Dave Jones,,,4/25/1967 +``` + +## Step 1: Starting Your SETL File + +Create `social.setl.ttl` with namespace prefixes: + +```turtle +@prefix prov: . +@prefix dcat: . +@prefix dcterms: . +@prefix void: . +@prefix setl: . +@prefix csvw: . +@prefix pv: . +@prefix : . +``` + +## Step 2: Extracting Data + +Add an Extract activity to load the CSV: + +```turtle +:table a csvw:Table, setl:Table ; + csvw:delimiter "," ; + prov:wasGeneratedBy [ + a setl:Extract ; + prov:used ; + ] . 
+``` + +**Key Points:** +- `csvw:Table` indicates CSV format +- `setl:Table` marks it as a SETL table resource +- `csvw:delimiter` specifies the delimiter (default is comma) +- `csvw:skipRows` can skip header rows if needed + +### Supported Extract Formats + +| Type | Format | Options | +|------|--------|---------| +| `csvw:Table, setl:Table` | CSV/TSV | `csvw:delimiter`, `csvw:skipRows` | +| `setl:Excel, setl:Table` | Excel (XLS/XLSX) | None | +| `setl:XPORT, setl:Table` | SAS XPORT | None | +| `setl:SAS7BDAT, setl:Table` | SAS Dataset | None | +| `void:Dataset` | RDF (Turtle, JSON-LD, etc.) | None | +| `owl:Ontology` | OWL Ontology | None | + +## Step 3: Transforming with JSLDT + +JSLDT (JSON-LD Templates) transform tables into RDF using Jinja2 templating: + +```turtle + a void:Dataset ; + prov:wasGeneratedBy [ + a setl:Transform, setl:JSLDT ; + prov:used :table ; + setl:hasContext '''{ + "foaf": "http://xmlns.com/foaf/0.1/" + }''' ; + prov:value '''[{ + "@id": "https://example.com/social/{{row.ID}}", + "@type": "foaf:Person", + "foaf:name": "{{row.Name}}" + }]''' ; + ] . +``` + +This generates RDF for each row: + +```turtle + a foaf:Person ; + foaf:name "Alice Smith" . + + a foaf:Person ; + foaf:name "Bob Smith" . + +# ... 
etc +``` + +### Template Variables + +Inside JSLDT templates, you have access to: + +- `row` - Current row as pandas.Series +- `table` - Full table as pandas.DataFrame +- `name` - Row index +- `isempty()` - Function to check for empty/NaN values +- `hash()` - Generate a SHA-256 hash of a value +- `re` - Python regex module +- `resources` - All generated SETL resources + +## Step 4: Conditional Elements + +Use `@if` to conditionally include elements: + +```turtle +prov:value '''[{ + "@id": "https://example.com/social/{{row.ID}}", + "@type": "foaf:Person", + "foaf:name": "{{row.Name}}", + "http://schema.org/spouse": [{ + "@if": "not isempty(row.MarriedTo)", + "@id": "https://example.com/social/{{row.MarriedTo}}" + }] +}]''' ; +``` + +Now only Alice and Bob have `schema:spouse` properties. + +**Key Points:** +- `@if` value is a Python expression +- Wrap in array `[{...}]` for valid JSON-LD +- Use `isempty()` to safely check for NaN/None + +## Step 5: Iterating with @for + +Split delimited values with `@for`: + +```turtle +prov:value '''[{ + "@id": "https://example.com/social/{{row.ID}}", + "@type": "foaf:Person", + "foaf:name": "{{row.Name}}", + "foaf:knows": [{ + "@if": "not isempty(row.Knows)", + "@for": "friend in row.Knows.split('; ')", + "@do": { "@id": "https://example.com/social/{{friend}}" } + }] +}]''' ; +``` + +This creates multiple `foaf:knows` links: + +```turtle + a foaf:Person ; + foaf:knows , + ; + foaf:name "Alice Smith" . +``` + +**Key Points:** +- `@for` iterates over Python iterable +- `@do` is repeated for each item +- Variable (e.g., `friend`) is scoped to the loop + +## Step 6: Loading Results + +Save to a file: + +```turtle + a pv:File ; + dcterms:format "text/turtle" ; + prov:wasGeneratedBy [ + a setl:Load ; + prov:used ; + ] . 
+``` + +### Supported Formats + +- **RDF/XML**: `application/rdf+xml`, `text/rdf` (default) +- **Turtle**: `text/turtle`, `application/turtle` +- **N-Triples**: `text/plain` +- **N3**: `text/n3` +- **TriG**: `application/trig` +- **JSON-LD**: `application/json` + +### Load to SPARQL Endpoint + +```turtle +@prefix sd: <http://www.w3.org/ns/sparql-service-description#> . + +:sparql_load a setl:Load, sd:Service ; + sd:endpoint ; + prov:used <http://example.com/social> . +``` + +## Complete Example + +Here's the full `social.setl.ttl`: + +```turtle +@prefix prov: <http://www.w3.org/ns/prov#> . +@prefix dcterms: <http://purl.org/dc/terms/> . +@prefix void: <http://rdfs.org/ns/void#> . +@prefix setl: <http://purl.org/twc/vocab/setl/> . +@prefix csvw: <http://www.w3.org/ns/csvw#> . +@prefix pv: <http://purl.org/net/provenance/ns#> . +@prefix : . + +# Extract +:table a csvw:Table, setl:Table ; + csvw:delimiter "," ; + prov:wasGeneratedBy [ + a setl:Extract ; + prov:used ; + ] . + +# Transform +<http://example.com/social> a void:Dataset ; + prov:wasGeneratedBy [ + a setl:Transform, setl:JSLDT ; + prov:used :table ; + setl:hasContext '''{ + "foaf": "http://xmlns.com/foaf/0.1/", + "schema": "http://schema.org/" + }''' ; + prov:value '''[{ + "@id": "https://example.com/social/{{row.ID}}", + "@type": "foaf:Person", + "foaf:name": "{{row.Name}}", + "schema:spouse": [{ + "@if": "not isempty(row.MarriedTo)", + "@id": "https://example.com/social/{{row.MarriedTo}}" + }], + "foaf:knows": [{ + "@if": "not isempty(row.Knows)", + "@for": "friend in row.Knows.split('; ')", + "@do": { "@id": "https://example.com/social/{{friend}}" } + }] + }]''' ; + ] . + +# Load +<social.ttl> a pv:File ; + dcterms:format "text/turtle" ; + prov:wasGeneratedBy [ + a setl:Load ; + prov:used <http://example.com/social> ; + ] . +``` + +## Running Your SETL Script + +### Command Line + +```bash +setlr social.setl.ttl +``` + +This creates `social.ttl` with the RDF output. 
+ +### From Python + +```python +from rdflib import Graph, URIRef +import setlr + +# Load script +setl_graph = Graph() +setl_graph.parse("social.setl.ttl", format="turtle") + +# Execute +resources = setlr.run_setl(setl_graph) + +# Access results +social_graph = resources[URIRef('http://example.com/social')] +print(f"Generated {len(social_graph)} triples") +``` + +## Next Steps + +- Learn more about [JSLDT Template Language](jsldt.md) +- Explore [Advanced Features](advanced.md): + - [Streaming XML](streaming-xml.md) + - [Python Functions](python-functions.md) + - [SPARQL Support](sparql.md) + - [SHACL Validation](shacl.md) +- See more [Examples](examples.md) +- Check the [Python API Reference](python-api.md) diff --git a/pyproject.toml b/pyproject.toml index a99201f..47c2ae0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,6 @@ name = "setlr" version = "1.0.2" description = "setlr is a tool for Semantic Extraction, Transformation, and Loading." readme = "README.md" -license = {text = "Apache License 2.0"} authors = [ {name = "Jamie McCusker", email = "mccusj@cs.rpi.edu"} ] @@ -15,7 +14,6 @@ keywords = ["rdf", "semantic", "etl"] classifiers = [ "Development Status :: 5 - Production/Stable", "Topic :: Utilities", - "License :: OSI Approved :: Apache Software License", ] requires-python = ">=3.8" dependencies = [ @@ -48,6 +46,3 @@ setlr = "setlr:main" [tool.setuptools] packages = ["setlr"] include-package-data = true - -[tool.setuptools.package-data] -setlr = ["**/*"] diff --git a/setup.cfg b/setup.cfg index 21c4a39..9d8e31c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,9 +1,3 @@ -[bdist_wheel] -universal = 1 - -[metadata] -description-file = README.md - [flake8] exclude = config-template,iterparse_filter,venv ignore = E115,E116,E121,E122,E126,E127,E128,E201,E202,E203,E226,E225,E228,E231,E241,E251,E261,E265,E301,E302,E303,E305,E501,W291,W293 diff --git a/setup.py b/setup.py index 1cb9a7a..75186fa 100644 --- a/setup.py +++ b/setup.py @@ -1,60 +1,11 @@ -import 
os -from setuptools import setup, find_packages +from setuptools import setup from sys import argv -#from _version import __version__ -__version__='1.0.1' - -# Utility function to read the README file. -# Used for the long_description. It's nice, because now 1) we have a top level -# README file and 2) it's easier to type in the README file than to put a raw -# string in below ... -def read(fname): - return open(os.path.join(os.path.dirname(__file__), fname)).read() +__version__='1.0.2' if '--version' in argv: print(__version__) else: - setup( - name = "setlr", - version = __version__, - author = "Jamie McCusker", - author_email = "mccusj@cs.rpi.edu", - description = ("setlr is a tool for Semantic Extraction, Transformation, and Loading."), - license = "Apache License 2.0", - keywords = "rdf semantic etl", - url = "http://packages.python.org/setlr", - packages=['setlr'], - long_description='''SETLr is a tool for generating RDF graphs, including named graphs, from almost any kind of tabular data.''', - include_package_data = True, - install_requires = [ - 'future', - 'pip>=9.0.0', - 'cython', - 'numpy', - 'rdflib>=6.0.0', - 'pandas>=0.23.0', - 'requests', - 'toposort', - 'beautifulsoup4', - 'jinja2', - 'lxml', - 'six', - 'xlrd', - 'ijson', - 'click', - 'tqdm', - 'requests-testadapter', - 'python-slugify', - 'pyshacl[js]' - ], - entry_points = { - 'console_scripts': ['setlr=setlr:main'], - }, - classifiers=[ - "Development Status :: 5 - Production/Stable", - "Topic :: Utilities", - "License :: OSI Approved :: Apache Software License", - ], - ) + # Configuration is now in pyproject.toml + setup() diff --git a/tests/setlr_test/test_programmatic_usage.py b/tests/setlr_test/test_programmatic_usage.py new file mode 100644 index 0000000..20cdddd --- /dev/null +++ b/tests/setlr_test/test_programmatic_usage.py @@ -0,0 +1,201 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Unit tests for using setlr programmatically from Python. 
+ +Tests the main API entry points (run_setl) for executing SETL scripts +from Python code. +""" + +import unittest +import tempfile +import os +from rdflib import Graph, Namespace, Literal, URIRef +from rdflib.namespace import RDF, PROV +import setlr + +setl = Namespace('http://purl.org/twc/vocab/setl/') +void = Namespace('http://rdfs.org/ns/void#') +csvw = Namespace('http://www.w3.org/ns/csvw#') +dcterms = Namespace('http://purl.org/dc/terms/') +ex = Namespace('http://example.com/') + + +class TestProgrammaticUsage(unittest.TestCase): + """Test using setlr programmatically from Python""" + + def test_simple_csv_to_rdf(self): + """Test complete ETL: CSV -> RDF using run_setl()""" + # Create test CSV + with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as f: + f.write('ID,Name,Email\n') + f.write('1,Alice,alice@example.com\n') + f.write('2,Bob,bob@example.com\n') + csv_file = f.name + + try: + # Build SETL graph programmatically + setl_graph = Graph() + setl_graph.bind('setl', setl) + setl_graph.bind('prov', PROV) + setl_graph.bind('void', void) + setl_graph.bind('csvw', csvw) + setl_graph.bind('dcterms', dcterms) + setl_graph.bind('ex', ex) + + # Extract: Load CSV + table = ex.myTable + setl_graph.add((table, RDF.type, setl.Table)) + setl_graph.add((table, RDF.type, csvw.Table)) + setl_graph.add((table, csvw.delimiter, Literal(','))) + + extract = setl_graph.resource(setl_graph.skolemize()) + extract.add(RDF.type, setl.Extract) + extract.add(PROV.used, URIRef('file://' + csv_file)) + setl_graph.add((table, PROV.wasGeneratedBy, extract.identifier)) + + # Transform: CSV -> RDF using JSLDT + output = ex.output + setl_graph.add((output, RDF.type, void.Dataset)) + + transform = setl_graph.resource(setl_graph.skolemize()) + transform.add(RDF.type, setl.Transform) + transform.add(RDF.type, setl.JSLDT) + transform.add(PROV.used, table) + + # JSON-LD template + template = '''[{ + "@id": "http://example.com/person/{{row.ID}}", + "@type": 
"http://xmlns.com/foaf/0.1/Person", + "http://xmlns.com/foaf/0.1/name": "{{row.Name}}", + "http://xmlns.com/foaf/0.1/mbox": "mailto:{{row.Email}}" +}]''' + transform.add(PROV.value, Literal(template)) + + context = '''{"foaf": "http://xmlns.com/foaf/0.1/"}''' + transform.add(setl.hasContext, Literal(context)) + + setl_graph.add((output, PROV.wasGeneratedBy, transform.identifier)) + + # Execute SETL script + resources = setlr.run_setl(setl_graph) + + # Verify results - resources dict uses URIRef as keys + self.assertIn(table, resources, "Table should be in resources") + self.assertIn(output, resources, "Output graph should be in resources") + + # Check output graph has triples + output_graph = resources[output] + self.assertIsInstance(output_graph, Graph) + self.assertGreater(len(output_graph), 0, "Output graph should have triples") + + # Verify specific triples exist + foaf_name = URIRef('http://xmlns.com/foaf/0.1/name') + names = list(output_graph.objects(predicate=foaf_name)) + self.assertGreater(len(names), 0, "Should have foaf:name triples") + + finally: + os.unlink(csv_file) + + def test_access_generated_resources(self): + """Test that run_setl returns a dictionary of all generated resources""" + # Create minimal SETL script + with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as f: + f.write('ID\n1\n2\n') + csv_file = f.name + + try: + setl_graph = Graph() + setl_graph.bind('setl', setl) + setl_graph.bind('prov', PROV) + setl_graph.bind('csvw', csvw) + + # Just extract + table = ex.testTable + setl_graph.add((table, RDF.type, setl.Table)) + setl_graph.add((table, RDF.type, csvw.Table)) # Need csvw.Table for CSV extraction + + extract = setl_graph.resource(setl_graph.skolemize()) + extract.add(RDF.type, setl.Extract) + extract.add(PROV.used, URIRef('file://' + csv_file)) + setl_graph.add((table, PROV.wasGeneratedBy, extract.identifier)) + + # Execute + resources = setlr.run_setl(setl_graph) + + # Check return type + 
self.assertIsInstance(resources, dict) + self.assertIn(table, resources) + + # Verify we can access the table + table_data = resources[table] + self.assertIsNotNone(table_data) + + finally: + os.unlink(csv_file) + + def test_multiple_transforms(self): + """Test executing multiple transforms in a single SETL script""" + # Create test CSV + with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as f: + f.write('Name,Value\n') + f.write('A,10\n') + f.write('B,20\n') + csv_file = f.name + + try: + setl_graph = Graph() + setl_graph.bind('setl', setl) + setl_graph.bind('prov', PROV) + setl_graph.bind('void', void) + setl_graph.bind('csvw', csvw) + + # Extract + table = ex.data + setl_graph.add((table, RDF.type, setl.Table)) + setl_graph.add((table, RDF.type, csvw.Table)) # Need csvw.Table for CSV extraction + + extract = setl_graph.resource(setl_graph.skolemize()) + extract.add(RDF.type, setl.Extract) + extract.add(PROV.used, URIRef('file://' + csv_file)) + setl_graph.add((table, PROV.wasGeneratedBy, extract.identifier)) + + # Transform 1 + output1 = ex.output1 + setl_graph.add((output1, RDF.type, void.Dataset)) + + transform1 = setl_graph.resource(setl_graph.skolemize()) + transform1.add(RDF.type, setl.Transform) + transform1.add(RDF.type, setl.JSLDT) + transform1.add(PROV.used, table) + transform1.add(PROV.value, Literal('[{"@id": "http://example.com/{{row.Name}}", "http://example.com/value": "{{row.Value}}"}]')) + setl_graph.add((output1, PROV.wasGeneratedBy, transform1.identifier)) + + # Transform 2 (uses same table) + output2 = ex.output2 + setl_graph.add((output2, RDF.type, void.Dataset)) + + transform2 = setl_graph.resource(setl_graph.skolemize()) + transform2.add(RDF.type, setl.Transform) + transform2.add(RDF.type, setl.JSLDT) + transform2.add(PROV.used, table) + transform2.add(PROV.value, Literal('[{"@id": "http://example.com/item/{{row.Name}}", "http://example.com/hasValue": "{{row.Value}}"}]')) + setl_graph.add((output2, PROV.wasGeneratedBy, 
transform2.identifier)) + + # Execute + resources = setlr.run_setl(setl_graph) + + # Verify both outputs were created + self.assertIn(output1, resources) + self.assertIn(output2, resources) + + # Both should be graphs + self.assertIsInstance(resources[output1], Graph) + self.assertIsInstance(resources[output2], Graph) + + finally: + os.unlink(csv_file) + + +if __name__ == '__main__': + unittest.main()