From 2c780313de72fcf56f23a1b622aaacdbcdf6b188 Mon Sep 17 00:00:00 2001 From: Bastian Havers-Zulka Date: Wed, 8 Apr 2026 16:56:40 +0200 Subject: [PATCH 1/3] Add dataprov-add CLI tool for recording processing steps Expose ProvenanceChain.add() as a CLI tool so users can record processing steps in shell scripts and pipelines without writing Python. Changes: - dataprov/cli/addstep.py: new CLI module with full argument coverage of add()'s required and optional parameters, including --capture-agent, --capture-environment, --input-provenance-files (with 'none' sentinel), --drl, and -o / --overwrite for output control - pyproject.toml: register dataprov-add entry point - README.md: add dataprov-add to CLI Tools section and table of contents - tests/test_dataprov.py: add TestCLIAddStep with 15 tests covering basic usage, optional fields, multiple I/O, provenance file linking, output redirect, overwrite, agent/env capture, and error cases Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- README.md | 57 ++++++ dataprov/cli/addstep.py | 410 ++++++++++++++++++++++++++++++++++++++++ pyproject.toml | 1 + tests/test_dataprov.py | 373 ++++++++++++++++++++++++++++++++++++ 4 files changed, 841 insertions(+) create mode 100644 dataprov/cli/addstep.py diff --git a/README.md b/README.md index b2299e4..c9bb143 100644 --- a/README.md +++ b/README.md @@ -66,6 +66,7 @@ A lightweight Python library for tracking data provenance through processing pip - [Visualization](#visualization) - [CLI Tools](#cli-tools) - [dataprov-new](#dataprov-new) + - [dataprov-add](#dataprov-add) - [dataprov-visualize](#dataprov-visualize) - [dataprov-add-attribution](#dataprov-add-attribution) - [dataprov-report](#dataprov-report) @@ -791,6 +792,62 @@ dataprov-new \ --tags video,processing,2024 ``` +### dataprov-add + +Add a processing step to an existing provenance chain. 
This is the CLI equivalent of `ProvenanceChain.add()`: + +```bash +# Minimal usage +dataprov-add -p provenance.json \ + --started-at "2024-10-15T11:00:00Z" --ended-at "2024-10-15T11:05:30Z" \ + --tool-name drone_stabilizer --tool-version 2.0 --operation stabilization \ + -i raw.mp4 --input-formats MP4 \ + --outputs stabilized.mp4 --output-formats MP4 + +# Multiple inputs and outputs +dataprov-add -p provenance.json \ + --started-at "2024-10-15T12:00:00Z" --ended-at "2024-10-15T12:10:00Z" \ + --tool-name video_combiner --tool-version 1.5 --operation concatenation \ + -i video1.mp4 video2.mp4 video3.mp4 \ + --input-formats MP4 MP4 MP4 \ + --outputs combined.mp4 --output-formats MP4 + +# With linked input provenance files (use "none" for inputs with no provenance) +dataprov-add -p provenance.json \ + --started-at "2024-10-15T12:00:00Z" --ended-at "2024-10-15T12:10:00Z" \ + --tool-name video_combiner --tool-version 1.5 --operation concatenation \ + -i video1.mp4 video2.mp4 --input-formats MP4 MP4 \ + --outputs combined.mp4 --output-formats MP4 \ + --input-provenance-files video1_prov.json none + +# With DRL, agent capture, and execution log +dataprov-add -p provenance.json \ + --started-at "2024-10-15T10:00:00Z" --ended-at "2024-10-15T10:05:30Z" \ + --tool-name processor --tool-version 1.0 --operation processing \ + -i input.txt --input-formats TXT \ + --outputs output.txt --output-formats TXT \ + --drl 3 --capture-agent \ + --output-log "Processing completed successfully" + +# Capture execution environment +dataprov-add -p provenance.json \ + --started-at "2024-10-15T10:00:00Z" --ended-at "2024-10-15T10:05:30Z" \ + --tool-name processor --tool-version 1.0 --operation processing \ + -i input.txt --input-formats TXT \ + --outputs output.txt --output-formats TXT \ + --capture-environment + +# Write to new file instead of modifying in place +dataprov-add -p provenance.json \ + --started-at "2024-10-15T11:00:00Z" --ended-at "2024-10-15T11:05:30Z" \ + --tool-name my_tool 
--tool-version 1.0 --operation processing \ + -i input.csv --input-formats CSV \ + --outputs output.csv --output-formats CSV \ + -o provenance_updated.json +``` + +Omit `--ended-at` for steps that are still ongoing or whose end time is not yet known. + ### dataprov-visualize Generate GraphViz DOT visualization: diff --git a/dataprov/cli/addstep.py b/dataprov/cli/addstep.py new file mode 100644 index 0000000..5a9f8be --- /dev/null +++ b/dataprov/cli/addstep.py @@ -0,0 +1,410 @@ +#!/usr/bin/env python3 +""" +Add Processing Step to Provenance Chain + +Command-line tool to record a processing step in an existing provenance chain. +Wraps the ProvenanceChain.add() method for use in shell pipelines and scripts. + +Usage: + dataprov-add -p provenance.json \\ + --started-at "2024-10-15T11:00:00Z" --ended-at "2024-10-15T11:05:30Z" \\ + --tool-name my_tool --tool-version 1.0 --operation processing \\ + -i input.csv --input-formats CSV \\ + --outputs output.csv --output-formats CSV + +Examples: + # Minimal usage + dataprov-add -p provenance.json \\ + --started-at "2024-10-15T11:00:00Z" --ended-at "2024-10-15T11:05:30Z" \\ + --tool-name drone_stabilizer --tool-version 2.0 --operation stabilization \\ + -i raw.mp4 --input-formats MP4 \\ + --outputs stabilized.mp4 --output-formats MP4 + + # Multiple inputs and outputs with provenance references + dataprov-add -p provenance.json \\ + --started-at "2024-10-15T12:00:00Z" --ended-at "2024-10-15T12:10:00Z" \\ + --tool-name video_combiner --tool-version 1.5 --operation concatenation \\ + -i video1.mp4 video2.mp4 --input-formats MP4 MP4 \\ + --outputs combined.mp4 --output-formats MP4 \\ + --input-provenance-files video1_prov.json none + + # With agent capture, DRL, and execution log + dataprov-add -p provenance.json \\ + --started-at "2024-10-15T10:00:00Z" \\ + --tool-name processor --tool-version 1.0 --operation processing \\ + -i input.txt --input-formats TXT \\ + --outputs output.txt --output-formats TXT \\ + --drl 3 --capture-agent \\
--output-log "Processing completed successfully" + + # Capture runtime environment + dataprov-add -p provenance.json \\ + --started-at "2024-10-15T10:00:00Z" \\ + --tool-name processor --tool-version 1.0 --operation processing \\ + -i input.txt --input-formats TXT \\ + --outputs output.txt --output-formats TXT \\ + --capture-environment + + # Output to new file instead of in-place modification + dataprov-add -p provenance.json \\ + --started-at "2024-10-15T11:00:00Z" \\ + --tool-name my_tool --tool-version 1.0 --operation processing \\ + -i input.csv --input-formats CSV \\ + --outputs output.csv --output-formats CSV \\ + -o provenance_updated.json +""" + +import argparse +import sys +from pathlib import Path + +from dataprov import ProvenanceChain + + +def main(): + parser = argparse.ArgumentParser( + description="Add a processing step to an existing provenance chain", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Basic step + dataprov-add -p provenance.json \\ + --started-at "2024-10-15T11:00:00Z" --ended-at "2024-10-15T11:05:30Z" \\ + --tool-name my_tool --tool-version 1.0 --operation processing \\ + -i input.csv --input-formats CSV \\ + --outputs output.csv --output-formats CSV + + # Multiple inputs/outputs + dataprov-add -p provenance.json \\ + --started-at "2024-10-15T12:00:00Z" --ended-at "2024-10-15T12:10:00Z" \\ + --tool-name combiner --tool-version 1.0 --operation merge \\ + -i a.csv b.csv --input-formats CSV CSV \\ + --outputs merged.csv --output-formats CSV + + # With provenance references (use "none" for missing entries) + dataprov-add -p provenance.json \\ + --started-at "2024-10-15T12:00:00Z" --ended-at "2024-10-15T12:10:00Z" \\ + --tool-name combiner --tool-version 1.0 --operation merge \\ + -i a.csv b.csv --input-formats CSV CSV \\ + --outputs merged.csv --output-formats CSV \\ + --input-provenance-files a_prov.json none + """, + ) + + # Required arguments + parser.add_argument( + "-p", + "--provenance-file", + 
required=True, + help="Path to existing provenance JSON file", + ) + + parser.add_argument( + "--started-at", + required=True, + help="ISO 8601 timestamp when processing started (e.g. 2024-10-15T11:00:00Z)", + ) + + parser.add_argument( + "--tool-name", + required=True, + help="Name of the tool that performed this step", + ) + + parser.add_argument( + "--tool-version", + required=True, + help="Version of the tool", + ) + + parser.add_argument( + "--operation", + required=True, + help="Semantic description of the operation (e.g. stabilization, merge, calibration)", + ) + + parser.add_argument( + "-i", + "--inputs", + required=True, + nargs="+", + metavar="FILE", + help="One or more input file paths", + ) + + parser.add_argument( + "--input-formats", + required=True, + nargs="+", + metavar="FORMAT", + help="File formats for each input (e.g. MP4 CSV TXT) — must match number of inputs", + ) + + parser.add_argument( + "--outputs", + required=True, + nargs="+", + metavar="FILE", + help="One or more output file paths", + ) + + parser.add_argument( + "--output-formats", + required=True, + nargs="+", + metavar="FORMAT", + help="File formats for each output — must match number of outputs", + ) + + # Optional step metadata + parser.add_argument( + "--ended-at", + default=None, + help="ISO 8601 timestamp when processing ended (omit for ongoing/incomplete steps)", + ) + + parser.add_argument( + "--source", + default="", + help="Tool source or organization", + ) + + parser.add_argument( + "--arguments", + default="", + help="Command-line arguments string used for this step", + ) + + parser.add_argument( + "--output-log", + default="", + help="Free-text output log from tool execution", + ) + + parser.add_argument( + "--warnings", + default="", + help="Warning messages or issues encountered", + ) + + parser.add_argument( + "--input-provenance-files", + nargs="+", + metavar="FILE", + default=None, + help=( + "Provenance JSON files for each input (must match number of inputs). 
" + 'Use "none" for inputs that have no provenance file.' + ), + ) + + parser.add_argument( + "--drl", + type=int, + default=None, + metavar="N", + help="Data Readiness Level after this step (0–9)", + ) + + # Agent tracking + parser.add_argument( + "--capture-agent", + action="store_true", + default=False, + help="Capture agent/user information automatically", + ) + + parser.add_argument( + "--user", + default=None, + help="Override username for agent capture (only with --capture-agent)", + ) + + parser.add_argument( + "--hostname", + default=None, + help="Override hostname for agent capture (only with --capture-agent)", + ) + + # Environment tracking + parser.add_argument( + "--capture-environment", + action="store_true", + default=False, + help="Capture execution environment information (Python version, platform, etc.)", + ) + + parser.add_argument( + "--runtime", + default=None, + help="Override runtime name for environment capture (default: auto-detect)", + ) + + parser.add_argument( + "--runtime-version", + default=None, + help="Override runtime version for environment capture (default: auto-detect)", + ) + + # Output options + parser.add_argument( + "-o", + "--output", + default=None, + help="Write updated chain to a new file instead of modifying in place", + ) + + parser.add_argument( + "--overwrite", + action="store_true", + help="Overwrite output file if it exists (only with -o)", + ) + + args = parser.parse_args() + + # Validate provenance file exists + prov_path = Path(args.provenance_file) + if not prov_path.exists(): + print( + f"Error: Provenance file not found: {args.provenance_file}", file=sys.stderr + ) + return 1 + + # Validate format list lengths + if len(args.inputs) != len(args.input_formats): + print( + f"Error: Number of --input-formats ({len(args.input_formats)}) must match" + f" number of --inputs ({len(args.inputs)})", + file=sys.stderr, + ) + return 1 + + if len(args.outputs) != len(args.output_formats): + print( + f"Error: Number of 
--output-formats ({len(args.output_formats)}) must match" + f" number of --outputs ({len(args.outputs)})", + file=sys.stderr, + ) + return 1 + + # Validate input provenance files list length + if args.input_provenance_files is not None and len( + args.input_provenance_files + ) != len(args.inputs): + print( + f"Error: Number of --input-provenance-files ({len(args.input_provenance_files)})" + f" must match number of --inputs ({len(args.inputs)})", + file=sys.stderr, + ) + return 1 + + # Validate agent-only flags + if not args.capture_agent and (args.user or args.hostname): + print( + "Error: --user and --hostname can only be used with --capture-agent", + file=sys.stderr, + ) + return 1 + + # Validate environment-only flags + if not args.capture_environment and (args.runtime or args.runtime_version): + print( + "Error: --runtime and --runtime-version can only be used with --capture-environment", + file=sys.stderr, + ) + return 1 + + # Validate output file + output_path = Path(args.output) if args.output else prov_path + if args.output and output_path.exists() and not args.overwrite: + print( + f"Error: Output file already exists: {args.output}", file=sys.stderr + ) + print("Use --overwrite to replace existing file", file=sys.stderr) + return 1 + + # Convert "none" sentinels to Python None in provenance files list + input_provenance_files = None + if args.input_provenance_files is not None: + input_provenance_files = [ + None if entry.lower() == "none" else entry + for entry in args.input_provenance_files + ] + + # Load provenance chain + try: + chain = ProvenanceChain.load(args.provenance_file) + except Exception as e: + print(f"Error: Failed to load provenance chain: {e}", file=sys.stderr) + return 1 + + # Add the processing step + try: + success = chain.add( + started_at=args.started_at, + ended_at=args.ended_at, + tool_name=args.tool_name, + tool_version=args.tool_version, + operation=args.operation, + inputs=args.inputs, + input_formats=args.input_formats, + 
outputs=args.outputs, + output_formats=args.output_formats, + source=args.source, + arguments=args.arguments, + output_log=args.output_log, + warnings=args.warnings, + input_provenance_files=input_provenance_files, + drl=args.drl, + capture_agent=args.capture_agent, + user=args.user, + hostname=args.hostname, + capture_environment=args.capture_environment, + runtime=args.runtime, + runtime_version=args.runtime_version, + ) + + if not success: + print( + "Error: Failed to add processing step (see error messages above)", + file=sys.stderr, + ) + return 1 + + except Exception as e: + print(f"Error: Failed to add processing step: {e}", file=sys.stderr) + return 1 + + # Save the updated chain + try: + chain.save(str(output_path)) + except Exception as e: + print(f"Error: Failed to save provenance chain: {e}", file=sys.stderr) + return 1 + + # Success message + input_list = ( + ", ".join(args.inputs) + if len(args.inputs) <= 3 + else f"{args.inputs[0]}, ... ({len(args.inputs)} files)" + ) + output_list = ( + ", ".join(args.outputs) + if len(args.outputs) <= 3 + else f"{args.outputs[0]}, ... 
({len(args.outputs)} files)" + ) + + print(f"Added processing step to {output_path}:", file=sys.stderr) + print(f" Tool: {args.tool_name} v{args.tool_version}", file=sys.stderr) + print(f" Operation: {args.operation}", file=sys.stderr) + print(f" Inputs: {input_list}", file=sys.stderr) + print(f" Outputs: {output_list}", file=sys.stderr) + if args.drl is not None: + print(f" DRL: {args.drl}", file=sys.stderr) + + print("\nProvenance chain updated successfully.", file=sys.stderr) + + return 0 + + +if __name__ == "__main__": + exit(main()) diff --git a/pyproject.toml b/pyproject.toml index 2188fcb..d4de664 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,6 +44,7 @@ dev = [ [project.scripts] dataprov-new = "dataprov.cli.newprovchain:main" +dataprov-add = "dataprov.cli.addstep:main" dataprov-add-attribution = "dataprov.cli.addattribution:main" dataprov-visualize = "dataprov.cli.visualize:main" dataprov-report = "dataprov.cli.report:main" diff --git a/tests/test_dataprov.py b/tests/test_dataprov.py index 80a79af..994350b 100644 --- a/tests/test_dataprov.py +++ b/tests/test_dataprov.py @@ -2875,3 +2875,376 @@ def test_custom_metadata_validation(self): "invalidkey": "value" # missing namespace prefix }, ) + + +class TestCLIAddStep: + """Tests for the dataprov-add CLI tool.""" + + @pytest.fixture + def prov_file(self, tmp_path): + """Create a minimal provenance chain file for CLI tests.""" + chain = ProvenanceChain.create( + entity_id="cli_test_entity", + initial_source="/test/source/", + ) + filepath = tmp_path / "prov.json" + chain.save(str(filepath)) + return filepath + + def _run_main(self, argv): + """Run dataprov-add main() with given argv list.""" + import sys + + from dataprov.cli.addstep import main + + old_argv = sys.argv + sys.argv = ["dataprov-add"] + argv + try: + return main() + finally: + sys.argv = old_argv + + def test_add_basic_step(self, prov_file, tmp_path, sample_file): + """Test adding a basic processing step via CLI.""" + result = 
self._run_main( + [ + "-p", str(prov_file), + "--started-at", "2024-10-15T11:00:00Z", + "--ended-at", "2024-10-15T11:05:30Z", + "--tool-name", "my_tool", + "--tool-version", "1.0", + "--operation", "processing", + "-i", str(sample_file), + "--input-formats", "TXT", + "--outputs", str(sample_file), + "--output-formats", "TXT", + ] + ) + + assert result == 0 + chain = ProvenanceChain.load(str(prov_file)) + assert len(chain.get_steps()) == 1 + step = chain.get_steps()[0] + assert step["tool"]["name"] == "my_tool" + assert step["tool"]["version"] == "1.0" + assert step["operation"] == "processing" + + def test_add_step_with_optional_fields(self, prov_file, sample_file): + """Test adding a step with optional metadata fields.""" + result = self._run_main( + [ + "-p", str(prov_file), + "--started-at", "2024-10-15T11:00:00Z", + "--ended-at", "2024-10-15T11:05:30Z", + "--tool-name", "my_tool", + "--tool-version", "2.0", + "--operation", "calibration", + "-i", str(sample_file), + "--input-formats", "TXT", + "--outputs", str(sample_file), + "--output-formats", "TXT", + "--source", "ACME Corp", + "--arguments", "--mode fast", + "--output-log", "Done", + "--warnings", "None", + "--drl", "5", + ] + ) + + assert result == 0 + chain = ProvenanceChain.load(str(prov_file)) + step = chain.get_steps()[0] + assert step["arguments"] == "--mode fast" + assert step["output_log"] == "Done" + assert step["warnings"] == "None" + assert step["drl"] == 5 + + def test_add_step_no_ended_at(self, prov_file, sample_file): + """Test adding an ongoing step without --ended-at.""" + result = self._run_main( + [ + "-p", str(prov_file), + "--started-at", "2024-10-15T11:00:00Z", + "--tool-name", "my_tool", + "--tool-version", "1.0", + "--operation", "processing", + "-i", str(sample_file), + "--input-formats", "TXT", + "--outputs", str(sample_file), + "--output-formats", "TXT", + ] + ) + + assert result == 0 + chain = ProvenanceChain.load(str(prov_file)) + assert len(chain.get_steps()) == 1 + + def 
test_add_step_multiple_inputs_outputs(self, prov_file, sample_files): + """Test adding a step with multiple inputs and outputs.""" + result = self._run_main( + [ + "-p", str(prov_file), + "--started-at", "2024-10-15T11:00:00Z", + "--ended-at", "2024-10-15T11:05:30Z", + "--tool-name", "combiner", + "--tool-version", "1.0", + "--operation", "merge", + "-i", str(sample_files[0]), str(sample_files[1]), + "--input-formats", "TXT", "TXT", + "--outputs", str(sample_files[2]), + "--output-formats", "TXT", + ] + ) + + assert result == 0 + chain = ProvenanceChain.load(str(prov_file)) + assert len(chain.get_steps()) == 1 + + def test_add_step_with_none_provenance_files(self, prov_file, sample_files): + """Test --input-provenance-files with 'none' sentinel values.""" + result = self._run_main( + [ + "-p", str(prov_file), + "--started-at", "2024-10-15T11:00:00Z", + "--ended-at", "2024-10-15T11:05:30Z", + "--tool-name", "combiner", + "--tool-version", "1.0", + "--operation", "merge", + "-i", str(sample_files[0]), str(sample_files[1]), + "--input-formats", "TXT", "TXT", + "--outputs", str(sample_files[2]), + "--output-formats", "TXT", + "--input-provenance-files", "none", "none", + ] + ) + + assert result == 0 + + def test_add_step_output_to_new_file(self, prov_file, tmp_path, sample_file): + """Test writing the updated chain to a new output file.""" + output_file = tmp_path / "prov_updated.json" + + result = self._run_main( + [ + "-p", str(prov_file), + "--started-at", "2024-10-15T11:00:00Z", + "--ended-at", "2024-10-15T11:05:30Z", + "--tool-name", "my_tool", + "--tool-version", "1.0", + "--operation", "processing", + "-i", str(sample_file), + "--input-formats", "TXT", + "--outputs", str(sample_file), + "--output-formats", "TXT", + "-o", str(output_file), + ] + ) + + assert result == 0 + assert output_file.exists() + # Original file should be unchanged + original_chain = ProvenanceChain.load(str(prov_file)) + assert len(original_chain.get_steps()) == 0 + # Updated file should 
have the step + updated_chain = ProvenanceChain.load(str(output_file)) + assert len(updated_chain.get_steps()) == 1 + + def test_missing_provenance_file(self, tmp_path, sample_file): + """Test error when provenance file does not exist.""" + result = self._run_main( + [ + "-p", str(tmp_path / "nonexistent.json"), + "--started-at", "2024-10-15T11:00:00Z", + "--tool-name", "my_tool", + "--tool-version", "1.0", + "--operation", "processing", + "-i", str(sample_file), + "--input-formats", "TXT", + "--outputs", str(sample_file), + "--output-formats", "TXT", + ] + ) + + assert result == 1 + + def test_mismatched_input_formats(self, prov_file, sample_files): + """Test error when --input-formats count does not match --inputs.""" + result = self._run_main( + [ + "-p", str(prov_file), + "--started-at", "2024-10-15T11:00:00Z", + "--tool-name", "my_tool", + "--tool-version", "1.0", + "--operation", "processing", + "-i", str(sample_files[0]), str(sample_files[1]), + "--input-formats", "TXT", # only 1 format for 2 inputs + "--outputs", str(sample_files[2]), + "--output-formats", "TXT", + ] + ) + + assert result == 1 + + def test_mismatched_output_formats(self, prov_file, sample_files): + """Test error when --output-formats count does not match --outputs.""" + result = self._run_main( + [ + "-p", str(prov_file), + "--started-at", "2024-10-15T11:00:00Z", + "--tool-name", "my_tool", + "--tool-version", "1.0", + "--operation", "processing", + "-i", str(sample_files[0]), + "--input-formats", "TXT", + "--outputs", str(sample_files[1]), str(sample_files[2]), + "--output-formats", "TXT", # only 1 format for 2 outputs + ] + ) + + assert result == 1 + + def test_overwrite_existing_output(self, prov_file, tmp_path, sample_file): + """Test --overwrite flag for output file.""" + output_file = tmp_path / "output.json" + output_file.write_text("{}") + + # Without --overwrite should fail + result = self._run_main( + [ + "-p", str(prov_file), + "--started-at", "2024-10-15T11:00:00Z", + 
"--tool-name", "my_tool", + "--tool-version", "1.0", + "--operation", "processing", + "-i", str(sample_file), + "--input-formats", "TXT", + "--outputs", str(sample_file), + "--output-formats", "TXT", + "-o", str(output_file), + ] + ) + assert result == 1 + + # With --overwrite should succeed + result = self._run_main( + [ + "-p", str(prov_file), + "--started-at", "2024-10-15T11:00:00Z", + "--tool-name", "my_tool", + "--tool-version", "1.0", + "--operation", "processing", + "-i", str(sample_file), + "--input-formats", "TXT", + "--outputs", str(sample_file), + "--output-formats", "TXT", + "-o", str(output_file), + "--overwrite", + ] + ) + assert result == 0 + + def test_capture_agent(self, prov_file, sample_file): + """Test --capture-agent flag records user info.""" + result = self._run_main( + [ + "-p", str(prov_file), + "--started-at", "2024-10-15T11:00:00Z", + "--ended-at", "2024-10-15T11:05:30Z", + "--tool-name", "my_tool", + "--tool-version", "1.0", + "--operation", "processing", + "-i", str(sample_file), + "--input-formats", "TXT", + "--outputs", str(sample_file), + "--output-formats", "TXT", + "--capture-agent", + ] + ) + + assert result == 0 + chain = ProvenanceChain.load(str(prov_file)) + step = chain.get_steps()[0] + assert step.get("agent") is not None + + def test_capture_environment(self, prov_file, sample_file): + """Test --capture-environment flag records runtime info.""" + result = self._run_main( + [ + "-p", str(prov_file), + "--started-at", "2024-10-15T11:00:00Z", + "--ended-at", "2024-10-15T11:05:30Z", + "--tool-name", "my_tool", + "--tool-version", "1.0", + "--operation", "processing", + "-i", str(sample_file), + "--input-formats", "TXT", + "--outputs", str(sample_file), + "--output-formats", "TXT", + "--capture-environment", + ] + ) + + assert result == 0 + chain = ProvenanceChain.load(str(prov_file)) + step = chain.get_steps()[0] + assert step.get("environment") is not None + + def test_user_without_capture_agent_fails(self, prov_file, 
sample_file): + """Test that --user without --capture-agent returns error.""" + result = self._run_main( + [ + "-p", str(prov_file), + "--started-at", "2024-10-15T11:00:00Z", + "--tool-name", "my_tool", + "--tool-version", "1.0", + "--operation", "processing", + "-i", str(sample_file), + "--input-formats", "TXT", + "--outputs", str(sample_file), + "--output-formats", "TXT", + "--user", "alice", + ] + ) + + assert result == 1 + + def test_runtime_without_capture_environment_fails(self, prov_file, sample_file): + """Test that --runtime without --capture-environment returns error.""" + result = self._run_main( + [ + "-p", str(prov_file), + "--started-at", "2024-10-15T11:00:00Z", + "--tool-name", "my_tool", + "--tool-version", "1.0", + "--operation", "processing", + "-i", str(sample_file), + "--input-formats", "TXT", + "--outputs", str(sample_file), + "--output-formats", "TXT", + "--runtime", "Node.js", + ] + ) + + assert result == 1 + + def test_add_multiple_steps(self, prov_file, sample_file): + """Test adding multiple steps sequentially via CLI.""" + for i in range(3): + result = self._run_main( + [ + "-p", str(prov_file), + "--started-at", f"2024-10-15T1{i}:00:00Z", + "--ended-at", f"2024-10-15T1{i}:05:00Z", + "--tool-name", f"tool_{i}", + "--tool-version", "1.0", + "--operation", f"step_{i}", + "-i", str(sample_file), + "--input-formats", "TXT", + "--outputs", str(sample_file), + "--output-formats", "TXT", + ] + ) + assert result == 0 + + chain = ProvenanceChain.load(str(prov_file)) + assert len(chain.get_steps()) == 3 From 721306353c66747569b315ea30f8115152a37351 Mon Sep 17 00:00:00 2001 From: Bastian Havers-Zulka Date: Tue, 14 Apr 2026 13:49:08 +0200 Subject: [PATCH 2/3] reformat to pass ruff format --- dataprov/cli/addstep.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/dataprov/cli/addstep.py b/dataprov/cli/addstep.py index 5a9f8be..4c18041 100644 --- a/dataprov/cli/addstep.py +++ b/dataprov/cli/addstep.py @@ -316,9 +316,7 @@ def 
main(): # Validate output file output_path = Path(args.output) if args.output else prov_path if args.output and output_path.exists() and not args.overwrite: - print( - f"Error: Output file already exists: {args.output}", file=sys.stderr - ) + print(f"Error: Output file already exists: {args.output}", file=sys.stderr) print("Use --overwrite to replace existing file", file=sys.stderr) return 1 From 0c542b61547eec85ef2b255c947ac0a48f9d38d0 Mon Sep 17 00:00:00 2001 From: Bastian Havers-Zulka Date: Tue, 14 Apr 2026 13:51:12 +0200 Subject: [PATCH 3/3] see previous commit --- tests/test_dataprov.py | 496 +++++++++++++++++++++++++++-------------- 1 file changed, 333 insertions(+), 163 deletions(-) diff --git a/tests/test_dataprov.py b/tests/test_dataprov.py index 994350b..0391261 100644 --- a/tests/test_dataprov.py +++ b/tests/test_dataprov.py @@ -2908,16 +2908,26 @@ def test_add_basic_step(self, prov_file, tmp_path, sample_file): """Test adding a basic processing step via CLI.""" result = self._run_main( [ - "-p", str(prov_file), - "--started-at", "2024-10-15T11:00:00Z", - "--ended-at", "2024-10-15T11:05:30Z", - "--tool-name", "my_tool", - "--tool-version", "1.0", - "--operation", "processing", - "-i", str(sample_file), - "--input-formats", "TXT", - "--outputs", str(sample_file), - "--output-formats", "TXT", + "-p", + str(prov_file), + "--started-at", + "2024-10-15T11:00:00Z", + "--ended-at", + "2024-10-15T11:05:30Z", + "--tool-name", + "my_tool", + "--tool-version", + "1.0", + "--operation", + "processing", + "-i", + str(sample_file), + "--input-formats", + "TXT", + "--outputs", + str(sample_file), + "--output-formats", + "TXT", ] ) @@ -2933,21 +2943,36 @@ def test_add_step_with_optional_fields(self, prov_file, sample_file): """Test adding a step with optional metadata fields.""" result = self._run_main( [ - "-p", str(prov_file), - "--started-at", "2024-10-15T11:00:00Z", - "--ended-at", "2024-10-15T11:05:30Z", - "--tool-name", "my_tool", - "--tool-version", "2.0", - 
"--operation", "calibration", - "-i", str(sample_file), - "--input-formats", "TXT", - "--outputs", str(sample_file), - "--output-formats", "TXT", - "--source", "ACME Corp", - "--arguments", "--mode fast", - "--output-log", "Done", - "--warnings", "None", - "--drl", "5", + "-p", + str(prov_file), + "--started-at", + "2024-10-15T11:00:00Z", + "--ended-at", + "2024-10-15T11:05:30Z", + "--tool-name", + "my_tool", + "--tool-version", + "2.0", + "--operation", + "calibration", + "-i", + str(sample_file), + "--input-formats", + "TXT", + "--outputs", + str(sample_file), + "--output-formats", + "TXT", + "--source", + "ACME Corp", + "--arguments", + "--mode fast", + "--output-log", + "Done", + "--warnings", + "None", + "--drl", + "5", ] ) @@ -2963,15 +2988,24 @@ def test_add_step_no_ended_at(self, prov_file, sample_file): """Test adding an ongoing step without --ended-at.""" result = self._run_main( [ - "-p", str(prov_file), - "--started-at", "2024-10-15T11:00:00Z", - "--tool-name", "my_tool", - "--tool-version", "1.0", - "--operation", "processing", - "-i", str(sample_file), - "--input-formats", "TXT", - "--outputs", str(sample_file), - "--output-formats", "TXT", + "-p", + str(prov_file), + "--started-at", + "2024-10-15T11:00:00Z", + "--tool-name", + "my_tool", + "--tool-version", + "1.0", + "--operation", + "processing", + "-i", + str(sample_file), + "--input-formats", + "TXT", + "--outputs", + str(sample_file), + "--output-formats", + "TXT", ] ) @@ -2983,16 +3017,28 @@ def test_add_step_multiple_inputs_outputs(self, prov_file, sample_files): """Test adding a step with multiple inputs and outputs.""" result = self._run_main( [ - "-p", str(prov_file), - "--started-at", "2024-10-15T11:00:00Z", - "--ended-at", "2024-10-15T11:05:30Z", - "--tool-name", "combiner", - "--tool-version", "1.0", - "--operation", "merge", - "-i", str(sample_files[0]), str(sample_files[1]), - "--input-formats", "TXT", "TXT", - "--outputs", str(sample_files[2]), - "--output-formats", "TXT", + "-p", + 
str(prov_file), + "--started-at", + "2024-10-15T11:00:00Z", + "--ended-at", + "2024-10-15T11:05:30Z", + "--tool-name", + "combiner", + "--tool-version", + "1.0", + "--operation", + "merge", + "-i", + str(sample_files[0]), + str(sample_files[1]), + "--input-formats", + "TXT", + "TXT", + "--outputs", + str(sample_files[2]), + "--output-formats", + "TXT", ] ) @@ -3004,17 +3050,31 @@ def test_add_step_with_none_provenance_files(self, prov_file, sample_files): """Test --input-provenance-files with 'none' sentinel values.""" result = self._run_main( [ - "-p", str(prov_file), - "--started-at", "2024-10-15T11:00:00Z", - "--ended-at", "2024-10-15T11:05:30Z", - "--tool-name", "combiner", - "--tool-version", "1.0", - "--operation", "merge", - "-i", str(sample_files[0]), str(sample_files[1]), - "--input-formats", "TXT", "TXT", - "--outputs", str(sample_files[2]), - "--output-formats", "TXT", - "--input-provenance-files", "none", "none", + "-p", + str(prov_file), + "--started-at", + "2024-10-15T11:00:00Z", + "--ended-at", + "2024-10-15T11:05:30Z", + "--tool-name", + "combiner", + "--tool-version", + "1.0", + "--operation", + "merge", + "-i", + str(sample_files[0]), + str(sample_files[1]), + "--input-formats", + "TXT", + "TXT", + "--outputs", + str(sample_files[2]), + "--output-formats", + "TXT", + "--input-provenance-files", + "none", + "none", ] ) @@ -3026,17 +3086,28 @@ def test_add_step_output_to_new_file(self, prov_file, tmp_path, sample_file): result = self._run_main( [ - "-p", str(prov_file), - "--started-at", "2024-10-15T11:00:00Z", - "--ended-at", "2024-10-15T11:05:30Z", - "--tool-name", "my_tool", - "--tool-version", "1.0", - "--operation", "processing", - "-i", str(sample_file), - "--input-formats", "TXT", - "--outputs", str(sample_file), - "--output-formats", "TXT", - "-o", str(output_file), + "-p", + str(prov_file), + "--started-at", + "2024-10-15T11:00:00Z", + "--ended-at", + "2024-10-15T11:05:30Z", + "--tool-name", + "my_tool", + "--tool-version", + "1.0", + 
"--operation", + "processing", + "-i", + str(sample_file), + "--input-formats", + "TXT", + "--outputs", + str(sample_file), + "--output-formats", + "TXT", + "-o", + str(output_file), ] ) @@ -3053,15 +3124,24 @@ def test_missing_provenance_file(self, tmp_path, sample_file): """Test error when provenance file does not exist.""" result = self._run_main( [ - "-p", str(tmp_path / "nonexistent.json"), - "--started-at", "2024-10-15T11:00:00Z", - "--tool-name", "my_tool", - "--tool-version", "1.0", - "--operation", "processing", - "-i", str(sample_file), - "--input-formats", "TXT", - "--outputs", str(sample_file), - "--output-formats", "TXT", + "-p", + str(tmp_path / "nonexistent.json"), + "--started-at", + "2024-10-15T11:00:00Z", + "--tool-name", + "my_tool", + "--tool-version", + "1.0", + "--operation", + "processing", + "-i", + str(sample_file), + "--input-formats", + "TXT", + "--outputs", + str(sample_file), + "--output-formats", + "TXT", ] ) @@ -3071,15 +3151,25 @@ def test_mismatched_input_formats(self, prov_file, sample_files): """Test error when --input-formats count does not match --inputs.""" result = self._run_main( [ - "-p", str(prov_file), - "--started-at", "2024-10-15T11:00:00Z", - "--tool-name", "my_tool", - "--tool-version", "1.0", - "--operation", "processing", - "-i", str(sample_files[0]), str(sample_files[1]), - "--input-formats", "TXT", # only 1 format for 2 inputs - "--outputs", str(sample_files[2]), - "--output-formats", "TXT", + "-p", + str(prov_file), + "--started-at", + "2024-10-15T11:00:00Z", + "--tool-name", + "my_tool", + "--tool-version", + "1.0", + "--operation", + "processing", + "-i", + str(sample_files[0]), + str(sample_files[1]), + "--input-formats", + "TXT", # only 1 format for 2 inputs + "--outputs", + str(sample_files[2]), + "--output-formats", + "TXT", ] ) @@ -3089,15 +3179,25 @@ def test_mismatched_output_formats(self, prov_file, sample_files): """Test error when --output-formats count does not match --outputs.""" result = 
self._run_main( [ - "-p", str(prov_file), - "--started-at", "2024-10-15T11:00:00Z", - "--tool-name", "my_tool", - "--tool-version", "1.0", - "--operation", "processing", - "-i", str(sample_files[0]), - "--input-formats", "TXT", - "--outputs", str(sample_files[1]), str(sample_files[2]), - "--output-formats", "TXT", # only 1 format for 2 outputs + "-p", + str(prov_file), + "--started-at", + "2024-10-15T11:00:00Z", + "--tool-name", + "my_tool", + "--tool-version", + "1.0", + "--operation", + "processing", + "-i", + str(sample_files[0]), + "--input-formats", + "TXT", + "--outputs", + str(sample_files[1]), + str(sample_files[2]), + "--output-formats", + "TXT", # only 1 format for 2 outputs ] ) @@ -3111,16 +3211,26 @@ def test_overwrite_existing_output(self, prov_file, tmp_path, sample_file): # Without --overwrite should fail result = self._run_main( [ - "-p", str(prov_file), - "--started-at", "2024-10-15T11:00:00Z", - "--tool-name", "my_tool", - "--tool-version", "1.0", - "--operation", "processing", - "-i", str(sample_file), - "--input-formats", "TXT", - "--outputs", str(sample_file), - "--output-formats", "TXT", - "-o", str(output_file), + "-p", + str(prov_file), + "--started-at", + "2024-10-15T11:00:00Z", + "--tool-name", + "my_tool", + "--tool-version", + "1.0", + "--operation", + "processing", + "-i", + str(sample_file), + "--input-formats", + "TXT", + "--outputs", + str(sample_file), + "--output-formats", + "TXT", + "-o", + str(output_file), ] ) assert result == 1 @@ -3128,16 +3238,26 @@ def test_overwrite_existing_output(self, prov_file, tmp_path, sample_file): # With --overwrite should succeed result = self._run_main( [ - "-p", str(prov_file), - "--started-at", "2024-10-15T11:00:00Z", - "--tool-name", "my_tool", - "--tool-version", "1.0", - "--operation", "processing", - "-i", str(sample_file), - "--input-formats", "TXT", - "--outputs", str(sample_file), - "--output-formats", "TXT", - "-o", str(output_file), + "-p", + str(prov_file), + "--started-at", + 
"2024-10-15T11:00:00Z", + "--tool-name", + "my_tool", + "--tool-version", + "1.0", + "--operation", + "processing", + "-i", + str(sample_file), + "--input-formats", + "TXT", + "--outputs", + str(sample_file), + "--output-formats", + "TXT", + "-o", + str(output_file), "--overwrite", ] ) @@ -3147,16 +3267,26 @@ def test_capture_agent(self, prov_file, sample_file): """Test --capture-agent flag records user info.""" result = self._run_main( [ - "-p", str(prov_file), - "--started-at", "2024-10-15T11:00:00Z", - "--ended-at", "2024-10-15T11:05:30Z", - "--tool-name", "my_tool", - "--tool-version", "1.0", - "--operation", "processing", - "-i", str(sample_file), - "--input-formats", "TXT", - "--outputs", str(sample_file), - "--output-formats", "TXT", + "-p", + str(prov_file), + "--started-at", + "2024-10-15T11:00:00Z", + "--ended-at", + "2024-10-15T11:05:30Z", + "--tool-name", + "my_tool", + "--tool-version", + "1.0", + "--operation", + "processing", + "-i", + str(sample_file), + "--input-formats", + "TXT", + "--outputs", + str(sample_file), + "--output-formats", + "TXT", "--capture-agent", ] ) @@ -3170,16 +3300,26 @@ def test_capture_environment(self, prov_file, sample_file): """Test --capture-environment flag records runtime info.""" result = self._run_main( [ - "-p", str(prov_file), - "--started-at", "2024-10-15T11:00:00Z", - "--ended-at", "2024-10-15T11:05:30Z", - "--tool-name", "my_tool", - "--tool-version", "1.0", - "--operation", "processing", - "-i", str(sample_file), - "--input-formats", "TXT", - "--outputs", str(sample_file), - "--output-formats", "TXT", + "-p", + str(prov_file), + "--started-at", + "2024-10-15T11:00:00Z", + "--ended-at", + "2024-10-15T11:05:30Z", + "--tool-name", + "my_tool", + "--tool-version", + "1.0", + "--operation", + "processing", + "-i", + str(sample_file), + "--input-formats", + "TXT", + "--outputs", + str(sample_file), + "--output-formats", + "TXT", "--capture-environment", ] ) @@ -3193,16 +3333,26 @@ def 
test_user_without_capture_agent_fails(self, prov_file, sample_file): """Test that --user without --capture-agent returns error.""" result = self._run_main( [ - "-p", str(prov_file), - "--started-at", "2024-10-15T11:00:00Z", - "--tool-name", "my_tool", - "--tool-version", "1.0", - "--operation", "processing", - "-i", str(sample_file), - "--input-formats", "TXT", - "--outputs", str(sample_file), - "--output-formats", "TXT", - "--user", "alice", + "-p", + str(prov_file), + "--started-at", + "2024-10-15T11:00:00Z", + "--tool-name", + "my_tool", + "--tool-version", + "1.0", + "--operation", + "processing", + "-i", + str(sample_file), + "--input-formats", + "TXT", + "--outputs", + str(sample_file), + "--output-formats", + "TXT", + "--user", + "alice", ] ) @@ -3212,16 +3362,26 @@ def test_runtime_without_capture_environment_fails(self, prov_file, sample_file) """Test that --runtime without --capture-environment returns error.""" result = self._run_main( [ - "-p", str(prov_file), - "--started-at", "2024-10-15T11:00:00Z", - "--tool-name", "my_tool", - "--tool-version", "1.0", - "--operation", "processing", - "-i", str(sample_file), - "--input-formats", "TXT", - "--outputs", str(sample_file), - "--output-formats", "TXT", - "--runtime", "Node.js", + "-p", + str(prov_file), + "--started-at", + "2024-10-15T11:00:00Z", + "--tool-name", + "my_tool", + "--tool-version", + "1.0", + "--operation", + "processing", + "-i", + str(sample_file), + "--input-formats", + "TXT", + "--outputs", + str(sample_file), + "--output-formats", + "TXT", + "--runtime", + "Node.js", ] ) @@ -3232,16 +3392,26 @@ def test_add_multiple_steps(self, prov_file, sample_file): for i in range(3): result = self._run_main( [ - "-p", str(prov_file), - "--started-at", f"2024-10-15T1{i}:00:00Z", - "--ended-at", f"2024-10-15T1{i}:05:00Z", - "--tool-name", f"tool_{i}", - "--tool-version", "1.0", - "--operation", f"step_{i}", - "-i", str(sample_file), - "--input-formats", "TXT", - "--outputs", str(sample_file), - 
"--output-formats", "TXT", + "-p", + str(prov_file), + "--started-at", + f"2024-10-15T1{i}:00:00Z", + "--ended-at", + f"2024-10-15T1{i}:05:00Z", + "--tool-name", + f"tool_{i}", + "--tool-version", + "1.0", + "--operation", + f"step_{i}", + "-i", + str(sample_file), + "--input-formats", + "TXT", + "--outputs", + str(sample_file), + "--output-formats", + "TXT", ] ) assert result == 0