diff --git a/README.md b/README.md index cdec271..28ea4a4 100644 --- a/README.md +++ b/README.md @@ -16,10 +16,16 @@ ```bash -pip install cognis-deidproof +pip install "git+https://github.com/cognis-digital/deidproof.git" deidproof scan . # → prioritized findings in seconds ``` + +## What is this? + +`deidproof` is a small, self-contained command-line tool from the Cognis suite. It does one job well, runs locally with no account or cloud service required, and is built to be easy to install and read. See the usage below for what it can do. + + ## Contents - [Why deidproof?](#why) · [Features](#features) · [Quick start](#quick-start) · [Example](#example) · [Architecture](#architecture) · [AI stack](#ai-stack) · [How it compares](#how-it-compares) · [Integrations](#integrations) · [Install anywhere](#install-anywhere) · [Related](#related) · [Contributing](#contributing) @@ -47,10 +53,56 @@ Proves your 'de-identified' export actually is de-identified, emitting a signed
↑ back to top
+ +## Domains + +**Primary domain:** Data & Privacy · **JTF MERIDIAN division:** NULLBYTE · BLUE CELL + +**Topics:** `cognis` `privacy` `data-protection` `pii` + +Part of the **Cognis Neural Suite** — 300+ source-available tools organized across 12 domains under the JTF MERIDIAN command structure. See the [suite on GitHub](https://github.com/cognis-digital) and [jtf-meridian](https://github.com/cognis-digital/jtf-meridian) for how the pieces fit together. + + + +## Install + +`deidproof` is source-available (not published to PyPI) — every method below installs +straight from GitHub. Pick whichever you prefer; the one-line scripts auto-detect +the best tool available on your machine. + +**One-liner (Linux / macOS):** +```sh +curl -fsSL https://raw.githubusercontent.com/cognis-digital/deidproof/HEAD/install.sh | sh +``` + +**One-liner (Windows PowerShell):** +```powershell +irm https://raw.githubusercontent.com/cognis-digital/deidproof/HEAD/install.ps1 | iex +``` + +**Or install manually — any one of:** +```sh +pipx install "git+https://github.com/cognis-digital/deidproof.git" # isolated (recommended) +uv tool install "git+https://github.com/cognis-digital/deidproof.git" # uv +pip install "git+https://github.com/cognis-digital/deidproof.git" # pip +``` + +**From source:** +```sh +git clone https://github.com/cognis-digital/deidproof.git +cd deidproof && pip install . +``` + +Then run: +```sh +deidproof --help +``` + + ## Quick start ```bash -pip install cognis-deidproof +pip install "git+https://github.com/cognis-digital/deidproof.git" deidproof --version deidproof scan . # scan current project deidproof scan . --format json # machine-readable diff --git a/deidproof/cli.py b/deidproof/cli.py index db6b8c8..5f92a2f 100644 --- a/deidproof/cli.py +++ b/deidproof/cli.py @@ -171,6 +171,14 @@ def main(argv: Optional[List[str]] = None) -> int: qi = _split_cols(args.quasi_identifiers) sensitive = _split_cols(args.sensitive) + # Validate numeric thresholds before hitting the CSV layer. + if args.k is not None and args.k < 1: + print(f"error: -k must be a positive integer, got {args.k}", file=sys.stderr) + return 1 + if args.l is not None and args.l < 1: + print(f"error: -l must be a positive integer, got {args.l}", file=sys.stderr) + return 1 + try: rep = analyze_csv( args.dataset, @@ -181,7 +189,16 @@ def main(argv: Optional[List[str]] = None) -> int: safe_harbor=args.safe_harbor, delimiter=args.delimiter, ) - except (FileNotFoundError, ValueError) as exc: + except (FileNotFoundError, IsADirectoryError, PermissionError) as exc: + print(f"error: {exc}", file=sys.stderr) + return 1 + except UnicodeDecodeError as exc: + print( + f"error: could not decode file as UTF-8 — {exc}", + file=sys.stderr, + ) + return 1 + except ValueError as exc: print(f"error: {exc}", file=sys.stderr) return 1 diff --git a/deidproof/core.py b/deidproof/core.py index a174206..e79e191 100644 --- a/deidproof/core.py +++ b/deidproof/core.py @@ -12,10 +12,9 @@ import csv import re -from collections import Counter, defaultdict +from collections import defaultdict from dataclasses import dataclass, field, asdict -from datetime import date, datetime -from typing import Dict, Iterable, List, Optional, Sequence, Tuple +from typing import Dict, List, Optional, Sequence, Tuple TOOL_NAME = "deidproof" TOOL_VERSION = "1.0.0" @@ -293,7 +292,7 @@ def analyze_rows( quasi_identifiers: Optional[Sequence[str]] = None, sensitive: Optional[Sequence[str]] = None, k: Optional[int] = None, - l: Optional[int] = None, + l: Optional[int] = None, # noqa: E741 safe_harbor: bool = True, max_samples: int = 3, ) -> Report: @@ -347,16 +346,40 @@ def analyze_csv( quasi_identifiers: Optional[Sequence[str]] = None, sensitive: Optional[Sequence[str]] = None, k: Optional[int] = None, - l: Optional[int] = None, + l: Optional[int] = None, # noqa: E741 safe_harbor: bool = True, delimiter: str = ",", max_samples: int = 3, ) -> Report: - """Parse a CSV file and run the full analysis.""" - with open(path, "r", newline="", encoding="utf-8-sig") as fh: - reader = csv.DictReader(fh, delimiter=delimiter) - columns = list(reader.fieldnames or []) - rows = [dict(r) for r in reader] + """Parse a CSV file and run the full analysis. + + Raises + ------ + FileNotFoundError + If *path* does not exist. + IsADirectoryError + If *path* is a directory, not a file. + PermissionError + If the process lacks read permission for *path*. + UnicodeDecodeError + If the file is not UTF-8 (or UTF-8-with-BOM) encoded. + ValueError + If *delimiter* is not a single character, or if a requested + quasi-identifier / sensitive column is absent from the dataset. + """ + if len(delimiter) != 1: + raise ValueError( + f"delimiter must be a single character, got {delimiter!r} " + f"(length {len(delimiter)})" + ) + + try: + with open(path, "r", newline="", encoding="utf-8-sig") as fh: + reader = csv.DictReader(fh, delimiter=delimiter) + columns = list(reader.fieldnames or []) + rows = [dict(r) for r in reader] + except IsADirectoryError: + raise IsADirectoryError(f"expected a CSV file but got a directory: {path!r}") _validate_columns(columns, quasi_identifiers, "quasi-identifier") _validate_columns(columns, sensitive, "sensitive") diff --git a/deidproof/mcp_server.py b/deidproof/mcp_server.py index 151a862..4cddd20 100644 --- a/deidproof/mcp_server.py +++ b/deidproof/mcp_server.py @@ -1,22 +1,31 @@ -"""DEIDPROOF MCP server — exposes scan() as an MCP tool for Cognis.Studio.""" -from __future__ import annotations -from deidproof.core import scan, to_json - -def serve() -> int: - """Start an MCP stdio server. Requires the optional 'mcp' extra: - pip install "cognis-deidproof[mcp]" - """ - try: - from mcp.server.fastmcp import FastMCP - except Exception: - print("Install the MCP extra: pip install 'cognis-deidproof[mcp]'") - return 1 - app = FastMCP("deidproof") - - @app.tool() - def deidproof_scan(target: str) -> str: - """Re-identification risk assessment that computes k-anonymity, l-diversity, and HIPAA Safe Harbor compliance on a dataset.. Returns JSON findings.""" - return to_json(scan(target)) - - app.run() - return 0 +"""DEIDPROOF MCP server — exposes deidproof_scan() as an MCP tool for Cognis.Studio.""" +from __future__ import annotations + +import json + +from deidproof.core import analyze_csv + + +def serve() -> int: + """Start an MCP stdio server. Requires the optional 'mcp' extra: + pip install "cognis-deidproof[mcp]" + """ + try: + from mcp.server.fastmcp import FastMCP + except Exception: + print("Install the MCP extra: pip install 'cognis-deidproof[mcp]'") + return 1 + app = FastMCP("deidproof") + + @app.tool() + def deidproof_scan(target: str) -> str: + """Re-identification risk assessment that computes k-anonymity, l-diversity, + and HIPAA Safe Harbor compliance on a dataset. Returns JSON findings.""" + try: + report = analyze_csv(target) + except (FileNotFoundError, IsADirectoryError, PermissionError, UnicodeDecodeError, ValueError) as exc: + return json.dumps({"error": str(exc)}) + return json.dumps(report.to_dict(), indent=2) + + app.run() + return 0 diff --git a/install.ps1 b/install.ps1 new file mode 100644 index 0000000..5271a4a --- /dev/null +++ b/install.ps1 @@ -0,0 +1,29 @@ +# Comprehensive installer for cognis-digital/deidproof (Windows PowerShell). +# Tries: pipx -> uv -> pip (git+https) -> from source. +# deidproof is source-available and not on PyPI; all paths install from GitHub. +$ErrorActionPreference = "Stop" +$Repo = "deidproof" +$Url = "git+https://github.com/cognis-digital/deidproof.git" +$Git = "https://github.com/cognis-digital/deidproof.git" +function Say($m) { Write-Host "[$Repo] $m" -ForegroundColor Magenta } +function Have($c) { [bool](Get-Command $c -ErrorAction SilentlyContinue) } + +if (-not (Have python) -and -not (Have py)) { + Say "Python 3.9+ is required but was not found. Install Python first."; exit 1 +} +if (Have pipx) { + Say "Installing with pipx (isolated, recommended)..." + pipx install $Url; if ($LASTEXITCODE -eq 0) { Say "Done. Run: deidproof"; exit 0 } +} +if (Have uv) { + Say "Installing with uv..." + uv tool install $Url; if ($LASTEXITCODE -eq 0) { Say "Done. Run: deidproof"; exit 0 } +} +if (Have pip) { + Say "Installing with pip (user site)..." + pip install --user $Url; if ($LASTEXITCODE -eq 0) { Say "Done. Run: deidproof"; exit 0 } +} +Say "No packaging tool worked; falling back to a source clone." +$Tmp = Join-Path $env:TEMP "$Repo-src" +git clone --depth 1 $Git $Tmp +Say "Cloned to $Tmp - run: cd $Tmp; python -m pip install ." diff --git a/install.sh b/install.sh index 494b880..4dca654 100644 --- a/install.sh +++ b/install.sh @@ -1,10 +1,34 @@ -#!/usr/bin/env sh -# Universal installer for deidproof. Prefers uv > pipx > pip; installs from the repo. -set -e -SRC="git+https://github.com/cognis-digital/deidproof.git" -echo "Installing deidproof ..." -if command -v uv >/dev/null 2>&1; then uv tool install "$SRC" -elif command -v pipx >/dev/null 2>&1; then pipx install "$SRC" -elif command -v python3 >/dev/null 2>&1; then python3 -m pip install --user "$SRC" -else echo "Need uv, pipx, or python3+pip"; exit 1; fi -echo "Done. Run: deidproof --help" +#!/usr/bin/env sh +# Comprehensive installer for cognis-digital/deidproof (Linux / macOS). +# Tries the best available method: pipx -> uv -> pip (git+https) -> from source. +# deidproof is source-available and not on PyPI; all paths install from GitHub. +set -eu + +REPO="deidproof" +URL="git+https://github.com/cognis-digital/deidproof.git" +GITURL="https://github.com/cognis-digital/deidproof.git" + +say() { printf '\033[1;35m[%s]\033[0m %s\n' "$REPO" "$1"; } +have() { command -v "$1" >/dev/null 2>&1; } + +if ! have python3 && ! have python; then + say "Python 3.9+ is required but was not found. Install Python first."; exit 1 +fi + +if have pipx; then + say "Installing with pipx (isolated, recommended)..." + pipx install "$URL" && { say "Done. Run: deidproof"; exit 0; } +fi +if have uv; then + say "Installing with uv..." + uv tool install "$URL" && { say "Done. Run: deidproof"; exit 0; } +fi +if have pip3 || have pip; then + PIP="$(command -v pip3 || command -v pip)" + say "Installing with pip (user site)..." + "$PIP" install --user "$URL" && { say "Done. Run: deidproof"; exit 0; } +fi + +say "No packaging tool worked; falling back to a source clone." +TMP="$(mktemp -d)"; git clone --depth 1 "$GITURL" "$TMP/$REPO" +say "Cloned to $TMP/$REPO — run: cd $TMP/$REPO && python3 -m pip install ." diff --git a/integrations/webhook.py b/integrations/webhook.py index 91e0211..9bf7258 100644 --- a/integrations/webhook.py +++ b/integrations/webhook.py @@ -5,7 +5,7 @@ Usage: scan . --format json | python integrations/webhook.py --url URL """ from __future__ import annotations -import argparse, json, sys, urllib.request +import argparse, sys, urllib.request def main() -> int: ap = argparse.ArgumentParser() diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 01ea357..fa10958 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -2,6 +2,7 @@ import os import sys +import tempfile import pytest @@ -149,3 +150,65 @@ def test_cli_version(capsys): assert exc.value.code == 0 out = capsys.readouterr().out assert TOOL_VERSION in out + + +# --------------------------------------------------------------------------- +# Hardening tests — bad input / edge cases +# --------------------------------------------------------------------------- + + +def test_cli_missing_file_exits_1(capsys): + """Requesting a non-existent CSV must exit 1 with a clear stderr message.""" + rc = main(["check", "/no/such/file.csv"]) + assert rc == 1 + err = capsys.readouterr().err + assert "error:" in err + + +def test_cli_multichar_delimiter_exits_1(capsys): + """A multi-character delimiter is not a valid CSV separator; must exit 1.""" + rc = main(["check", DEMO, "--delimiter", "TAB"]) + assert rc == 1 + err = capsys.readouterr().err + assert "delimiter" in err.lower() + + +def test_cli_negative_k_exits_1(capsys): + """Negative k is semantically invalid; must exit 1 with a clear message.""" + rc = main(["check", DEMO, "-k", "-3"]) + assert rc == 1 + err = capsys.readouterr().err + assert "error:" in err + + +def test_cli_zero_l_exits_1(capsys): + """l=0 is semantically invalid; must exit 1 with a clear message.""" + rc = main(["check", DEMO, "--sensitive", "diagnosis", "-l", "0"]) + assert rc == 1 + err = capsys.readouterr().err + assert "error:" in err + + +def test_analyze_csv_multichar_delimiter_raises(): + """analyze_csv must raise ValueError for a multi-character delimiter.""" + with pytest.raises(ValueError, match="delimiter"): + analyze_csv(DEMO, delimiter="||") + + +def test_analyze_csv_non_utf8_raises(): + """analyze_csv must propagate UnicodeDecodeError for non-UTF-8 files.""" + with tempfile.NamedTemporaryFile(mode="wb", suffix=".csv", delete=False) as fh: + fh.write(b"name,age\nJos\xe9,30\n") # Latin-1 byte in UTF-8 context + fpath = fh.name + try: + with pytest.raises(UnicodeDecodeError): + analyze_csv(fpath) + finally: + os.unlink(fpath) + + +def test_mcp_server_importable(): + """mcp_server must import without error (broken imports must not survive).""" + import importlib + mod = importlib.import_module("deidproof.mcp_server") + assert callable(mod.serve)