From 6a4d06a9c057b9ebf6c3e5141fa4585d4719b99a Mon Sep 17 00:00:00 2001 From: CoderDeltaLAN Date: Fri, 19 Jun 2026 21:54:14 +0100 Subject: [PATCH] feat: add dedupe baseline command --- CHANGELOG.md | 1 + README.md | 9 +++ src/agent_rules_kit/cli.py | 62 ++++++++++++++++++ src/agent_rules_kit/dedupe.py | 120 ++++++++++++++++++++++++++++++++++ tests/test_cli.py | 42 ++++++++++++ tests/test_dedupe.py | 57 ++++++++++++++++ 6 files changed, 291 insertions(+) create mode 100644 src/agent_rules_kit/dedupe.py create mode 100644 tests/test_dedupe.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 3b7ffb3..2ddeb9c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ This project has a published GitHub Release line, but no stable support or API g ### Added +- Added a read-only `dedupe` baseline command for deterministic duplicate instruction-line detection across supported instruction files. - Added an OpenSSF Scorecard evaluation record with current official workflow constraints and a deferred workflow decision. - Added a dependency graph and Dependabot settings record with manual GitHub UI evidence and deferred version-update policy. - Added a private vulnerability reporting verification record and documented that GitHub private vulnerability reporting is enabled after manual UI verification. 
diff --git a/README.md b/README.md index 398b9ee..45339d4 100644 --- a/README.md +++ b/README.md @@ -150,6 +150,7 @@ The implemented behavior includes: - provides read-only `doctor` repository diagnosis output on `main`; - provides read-only `budget` local size and context-pressure approximation output on `main`; - provides read-only `explain` output for known governance rule IDs on `main`; +- provides read-only `dedupe` duplicate instruction-line detection on `main`; - redacts supported secret-like values in supported output, including finding messages, paths, and evidence payloads; - avoids network calls; - avoids LLM calls; @@ -363,6 +364,14 @@ If root `AGENTS.md` already exists, it is backed up before replacement: PYTHONPATH=src python -m agent_rules_kit.cli budget tests/fixtures/repositories/multi-agent-overlap +### Dedupe command + +`dedupe` reports repeated instruction lines across supported instruction files: + + PYTHONPATH=src python -m agent_rules_kit.cli dedupe tests/fixtures/repositories/multi-agent-overlap + +The first baseline is conservative: it detects repeated normalized lines across files, not broad semantic duplication. + ### Explain command `explain` lists or explains known local governance rule IDs: diff --git a/src/agent_rules_kit/cli.py b/src/agent_rules_kit/cli.py index ac462d4..3fdef07 100644 --- a/src/agent_rules_kit/cli.py +++ b/src/agent_rules_kit/cli.py @@ -10,6 +10,7 @@ from agent_rules_kit import __version__ from agent_rules_kit.budget import BudgetReport, build_budget_report +from agent_rules_kit.dedupe import DedupeReport, build_dedupe_report from agent_rules_kit.discovery import InstructionFile, discover_instruction_files from agent_rules_kit.explain import ( RuleExplanation, @@ -99,6 +100,17 @@ def build_parser() -> argparse.ArgumentParser: help="Repository root to inspect. 
def _run_dedupe(repository_root: Path) -> int:
    """Run the ``dedupe`` command for one repository root.

    Discovers the supported instruction files under ``repository_root``,
    builds the duplicate-line report for them, and hands both to the console
    printer.

    Returns:
        ``2`` when discovery or report building raises ``ValueError`` (the
        redacted message is written to stderr); otherwise whatever
        ``_print_console_dedupe`` returns.
    """
    try:
        discovered = discover_instruction_files(repository_root)
        dedupe_report = build_dedupe_report(repository_root, discovered)
    except ValueError as failure:
        # Redact before printing so secret-like values never reach stderr.
        safe_message = redact_secret_like_values(str(failure))
        print(f"ERROR: {safe_message}", file=sys.stderr)
        return 2
    else:
        return _print_console_dedupe(repository_root, discovered, dedupe_report)
{report.duplicate_line_count}") + + if report.groups: + print("Duplicate groups:") + for index, group in enumerate(report.groups, start=1): + evidence = redact_secret_like_values(group.locations[0].evidence) + print(f"{index}. {evidence}") + for location in group.locations: + path = redact_secret_like_values(location.path) + print(f" - {path}:{location.line}") + print("Next step: move repeated instructions into one source of truth or import path.") + else: + print("Next step: no repeated instruction lines were detected by implemented checks.") + + return 0 + def _run_budget(repository_root: Path) -> int: try: instruction_files = discover_instruction_files(repository_root) diff --git a/src/agent_rules_kit/dedupe.py b/src/agent_rules_kit/dedupe.py new file mode 100644 index 0000000..ad38aab --- /dev/null +++ b/src/agent_rules_kit/dedupe.py @@ -0,0 +1,120 @@ +"""Deterministic duplicate instruction detection.""" + +from __future__ import annotations + +import re +from dataclasses import dataclass +from pathlib import Path + +from agent_rules_kit.discovery import InstructionFile + + +@dataclass(frozen=True, slots=True) +class DuplicateLineLocation: + """One occurrence of a duplicated instruction line.""" + + path: str + line: int + evidence: str + + +@dataclass(frozen=True, slots=True) +class DuplicateLineGroup: + """A duplicated normalized instruction line and its locations.""" + + normalized_text: str + locations: tuple[DuplicateLineLocation, ...] + + +@dataclass(frozen=True, slots=True) +class DedupeReport: + """Duplicate instruction report for supported instruction files.""" + + groups: tuple[DuplicateLineGroup, ...] 
def build_dedupe_report(
    repository_root: Path,
    instruction_files: tuple[InstructionFile, ...],
) -> DedupeReport:
    """Build a conservative exact-line duplicate report.

    Each instruction file is read as UTF-8 and every line is passed through
    ``_normalize_instruction_line``; lines for which it returns ``None`` are
    ignored. Occurrences are bucketed by normalized text, and a bucket is
    reported only when its occurrences come from more than one file. A
    reported bucket lists all of its occurrences, including repeats inside
    the same file.

    Args:
        repository_root: Directory the instruction file paths are joined onto.
        instruction_files: Instruction files to compare. Their order fixes the
            order of groups (first occurrence) and of locations in a group.

    Returns:
        A ``DedupeReport`` holding one group per cross-file duplicated line.

    Raises:
        ValueError: If an instruction file is a symlink, is not valid UTF-8,
            or cannot be read from disk.
    """
    locations_by_normalized_line: dict[str, list[DuplicateLineLocation]] = {}

    for instruction_file in instruction_files:
        text = _read_instruction_text(repository_root, instruction_file.path)

        for line_number, line_text in enumerate(text.splitlines(), start=1):
            normalized_text = _normalize_instruction_line(line_text)
            if normalized_text is None:
                continue

            locations_by_normalized_line.setdefault(normalized_text, []).append(
                DuplicateLineLocation(
                    path=instruction_file.path,
                    line=line_number,
                    evidence=line_text.strip(),
                )
            )

    # A line repeated only inside one file is not reported: at least two
    # distinct paths are required for a group.
    groups = tuple(
        DuplicateLineGroup(
            normalized_text=normalized_text,
            locations=tuple(locations),
        )
        for normalized_text, locations in locations_by_normalized_line.items()
        if len({location.path for location in locations}) > 1
    )

    return DedupeReport(groups=groups)


def _read_instruction_text(repository_root: Path, relative_path: str) -> str:
    """Return the UTF-8 text of one instruction file or raise ``ValueError``.

    ``relative_path`` is the ``path`` attribute of an instruction file
    (presumably a string relative to ``repository_root``; confirm against
    ``InstructionFile``).
    """
    file_path = repository_root / relative_path

    if file_path.is_symlink():
        raise ValueError(
            "instruction file path is a symlink and cannot be deduplicated: "
            f"{relative_path}"
        )

    try:
        return file_path.read_text(encoding="utf-8")
    except UnicodeDecodeError as error:
        raise ValueError(
            "instruction file is not valid UTF-8 and cannot be deduplicated: "
            f"{relative_path}"
        ) from error
    except OSError as error:
        # Missing, unreadable, or directory paths would otherwise escape as an
        # uncaught OSError; report them through the same ValueError channel.
        raise ValueError(
            "instruction file could not be read and cannot be deduplicated: "
            f"{relative_path}"
        ) from error