Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ This project has a published GitHub Release line, but no stable support or API g

### Added

- Added a read-only `dedupe` baseline command for deterministic duplicate instruction-line detection across supported instruction files.
- Added an OpenSSF Scorecard evaluation record with current official workflow constraints and a deferred workflow decision.
- Added a dependency graph and Dependabot settings record with manual GitHub UI evidence and deferred version-update policy.
- Added a private vulnerability reporting verification record and documented that GitHub private vulnerability reporting is enabled after manual UI verification.
Expand Down
9 changes: 9 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,7 @@ The implemented behavior includes:
- provides read-only `doctor` repository diagnosis output on `main`;
- provides read-only `budget` local size and context-pressure approximation output on `main`;
- provides read-only `explain` output for known governance rule IDs on `main`;
- provides read-only `dedupe` duplicate instruction-line detection on `main`;
- redacts supported secret-like values in supported output, including finding messages, paths, and evidence payloads;
- avoids network calls;
- avoids LLM calls;
Expand Down Expand Up @@ -363,6 +364,14 @@ If root `AGENTS.md` already exists, it is backed up before replacement:

PYTHONPATH=src python -m agent_rules_kit.cli budget tests/fixtures/repositories/multi-agent-overlap

### Dedupe command

`dedupe` reports repeated instruction lines across supported instruction files:

PYTHONPATH=src python -m agent_rules_kit.cli dedupe tests/fixtures/repositories/multi-agent-overlap

The first baseline is conservative: it detects repeated normalized lines across files, not broad semantic duplication.

### Explain command

`explain` lists or explains known local governance rule IDs:
Expand Down
62 changes: 62 additions & 0 deletions src/agent_rules_kit/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

from agent_rules_kit import __version__
from agent_rules_kit.budget import BudgetReport, build_budget_report
from agent_rules_kit.dedupe import DedupeReport, build_dedupe_report
from agent_rules_kit.discovery import InstructionFile, discover_instruction_files
from agent_rules_kit.explain import (
RuleExplanation,
Expand Down Expand Up @@ -99,6 +100,17 @@ def build_parser() -> argparse.ArgumentParser:
help="Repository root to inspect. Defaults to the current directory.",
)

dedupe_parser = subparsers.add_parser(
"dedupe",
help="Detect repeated instruction lines across supported instruction files.",
)
dedupe_parser.add_argument(
"repository",
nargs="?",
default=".",
help="Repository root to inspect. Defaults to the current directory.",
)

explain_parser = subparsers.add_parser(
"explain",
help="Explain known governance rule IDs.",
Expand Down Expand Up @@ -143,6 +155,9 @@ def main(argv: Sequence[str] | None = None) -> int:
if args.command == "budget":
return _run_budget(Path(args.repository))

if args.command == "dedupe":
return _run_dedupe(Path(args.repository))

if args.command == "explain":
return _run_explain(args.rule_id, list_rules=args.list_rules)

Expand Down Expand Up @@ -193,6 +208,53 @@ def _print_rule_explanation(explanation: RuleExplanation) -> None:
print(f"Limits: {explanation.limits}")



def _run_dedupe(repository_root: Path) -> int:
    """Run the read-only ``dedupe`` command and return its exit code.

    Instruction files are discovered under ``repository_root`` and handed to
    ``build_dedupe_report``. If either step raises ``ValueError``, the
    redacted message is written to stderr and ``2`` is returned; otherwise
    the exit code comes from ``_print_console_dedupe``.
    """
    try:
        discovered = discover_instruction_files(repository_root)
        dedupe_report = build_dedupe_report(repository_root, discovered)
    except ValueError as failure:
        # Redact before printing so secret-like values never reach the console.
        print(f"ERROR: {redact_secret_like_values(str(failure))}", file=sys.stderr)
        return 2
    else:
        return _print_console_dedupe(repository_root, discovered, dedupe_report)


def _print_console_dedupe(
    repository_root: Path,
    instruction_files: tuple[InstructionFile, ...],
    report: DedupeReport,
) -> int:
    """Print the console form of a dedupe report and return the exit code.

    Args:
        repository_root: Repository path shown (redacted) in the header line.
        instruction_files: Instruction files that were inspected.
        report: Duplicate report built for those files.

    Returns:
        ``1`` when no instruction files were supplied, otherwise ``0``
        (including when duplicate groups are reported with status ``review``).
    """
    print(f"agent-rules-kit dedupe: {redact_secret_like_values(str(repository_root))}")

    if not instruction_files:
        # Nothing to inspect: emit the fixed zero summary and stop early.
        for summary_line in (
            "Status: no_instruction_files",
            "Supported instruction files: 0",
            "Duplicate groups: 0",
            "Duplicate lines: 0",
            "Next step: add a supported agent instruction file before checking duplication.",
        ):
            print(summary_line)
        return 1

    if report.groups:
        status = "review"
    else:
        status = "ok"
    print(f"Status: {status}")
    print(f"Supported instruction files: {len(instruction_files)}")
    print(f"Duplicate groups: {report.duplicate_group_count}")
    print(f"Duplicate lines: {report.duplicate_line_count}")

    if not report.groups:
        print("Next step: no repeated instruction lines were detected by implemented checks.")
        return 0

    print("Duplicate groups:")
    for index, group in enumerate(report.groups, start=1):
        occurrences = group.locations
        # The first occurrence's text stands in for the whole group.
        evidence = redact_secret_like_values(occurrences[0].evidence)
        print(f"{index}. {evidence}")
        for location in occurrences:
            path = redact_secret_like_values(location.path)
            print(f" - {path}:{location.line}")
    print("Next step: move repeated instructions into one source of truth or import path.")
    return 0

def _run_budget(repository_root: Path) -> int:
try:
instruction_files = discover_instruction_files(repository_root)
Expand Down
120 changes: 120 additions & 0 deletions src/agent_rules_kit/dedupe.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
"""Deterministic duplicate instruction detection."""

from __future__ import annotations

import re
from dataclasses import dataclass
from pathlib import Path

from agent_rules_kit.discovery import InstructionFile


@dataclass(frozen=True, slots=True)
class DuplicateLineLocation:
"""One occurrence of a duplicated instruction line."""

path: str
line: int
evidence: str


@dataclass(frozen=True, slots=True)
class DuplicateLineGroup:
"""A duplicated normalized instruction line and its locations."""

normalized_text: str
locations: tuple[DuplicateLineLocation, ...]


@dataclass(frozen=True, slots=True)
class DedupeReport:
"""Duplicate instruction report for supported instruction files."""

groups: tuple[DuplicateLineGroup, ...]

@property
def duplicate_group_count(self) -> int:
return len(self.groups)

@property
def duplicate_line_count(self) -> int:
return sum(len(group.locations) for group in self.groups)


def build_dedupe_report(
    repository_root: Path,
    instruction_files: tuple[InstructionFile, ...],
) -> DedupeReport:
    """Build a conservative exact-line duplicate report.

    Each instruction file is read as UTF-8 and every line is normalized with
    ``_normalize_instruction_line``. A normalized line is reported as a group
    only when its occurrences span more than one distinct file path; repeats
    confined to a single file are not reported.

    Args:
        repository_root: Directory the instruction file paths are joined onto.
        instruction_files: Instruction files to scan; only ``path`` is read.

    Returns:
        A ``DedupeReport`` whose groups follow the order in which each
        normalized line was first encountered.

    Raises:
        ValueError: If an instruction file is a symlink, cannot be read from
            disk, or is not valid UTF-8.
    """
    locations_by_normalized_line: dict[str, list[DuplicateLineLocation]] = {}

    for instruction_file in instruction_files:
        file_path = repository_root / instruction_file.path

        if file_path.is_symlink():
            raise ValueError(
                "instruction file path is a symlink and cannot be deduplicated: "
                f"{instruction_file.path}"
            )

        try:
            text = file_path.read_text(encoding="utf-8")
        except UnicodeDecodeError as error:
            raise ValueError(
                "instruction file is not valid UTF-8 and cannot be deduplicated: "
                f"{instruction_file.path}"
            ) from error
        except OSError as error:
            # Permission problems, a file removed after discovery, or a path
            # that is a directory must surface as ValueError as well, so the
            # CLI reports a clean error instead of an unhandled traceback.
            raise ValueError(
                "instruction file could not be read and cannot be deduplicated: "
                f"{instruction_file.path}"
            ) from error

        for line_number, line_text in enumerate(text.splitlines(), start=1):
            normalized_text = _normalize_instruction_line(line_text)
            if normalized_text is None:
                continue

            locations_by_normalized_line.setdefault(normalized_text, []).append(
                DuplicateLineLocation(
                    path=instruction_file.path,
                    line=line_number,
                    evidence=line_text.strip(),
                )
            )

    # Keep only lines seen in at least two different files.
    groups = [
        DuplicateLineGroup(
            normalized_text=normalized_text,
            locations=tuple(locations),
        )
        for normalized_text, locations in locations_by_normalized_line.items()
        if len({location.path for location in locations}) > 1
    ]

    return DedupeReport(groups=tuple(groups))


def _normalize_instruction_line(line_text: str) -> str | None:
    """Return the comparison key for one instruction line, or ``None`` to skip it.

    Blank lines and lines starting with a code fence, ``---`` or an HTML
    comment opener are skipped. Otherwise one leading heading marker, bullet
    marker and numbered-list marker are removed (in that order), whitespace
    runs collapse to single spaces and the text is lower-cased. Keys shorter
    than 24 characters or without any alphabetic character are skipped too.
    """
    candidate = line_text.strip()
    if candidate == "" or candidate.startswith(("```", "---", "<!--")):
        return None

    # Order matters: heading marker first, then bullet, then list number.
    for marker_pattern in (r"^#{1,6}\s+", r"^[-*+]\s+", r"^\d+[.)]\s+"):
        candidate = re.sub(marker_pattern, "", candidate)

    collapsed = re.sub(r"\s+", " ", candidate)
    candidate = collapsed.strip().lower()

    has_letter = any(character.isalpha() for character in candidate)
    if len(candidate) < 24 or not has_letter:
        return None
    return candidate


# Names exported by ``from agent_rules_kit.dedupe import *``; the private
# line-normalization helper is intentionally not listed.
__all__ = [
"DedupeReport",
"DuplicateLineGroup",
"DuplicateLineLocation",
"build_dedupe_report",
]
42 changes: 42 additions & 0 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,48 @@ def test_budget_returns_two_for_invalid_repository_root(self) -> None:
self.assertEqual(exit_code, 2)
self.assertIn("ERROR: repository root does not exist:", output.getvalue())


def test_dedupe_reports_duplicate_lines(self) -> None:
    """The CLI reports one group when two files share an instruction line."""
    duplicate = "Run unit tests before opening a pull request."
    file_bodies = {
        "AGENTS.md": f"# Agent instructions\n\n- {duplicate}\n",
        "CLAUDE.md": f"# Claude instructions\n\n{duplicate}\n",
    }
    captured = io.StringIO()

    with tempfile.TemporaryDirectory() as tmp_dir:
        repo = Path(tmp_dir)
        for file_name, body in file_bodies.items():
            (repo / file_name).write_text(body, encoding="utf-8")

        with redirect_stdout(captured):
            exit_code = main(["dedupe", str(repo)])

    console = captured.getvalue()

    self.assertEqual(exit_code, 0)
    for expected_fragment in (
        "agent-rules-kit dedupe:",
        "Status: review",
        "Supported instruction files: 2",
        "Duplicate groups: 1",
        "Duplicate lines: 2",
        "AGENTS.md:3",
        "CLAUDE.md:3",
    ):
        self.assertIn(expected_fragment, console)

def test_dedupe_returns_one_when_no_instruction_files_are_found(self) -> None:
    """The CLI exits with 1 and a zero summary for the empty fixture repository."""
    empty_repository = FIXTURE_ROOT / "empty-repo"
    captured = io.StringIO()

    with redirect_stdout(captured):
        exit_code = main(["dedupe", str(empty_repository)])

    console = captured.getvalue()

    self.assertEqual(exit_code, 1)
    for expected_fragment in ("Status: no_instruction_files", "Duplicate groups: 0"):
        self.assertIn(expected_fragment, console)

def test_explain_lists_known_rules(self) -> None:
output = io.StringIO()

Expand Down
57 changes: 57 additions & 0 deletions tests/test_dedupe.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
from __future__ import annotations

import tempfile
import unittest
from pathlib import Path

from agent_rules_kit.dedupe import build_dedupe_report
from agent_rules_kit.discovery import discover_instruction_files


class DedupeTests(unittest.TestCase):
    """Unit tests for ``build_dedupe_report`` against temporary repositories."""

    def test_reports_duplicate_instruction_lines_across_files(self) -> None:
        """A line shared by two files yields one group listing both paths."""
        duplicate = "Run unit tests before opening a pull request."
        file_bodies = {
            "AGENTS.md": f"# Agent instructions\n\n- {duplicate}\n",
            "CLAUDE.md": f"# Claude instructions\n\n{duplicate}\n",
        }

        with tempfile.TemporaryDirectory() as tmp_dir:
            repo = Path(tmp_dir)
            for file_name, body in file_bodies.items():
                (repo / file_name).write_text(body, encoding="utf-8")

            report = build_dedupe_report(repo, discover_instruction_files(repo))

        self.assertEqual(report.duplicate_group_count, 1)
        self.assertEqual(report.duplicate_line_count, 2)

        first_group = report.groups[0]
        self.assertEqual(first_group.normalized_text, duplicate.lower())
        found_paths = [entry.path for entry in first_group.locations]
        self.assertEqual(found_paths, ["AGENTS.md", "CLAUDE.md"])

    def test_ignores_short_boilerplate_lines(self) -> None:
        """A short heading repeated in two files is not reported."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            repo = Path(tmp_dir)
            for file_name in ("AGENTS.md", "CLAUDE.md"):
                (repo / file_name).write_text("# Scope\n", encoding="utf-8")

            report = build_dedupe_report(repo, discover_instruction_files(repo))

        self.assertEqual(report.duplicate_group_count, 0)

    def test_rejects_symlinked_instruction_files(self) -> None:
        """A symlinked instruction file makes the scan raise ``ValueError``."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            repo = Path(tmp_dir)
            real_file = repo / "REAL.md"
            real_file.write_text(
                "Run unit tests before opening a pull request.\n",
                encoding="utf-8",
            )
            (repo / "AGENTS.md").symlink_to(real_file)

            with self.assertRaisesRegex(ValueError, "symlink"):
                build_dedupe_report(repo, discover_instruction_files(repo))


if __name__ == "__main__":
unittest.main()