diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..af3d6f2 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,47 @@ +name: CI + +on: + push: + branches: [main] + pull_request: + branches: [main] + +jobs: + lint: + name: Lint + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set up Python 3.11 + uses: actions/setup-python@v5 + with: + python-version: "3.11" + cache: pip + + - name: Install dependencies + run: pip install -e ".[dev]" + + - name: Ruff + run: ruff check src/ + + - name: Mypy + run: mypy src/ + + test: + name: Test + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set up Python 3.11 + uses: actions/setup-python@v5 + with: + python-version: "3.11" + cache: pip + + - name: Install dependencies + run: pip install -e ".[dev]" + + - name: Run tests with coverage + run: pytest --cov=ter_calculator --cov-report=term-missing diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..e430342 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,15 @@ +repos: + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.8.6 + hooks: + - id: ruff + args: [--fix] + - id: ruff-format + + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v5.0.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer + - id: check-yaml + - id: check-added-large-files diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..13f6809 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,42 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +We as members, contributors, and leaders pledge to make participation in our +community a harassment-free experience for everyone, regardless of age, body +size, visible or invisible disability, ethnicity, sex characteristics, gender +identity and expression, level of experience, education, socio-economic status, +nationality, personal 
appearance, race, caste, color, religion, or sexual +identity and orientation. + +## Our Standards + +Examples of behavior that contributes to a positive environment: + +- Using welcoming and inclusive language +- Being respectful of differing viewpoints and experiences +- Gracefully accepting constructive criticism +- Focusing on what is best for the community +- Showing empathy towards other community members + +Examples of unacceptable behavior: + +- The use of sexualized language or imagery, and sexual attention or advances +- Trolling, insulting or derogatory comments, and personal or political attacks +- Public or private harassment +- Publishing others' private information without explicit permission +- Other conduct which could reasonably be considered inappropriate + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported by opening an issue on GitHub or contacting the project maintainers. + +All complaints will be reviewed and investigated promptly and fairly. Project +maintainers are obligated to respect the privacy and security of the reporter. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org), +version 2.1, available at +[https://www.contributor-covenant.org/version/2/1/code_of_conduct.html](https://www.contributor-covenant.org/version/2/1/code_of_conduct.html). diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..7325931 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,114 @@ +# Contributing to TER Calculator + +Thanks for your interest in contributing! This guide covers everything you need to get started. + +## Getting Started + +```bash +# Fork and clone the repo +git clone https://github.com/<your-username>/TER.git +cd TER + +# Install in development mode +pip install -e ".[dev]" + +# Install pre-commit hooks +pip install pre-commit +pre-commit install +``` + +Requires Python 3.11+. 
+ +## Development Workflow + +### Running Tests + +```bash +pytest # All tests +pytest tests/unit/test_classifier.py -v # Specific module +pytest --cov=ter_calculator # With coverage +``` + +### Linting and Type Checking + +```bash +ruff check src/ # Lint +ruff format src/ tests/ # Format +mypy src/ # Type check +``` + +Pre-commit hooks run ruff automatically on staged files. + +### Branch Naming + +- `feature/` -- new functionality +- `fix/` -- bug fixes +- `docs/` -- documentation changes +- `refactor/` -- code restructuring +- `test/` -- test additions or fixes + +### Commit Messages + +Use [Conventional Commits](https://www.conventionalcommits.org/): + +``` +feat: add rolling window size option to watch command +fix: correct token count for merged reasoning spans +docs: add context orchestrator usage examples +test: add unit tests for waste_detectors module +refactor: extract shared CLI argument definitions +``` + +## Pull Request Process + +1. Create a feature branch from `main` +2. Make your changes with tests +3. Ensure all checks pass: `pytest && ruff check src/ && mypy src/` +4. Open a PR against `main` with a clear description +5. 
One approval required for merge + +### PR Guidelines + +- One feature or fix per PR +- Include tests for new functionality +- Update documentation if behavior changes +- Keep PRs focused -- separate unrelated changes into different PRs + +## Code Style + +- Python 3.11+ -- use modern syntax (type unions with `|`, match statements where appropriate) +- Dataclasses for models (see `models.py`) +- Lazy imports in CLI handlers for fast startup +- Ruff handles formatting and linting -- don't fight the formatter + +## Project Structure + +``` +src/ter_calculator/ # Source modules +tests/unit/ # Unit tests +tests/features/ # BDD feature files +tests/integration/ # Integration tests +docs/ # Architecture and user documentation +sample_sessions/ # Sample JSONL files for testing +``` + +## Reporting Bugs + +Open a [GitHub Issue](https://github.com/lgriffin/TER/issues) with: + +- Steps to reproduce +- Expected vs actual behavior +- Python version and OS +- Sample session file (if applicable, redact sensitive content) + +## Requesting Features + +Open a [GitHub Issue](https://github.com/lgriffin/TER/issues) with the `enhancement` label describing: + +- The problem you're trying to solve +- Your proposed solution +- Any alternatives you've considered + +## License + +By contributing, you agree that your contributions will be licensed under the [Apache License 2.0](LICENSE). diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..6622082 --- /dev/null +++ b/LICENSE @@ -0,0 +1,191 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. 
+ + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to the Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by the Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding any notices that do not + pertain to any part of the Derivative Works, in at least one + of the 
following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + Copyright 2026 Leigh Griffin + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.md b/README.md index 0c21153..4e1e3b0 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,8 @@ # TER Calculator +[![CI](https://github.com/lgriffin/TER/actions/workflows/ci.yml/badge.svg)](https://github.com/lgriffin/TER/actions/workflows/ci.yml) +[![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](LICENSE) + Token Efficiency Ratio (TER) calculator for Claude Code sessions. Measures how efficiently an AI coding agent uses its token budget by classifying output token spans as **aligned** (contributing to intent) or **waste** (redundant reasoning, unnecessary tool calls, over-explanation), and surfaces session economics, context optimization, and cross-session consistency. ## Features @@ -349,15 +352,24 @@ See [docs/architecture.md](docs/architecture.md) for detailed diagrams and data - [Context Orchestrator](docs/context-orchestrator.md) -- patent implementation reference - [User Guide](docs/user-guide.md) -- installation, workflows, troubleshooting +## Contributing + +We welcome contributions! See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines on setting up your development environment, running tests, and submitting pull requests. + +This project follows the [Contributor Covenant Code of Conduct](CODE_OF_CONDUCT.md). 
+ ## Development ```bash -# Run tests (93 context orchestrator + 538 BDD + existing unit tests) +# Run tests pytest # Lint ruff check src/ +# Type check +mypy src/ + # Run specific test modules pytest tests/unit/test_fragment_store.py -v pytest tests/unit/test_budget_optimizer.py -v diff --git a/pyproject.toml b/pyproject.toml index b73a769..fbc39aa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,6 +6,7 @@ build-backend = "setuptools.build_meta" name = "ter-calculator" version = "0.1.0" description = "Token Efficiency Ratio calculator for Claude Code sessions" +license = "Apache-2.0" requires-python = ">=3.11" dependencies = [ "sentence-transformers>=2.2.0", @@ -21,6 +22,7 @@ dev = [ "pytest-bdd>=7.0.0", "mypy>=1.0.0", "ruff>=0.1.0", + "pre-commit>=3.0.0", ] [project.scripts] diff --git a/src/ter_calculator/cli.py b/src/ter_calculator/cli.py index 1f6c865..171f9db 100644 --- a/src/ter_calculator/cli.py +++ b/src/ter_calculator/cli.py @@ -18,129 +18,95 @@ def _setup_stdout_encoding(): ) -def main(argv: list[str] | None = None) -> int: - parser = argparse.ArgumentParser( - prog="ter", - description="Token Efficiency Ratio calculator for Claude Code sessions", - ) - parser.add_argument( - "--version", action="version", version=f"%(prog)s {__version__}" - ) +def _add_analysis_args(parser: argparse.ArgumentParser) -> None: + """Add the shared analysis arguments used by both analyze and report.""" parser.add_argument( - "--verbose", action="store_true", help="Enable verbose output" - ) - parser.add_argument( - "--quiet", action="store_true", help="Suppress non-essential output" - ) - - subparsers = parser.add_subparsers(dest="command") - - # analyze subcommand - analyze_parser = subparsers.add_parser( - "analyze", help="Analyze a Claude Code session" - ) - analyze_parser.add_argument( "session_path", nargs="?", default=None, help="Path to a JSONL session file (optional if --latest is used)" ) - analyze_parser.add_argument( + parser.add_argument( "--latest", 
action="store_true", - help="Analyze the most recent session (based on file modification time)" + help="Use the most recent session (based on file modification time)" ) - analyze_parser.add_argument( - "--format", dest="output_format", choices=["text", "json"], - default="text", help="Output format (default: text)" - ) - analyze_parser.add_argument( + parser.add_argument( "--similarity-threshold", type=float, default=0.40, help="Cosine similarity threshold for alignment (default: 0.40)" ) - analyze_parser.add_argument( + parser.add_argument( "--confidence-threshold", type=float, default=0.75, help="Classifier confidence threshold (default: 0.75)" ) - analyze_parser.add_argument( + parser.add_argument( "--restatement-threshold", type=float, default=0.85, help="Similarity threshold for context restatement (default: 0.85)" ) - analyze_parser.add_argument( + parser.add_argument( "--phase-weights", type=str, default="0.3,0.4,0.3", help="Phase weights as r,t,g (default: 0.3,0.4,0.3)" ) - analyze_parser.add_argument( + parser.add_argument( "--no-waste-patterns", action="store_true", help="Disable waste pattern detection" ) - analyze_parser.add_argument( + parser.add_argument( "--cost-model", type=str, default="sonnet", help="Cost model: 'sonnet' (default) or custom 'input,output,cache_read,cache_write' rates per MTok" ) - analyze_parser.add_argument( + parser.add_argument( "--no-input-analysis", action="store_true", help="Disable input analysis (user/model token breakdown, drift, and alignment)" ) - analyze_parser.add_argument( + parser.add_argument( "--prompt-similarity-threshold", type=float, default=0.75, help="Cosine similarity threshold for flagging redundant prompts (default: 0.75)" ) - analyze_parser.add_argument( - "--group", action="store_true", - help="Include subagent sessions in grouped analysis" - ) - analyze_parser.add_argument( + parser.add_argument( "--cost-weighted", action="store_true", help="Include cost-weighted TER analysis" ) - 
analyze_parser.add_argument( + parser.add_argument( "--check-overthinking", action="store_true", help="Analyze reasoning efficiency and detect overthinking" ) - # report — Markdown summary (same analysis pipeline as analyze) - report_parser = subparsers.add_parser( - "report", - help="Print a Markdown summary (headline metrics, calibration, top waste, next steps)", - ) - report_parser.add_argument( - "session_path", nargs="?", default=None, - help="Path to a JSONL session file (optional if --latest is used)" - ) - report_parser.add_argument( - "--latest", action="store_true", - help="Report on the most recent session (based on file modification time)" - ) - report_parser.add_argument( - "--similarity-threshold", type=float, default=0.40, - help="Cosine similarity threshold for alignment (default: 0.40)" + +def main(argv: list[str] | None = None) -> int: + parser = argparse.ArgumentParser( + prog="ter", + description="Token Efficiency Ratio calculator for Claude Code sessions", ) - report_parser.add_argument( - "--confidence-threshold", type=float, default=0.75, - help="Classifier confidence threshold (default: 0.75)" + parser.add_argument( + "--version", action="version", version=f"%(prog)s {__version__}" ) - report_parser.add_argument( - "--restatement-threshold", type=float, default=0.85, - help="Similarity threshold for context restatement (default: 0.85)" + parser.add_argument( + "--verbose", action="store_true", help="Enable verbose output" ) - report_parser.add_argument( - "--phase-weights", type=str, default="0.3,0.4,0.3", - help="Phase weights as r,t,g (default: 0.3,0.4,0.3)" + parser.add_argument( + "--quiet", action="store_true", help="Suppress non-essential output" ) - report_parser.add_argument( - "--no-waste-patterns", action="store_true", - help="Disable waste pattern detection" + + subparsers = parser.add_subparsers(dest="command") + + # analyze subcommand + analyze_parser = subparsers.add_parser( + "analyze", help="Analyze a Claude Code session" ) - 
report_parser.add_argument( - "--cost-model", type=str, default="sonnet", - help="Cost model: 'sonnet' (default) or custom rates per MTok" + _add_analysis_args(analyze_parser) + analyze_parser.add_argument( + "--format", dest="output_format", choices=["text", "json"], + default="text", help="Output format (default: text)" ) - report_parser.add_argument( - "--no-input-analysis", action="store_true", - help="Disable input analysis" + analyze_parser.add_argument( + "--group", action="store_true", + help="Include subagent sessions in grouped analysis" ) - report_parser.add_argument( - "--prompt-similarity-threshold", type=float, default=0.75, - help="Cosine similarity threshold for redundant prompts (default: 0.75)" + + # report — Markdown summary (same analysis pipeline as analyze) + report_parser = subparsers.add_parser( + "report", + help="Print a Markdown summary (headline metrics, calibration, top waste, next steps)", ) + _add_analysis_args(report_parser) report_parser.add_argument( "-o", "--output", @@ -149,14 +115,6 @@ def main(argv: list[str] | None = None) -> int: default=None, help="Write Markdown to FILE instead of stdout (e.g. report.md)", ) - report_parser.add_argument( - "--cost-weighted", action="store_true", - help="Include cost-weighted TER analysis" - ) - report_parser.add_argument( - "--check-overthinking", action="store_true", - help="Analyze reasoning efficiency and detect overthinking" - ) # compare subcommand compare_parser = subparsers.add_parser( diff --git a/src/ter_calculator/formatter.py b/src/ter_calculator/formatter.py index fd18706..51d827e 100644 --- a/src/ter_calculator/formatter.py +++ b/src/ter_calculator/formatter.py @@ -1,40 +1,44 @@ -"""Output formatting for TER results.""" +"""Output formatting for TER results. -from __future__ import annotations +Public API: format_ter_result, format_comparison, format_grouped_analysis. +Format-specific rendering lives in formatter_rich, formatter_text, formatter_json. 
+""" -import json -import io +from __future__ import annotations -from .models import CostModel, InputAnalysis, TERResult, WastePattern -from .waste import summarize_waste +from .models import CostModel, TERResult, WastePattern def format_ter_result( result: TERResult, fmt: str = "text", use_rich: bool = True, ) -> str: - """Format a TER result for output.""" if fmt == "json": - return _format_json(result) + from .formatter_json import format_json + return format_json(result) if use_rich: try: - return _format_rich(result) + from .formatter_rich import format_rich + return format_rich(result) except (ImportError, UnicodeEncodeError): pass - return _format_text(result) + from .formatter_text import format_text + return format_text(result) def format_comparison( results: list[TERResult], fmt: str = "text", use_rich: bool = True, ) -> str: - """Format multiple TER results as a comparison.""" if fmt == "json": - return _format_comparison_json(results) + from .formatter_json import format_comparison_json + return format_comparison_json(results) if use_rich: try: - return _format_comparison_rich(results) + from .formatter_rich import format_comparison_rich + return format_comparison_rich(results) except (ImportError, UnicodeEncodeError): pass - return _format_comparison_text(results) + from .formatter_text import format_comparison_text + return format_comparison_text(results) def format_grouped_analysis( @@ -43,19 +47,25 @@ def format_grouped_analysis( fmt: str = "text", use_rich: bool = True, ) -> str: - """Format a grouped parent + subagent analysis.""" if fmt == "json": - return _format_grouped_json(parent_result, subagent_results) + from .formatter_json import format_grouped_json + return format_grouped_json(parent_result, subagent_results) if use_rich: try: - return _format_grouped_rich(parent_result, subagent_results) + from .formatter_rich import format_grouped_rich + return format_grouped_rich(parent_result, subagent_results) except (ImportError, 
UnicodeEncodeError): pass - return _format_grouped_text(parent_result, subagent_results) + from .formatter_text import format_grouped_text + return format_grouped_text(parent_result, subagent_results) + + +# --------------------------------------------------------------------------- +# Shared helpers used by formatter_rich, formatter_text, formatter_json +# --------------------------------------------------------------------------- def _compute_group_aggregates(all_results: list[TERResult]) -> dict: - """Compute aggregate metrics across a group of sessions.""" total_tokens = sum(r.total_tokens for r in all_results) total_waste = sum(r.waste_tokens for r in all_results) weighted_ter = ( @@ -78,172 +88,7 @@ def _compute_group_aggregates(all_results: list[TERResult]) -> dict: } -# --- Rich formatting --- - -# Import shared components -from .rich_components import ter_color as _ter_color - - -def _format_rich(result: TERResult) -> str: - """Format TER result using Rich library.""" - from rich.console import Console - from rich.panel import Panel - from rich.table import Table - from rich.text import Text - - buf = io.StringIO() - console = Console(file=buf, force_terminal=True, width=72) - - # --- Header panel --- - ter_text = Text(f"{result.aggregate_ter:.2f}", style=_ter_color(result.aggregate_ter)) - waste_pct = (result.waste_tokens / result.total_tokens * 100) if result.total_tokens else 0 - sid = result.session_id - if len(sid) > 20: - sid = sid[:8] + "..." 
- - # Line 1: TER | Waste | Cost - line1_parts: list = [("TER: ", "bold"), ter_text] - line1_parts.append((" | ", "")) - line1_parts.append((f"Waste: {waste_pct:.1f}%", "red" if waste_pct > 10 else "")) - if result.economics: - line1_parts.append((" | ", "")) - line1_parts.append((f"Cost: ${result.economics.estimated_cost_usd:.2f}", "")) - waste_cost = _compute_waste_cost(result) - if waste_cost > 0: - line1_parts.append((" | ", "")) - line1_parts.append((f"Waste $: ${waste_cost:.2f}", "red")) - - # Line 2: Input analysis headline (if available) - ia = result.input_analysis - if ia is not None: - drift = ia.intent_drift - pra = ia.prompt_response_alignment - ps = ia.prompt_similarity - bd = ia.token_breakdown - - drift_colors = { - "convergent": "red", "divergent": "green", - "stable": "green", "mixed": "yellow", - } - d_color = drift_colors.get(drift.overall_trajectory, "") - a_color = "red" if pra.average_alignment < 0.3 else ( - "yellow" if pra.average_alignment < 0.5 else "green" - ) - r_color = "red" if ps.prompt_redundancy_score > 0.5 else ( - "yellow" if ps.prompt_redundancy_score > 0 else "green" - ) - - line2_parts: list = [ - ("Drift: ", "bold"), - (f"{drift.overall_trajectory}", d_color), - ] - if pra.pairs: - line2_parts.append((" | ", "")) - line2_parts.append(("Alignment: ", "bold")) - line2_parts.append((f"{pra.average_alignment:.2f}", a_color)) - if ps.prompt_count >= 2: - line2_parts.append((" | ", "")) - line2_parts.append(("Redundancy: ", "bold")) - line2_parts.append((f"{ps.prompt_redundancy_score:.0%}", r_color)) - line2_parts.append((" | ", "")) - line2_parts.append((f"User: {bd.user_ratio:.0%}", "dim")) - - header = Text.assemble( - *line1_parts, ("\n", ""), *line2_parts, - ) - else: - header = Text.assemble(*line1_parts) - - console.print(Panel(header, title=sid, expand=False)) - - # --- Combined scores table (phases + tokens in one) --- - table = Table(show_header=True, show_edge=True) - table.add_column("Phase", style="bold", width=12) - 
table.add_column("TER", justify="right", width=6) - table.add_column("", width=3) - table.add_column("Metric", style="bold", width=14) - table.add_column("Value", justify="right", width=12) - - phases = [ - ("Reasoning", result.phase_scores.get("reasoning", 0)), - ("Tool Use", result.phase_scores.get("tool_use", 0)), - ("Generation", result.phase_scores.get("generation", 0)), - ] - right_rows = [ - ("Output Tokens", f"{result.total_tokens:,}"), - ("Aligned", f"{result.aligned_tokens:,}"), - ("Waste", f"{result.waste_tokens:,}"), - ] - - for i in range(3): - p_name, p_score = phases[i] - p_color = _ter_color(p_score) - r_label, r_value = right_rows[i] - table.add_row( - p_name, - f"[{p_color}]{p_score:.2f}[/{p_color}]", - "", - r_label, - r_value, - ) - console.print(table) - - # --- Session economics (compact) --- - if result.economics is not None: - econ = result.economics - cache_pct = econ.cache_hit_rate * 100 - cache_color = "green" if cache_pct >= 50 else "yellow" if cache_pct >= 20 else "red" - - econ_table = Table(show_header=True, show_edge=True) - econ_table.add_column("Economics", style="bold", width=18) - econ_table.add_column("", justify="right", width=12) - econ_table.add_column("", width=3) - econ_table.add_column("Context", style="bold", width=14) - econ_table.add_column("", justify="right", width=12) - - pos = econ.positional - g = econ.input_growth - bloat_str = "[red]YES[/red]" if g.context_bloat_detected else ( - "[yellow]WATCH[/yellow]" if g.is_superlinear else "[green]NO[/green]" - ) - - left_rows = [ - ("Input Tokens", f"{econ.total_input_tokens:,}"), - ("Cache Read", f"{econ.total_cache_read_tokens:,}"), - ("Cache Hit Rate", f"[{cache_color}]{cache_pct:.1f}%[/{cache_color}]"), - ] - right_rows_e = [ - ("Growth", f"{g.growth_rate:.1f}x ({len(g.turn_input_tokens)} turns)"), - ("Bloat", bloat_str), - ("Positional", f"{pos.early_ter:.2f} / {pos.mid_ter:.2f} / {pos.late_ter:.2f}"), - ] - - for i in range(3): - l_label, l_value = left_rows[i] - 
r_label, r_value = right_rows_e[i] - econ_table.add_row(l_label, l_value, "", r_label, r_value) - console.print(econ_table) - - # --- Waste breakdown table --- - _format_waste_breakdown_rich(console, result) - - # --- Input analysis --- - if result.input_analysis is not None: - _format_input_analysis_rich(console, result.input_analysis) - - # --- Cost report --- - if result.cost_report is not None: - _format_cost_report_rich(console, result.cost_report) - - # --- Overthinking analysis --- - if result.overthinking_result is not None: - _format_overthinking_rich(console, result.overthinking_result) - - return buf.getvalue().rstrip() - - def _compute_waste_cost(result: TERResult) -> float: - """Compute total waste $ from breakdown rows (mixed input/output pricing).""" rows = _build_waste_breakdown(result) if not rows: return 0.0 @@ -256,7 +101,6 @@ def _compute_waste_cost(result: TERResult) -> float: def _pattern_pricing(pattern_type: str) -> str: - """Structural patterns whose token cost is mostly input-side context.""" if pattern_type in ( "repetitive_read", "bash_antipattern", @@ -270,19 +114,11 @@ def _pattern_pricing(pattern_type: str) -> str: def _build_waste_breakdown( result: TERResult, ) -> list[tuple[str, int, int, str]]: - """Build rows: (label, tokens, instance_count, pricing_kind). - - ``pricing_kind`` is ``output`` (billed as generation / assistant tool - JSON) or ``input`` (tool results re-sent as context). Output-priced - rows are scaled elsewhere to match API ``output_tokens`` when known. - - Skips pattern rows that duplicate classified span categories. - """ + """Build rows: (label, tokens, instance_count, pricing_kind).""" from .models import ALIGNED_LABELS rows: list[tuple[str, int, int, str]] = [] - # Classified waste: assistant spans priced as output; rare user spans as input. 
category_map = { "redundant_reasoning": "Redundant Reasoning", "unnecessary_tool_call": "Unnecessary Tool Calls", @@ -348,791 +184,7 @@ def _build_waste_breakdown( return rows -def _format_waste_breakdown_rich(console, result: TERResult) -> None: - """Render waste breakdown as a Rich table.""" - from rich.table import Table - - rows = _build_waste_breakdown(result) - if not rows: - return - - total_waste = sum(t for _, t, _, _ in rows) - cm = result.economics.cost_model if result.economics else CostModel() - - table = Table(show_header=True, show_edge=True, title="Waste Breakdown") - table.add_column("Source", style="bold", width=22) - table.add_column("Tokens", justify="right", width=10) - table.add_column("%", justify="right", width=6) - table.add_column("Cost", justify="right", width=8) - table.add_column("Count", justify="right", width=6, style="dim") - - for label, tokens, count, kind in rows: - pct = (tokens / total_waste * 100) if total_waste > 0 else 0 - rate = cm.output_rate if kind == "output" else cm.input_rate - row_cost = float(tokens) * rate / 1_000_000 - table.add_row( - label, - f"{tokens:,}", - f"{pct:.0f}%", - f"${row_cost:.4f}", - str(count), - ) - - table.add_section() - total_cost = _compute_waste_cost(result) - table.add_row( - "[bold]Total[/bold]", - f"[bold]{total_waste:,}[/bold]", - "[bold]100%[/bold]", - f"[bold]${total_cost:.4f}[/bold]", - "", - ) - console.print(table) - - -def _collapse_waste_patterns(patterns: list[WastePattern]) -> list[str]: - """Collapse waste patterns into a summary by type.""" - by_type: dict[str, list[WastePattern]] = {} - for wp in patterns: - by_type.setdefault(wp.pattern_type, []).append(wp) - - lines: list[str] = [] - for ptype, wps in by_type.items(): - label = ptype.replace("_", " ").title() - total_tokens = sum(wp.tokens_wasted for wp in wps) - count = len(wps) - if count == 1: - lines.append(f"{label}: {wps[0].description} ({total_tokens:,} tokens)") - else: - lines.append(f"{label}: {count} instances 
({total_tokens:,} tokens)") - return lines - - -def _format_comparison_rich(results: list[TERResult]) -> str: - """Format comparison using Rich table.""" - from rich.console import Console - from rich.table import Table - - buf = io.StringIO() - console = Console(file=buf, force_terminal=True, width=90) - - table = Table(title="TER Comparison", show_header=True) - table.add_column("#", justify="right", style="dim") - table.add_column("Session", style="bold") - table.add_column("TER", justify="right") - table.add_column("Waste%", justify="right") - table.add_column("Cache%", justify="right") - table.add_column("Cost", justify="right") - table.add_column("Waste $", justify="right") - table.add_column("Patterns", justify="right") - - for i, r in enumerate(results, 1): - color = _ter_color(r.aggregate_ter) - pattern_count = len(r.waste_patterns) if r.waste_patterns else 0 - waste_pct = (r.waste_tokens / r.total_tokens * 100) if r.total_tokens else 0 - cache_str = "" - cost_str = "" - waste_cost_str = "" - if r.economics: - cache_pct = r.economics.cache_hit_rate * 100 - cache_str = f"{cache_pct:.0f}%" - cost_str = f"${r.economics.estimated_cost_usd:.2f}" - wc = _compute_waste_cost(r) - waste_cost_str = f"[red]${wc:.2f}[/red]" - sid = r.session_id - if len(sid) > 20: - sid = sid[:8] + "..." 
- table.add_row( - str(i), - sid, - f"[{color}]{r.aggregate_ter:.2f}[/{color}]", - f"{waste_pct:.1f}%", - cache_str, - cost_str, - waste_cost_str, - str(pattern_count), - ) - - console.print(table) - - if results: - avg_ter = sum(r.aggregate_ter for r in results) / len(results) - total_cost = sum(r.economics.estimated_cost_usd for r in results if r.economics) - total_waste_cost = sum(_compute_waste_cost(r) for r in results) - color = _ter_color(avg_ter) - console.print(f"\nAverage TER: [{color}]{avg_ter:.2f}[/{color}] | Total Cost: ${total_cost:.2f} | Total Waste: [red]${total_waste_cost:.2f}[/red]") - - return buf.getvalue().rstrip() - - -def _format_grouped_rich( - parent_result: TERResult, - subagent_results: list[TERResult], -) -> str: - """Format grouped analysis using Rich.""" - from rich.console import Console - from rich.panel import Panel - from rich.table import Table - from rich.text import Text - - all_results = [parent_result] + subagent_results - agg = _compute_group_aggregates(all_results) - - buf = io.StringIO() - console = Console(file=buf, force_terminal=True, width=90) - - # Header panel. - sid = parent_result.session_id - if len(sid) > 20: - sid = sid[:8] + "..." - ter_text = Text(f"{agg['weighted_ter']:.2f}", style=_ter_color(agg["weighted_ter"])) - - header = Text.assemble( - ("TER: ", "bold"), ter_text, - (" | ", ""), - (f"Waste: {agg['waste_pct']:.1f}%", "red" if agg["waste_pct"] > 10 else ""), - (" | ", ""), - (f"Cost: ${agg['total_cost_usd']:.2f}", ""), - (" | ", ""), - (f"Waste $: ${agg['total_waste_cost_usd']:.2f}", "red"), - ("\n", ""), - (f"Sessions: 1 parent + {len(subagent_results)} subagent(s)", "dim"), - (" | ", ""), - (f"Tokens: {agg['total_tokens']:,}", "dim"), - ) - console.print(Panel(header, title=f"Group: {sid}", expand=False)) - - # Per-session table. 
- table = Table(show_header=True, title="Session Breakdown") - table.add_column("Role", width=10) - table.add_column("Session", width=14) - table.add_column("TER", justify="right", width=6) - table.add_column("Waste%", justify="right", width=7) - table.add_column("Tokens", justify="right", width=10) - table.add_column("Cost", justify="right", width=8) - table.add_column("Waste $", justify="right", width=8) - table.add_column("Patterns", justify="right", width=8) - - def _add_session_row(r: TERResult, role: str): - color = _ter_color(r.aggregate_ter) - waste_pct = (r.waste_tokens / r.total_tokens * 100) if r.total_tokens else 0 - cost_str = f"${r.economics.estimated_cost_usd:.2f}" if r.economics else "" - wc = _compute_waste_cost(r) - waste_str = f"[red]${wc:.2f}[/red]" if wc > 0 else "" - pattern_count = len(r.waste_patterns) if r.waste_patterns else 0 - rsid = r.session_id - if len(rsid) > 14: - rsid = rsid[:8] + "..." - table.add_row( - role, rsid, - f"[{color}]{r.aggregate_ter:.2f}[/{color}]", - f"{waste_pct:.1f}%", - f"{r.total_tokens:,}", - cost_str, waste_str, str(pattern_count), - ) - - _add_session_row(parent_result, "parent") - for r in subagent_results: - _add_session_row(r, "agent") - - # Total row. 
- table.add_section() - color = _ter_color(agg["weighted_ter"]) - table.add_row( - "[bold]Total[/bold]", "", - f"[bold][{color}]{agg['weighted_ter']:.2f}[/{color}][/bold]", - f"[bold]{agg['waste_pct']:.1f}%[/bold]", - f"[bold]{agg['total_tokens']:,}[/bold]", - f"[bold]${agg['total_cost_usd']:.2f}[/bold]", - f"[bold][red]${agg['total_waste_cost_usd']:.2f}[/red][/bold]", - "", - ) - console.print(table) - - return buf.getvalue().rstrip() - - -def _format_grouped_text( - parent_result: TERResult, - subagent_results: list[TERResult], -) -> str: - """Format grouped analysis as plain text.""" - all_results = [parent_result] + subagent_results - agg = _compute_group_aggregates(all_results) - - sid = parent_result.session_id - if len(sid) > 20: - sid = sid[:8] + "..." - - lines = [ - f"Group Analysis: {sid}", - "\u2550" * 50, - "", - f"TER: {agg['weighted_ter']:.2f} | Waste: {agg['waste_pct']:.1f}%" - f" | Cost: ${agg['total_cost_usd']:.2f}" - f" | Waste $: ${agg['total_waste_cost_usd']:.2f}", - f"Sessions: 1 parent + {len(subagent_results)} subagent(s) | Tokens: {agg['total_tokens']:,}", - "", - f" {'Role':<10} {'Session':<14} {'TER':<6} {'Waste%':<8} {'Tokens':<10} {'Cost':<10} {'Waste $':<10} {'Patterns':<8}", - ] - - def _add_row(r: TERResult, role: str): - rsid = r.session_id[:14] if len(r.session_id) <= 14 else r.session_id[:8] + "..." 
- waste_pct = (r.waste_tokens / r.total_tokens * 100) if r.total_tokens else 0 - cost_str = f"${r.economics.estimated_cost_usd:.2f}" if r.economics else "" - wc = _compute_waste_cost(r) - waste_str = f"${wc:.2f}" if wc > 0 else "" - pattern_count = len(r.waste_patterns) if r.waste_patterns else 0 - lines.append( - f" {role:<10} {rsid:<14} {r.aggregate_ter:<6.2f} " - f"{waste_pct:<8.1f} {r.total_tokens:<10,} {cost_str:<10} {waste_str:<10} {pattern_count:<8}" - ) - - _add_row(parent_result, "[parent]") - for r in subagent_results: - _add_row(r, "[agent]") - - lines.extend([ - "", - f" {'Total':<10} {'':<14} {agg['weighted_ter']:<6.2f} " - f"{agg['waste_pct']:<8.1f} {agg['total_tokens']:<10,} " - f"${agg['total_cost_usd']:<9.2f} ${agg['total_waste_cost_usd']:<9.2f}", - ]) - - return "\n".join(lines) - - -def _format_grouped_json( - parent_result: TERResult, - subagent_results: list[TERResult], -) -> str: - """Format grouped analysis as JSON.""" - all_results = [parent_result] + subagent_results - agg = _compute_group_aggregates(all_results) - - data = { - "group": { - "parent_session_id": parent_result.session_id, - "subagent_count": len(subagent_results), - **agg, - }, - "parent": _ter_result_to_dict(parent_result), - "subagents": [_ter_result_to_dict(r) for r in subagent_results], - } - return json.dumps(data, indent=2) - - -def _format_input_analysis_rich(console, ia: InputAnalysis) -> None: - """Render input analysis section using Rich.""" - from rich.table import Table - - bd = ia.token_breakdown - ps = ia.prompt_similarity - - # Token breakdown table. 
- console.print("\n[bold]Input Analysis[/bold]") - tb = Table(show_header=True, show_edge=True) - tb.add_column("Origin", style="bold", width=14) - tb.add_column("Category", width=16) - tb.add_column("Tokens", justify="right", width=10) - - tb.add_row("User", "Prompt Text", f"{bd.user_input_tokens:,}") - tb.add_row("User", "Tool Results", f"{bd.user_result_tokens:,}") - tb.add_row("Model", "Reasoning", f"{bd.model_reasoning_tokens:,}") - tb.add_row("Model", "Tool Calls", f"{bd.model_tool_tokens:,}") - tb.add_row("Model", "Generation", f"{bd.model_generation_tokens:,}") - tb.add_section() - tb.add_row("[bold]User Total[/bold]", "", f"[bold]{bd.total_user_tokens:,}[/bold]") - tb.add_row("[bold]Model Total[/bold]", "", f"[bold]{bd.total_model_tokens:,}[/bold]") - tb.add_row("User Ratio", "", f"{bd.user_ratio:.1%}") - console.print(tb) - - # Prompt similarity. - if ps.prompt_count >= 2: - r_color = "red" if ps.prompt_redundancy_score > 0.5 else ( - "yellow" if ps.prompt_redundancy_score > 0 else "green" - ) - console.print( - f"\nPrompt Redundancy: [{r_color}]{ps.prompt_redundancy_score:.0%}[/{r_color}]" - f" ({ps.prompt_count} prompts, {len(ps.similar_pairs)} similar pair(s))" - ) - for pair in ps.similar_pairs[:5]: - a_text = pair.prompt_a_text[:40] + "..." if len(pair.prompt_a_text) > 40 else pair.prompt_a_text - b_text = pair.prompt_b_text[:40] + "..." if len(pair.prompt_b_text) > 40 else pair.prompt_b_text - console.print( - f' [dim]#{pair.prompt_a_index+1}[/dim] "{a_text}" ' - f'[dim]~[/dim] [dim]#{pair.prompt_b_index+1}[/dim] "{b_text}" ' - f'[yellow]({pair.similarity:.2f})[/yellow]' - ) - - # Intent drift. 
- drift = ia.intent_drift - if drift.steps: - _drift_colors = { - "convergent": "red", "divergent": "green", - "stable": "green", "mixed": "yellow", - } - t_color = _drift_colors.get(drift.overall_trajectory, "") - console.print( - f"\nIntent Drift: [{t_color}]{drift.overall_trajectory}[/{t_color}]" - f" (avg similarity: {drift.average_drift:.2f})" - ) - for step in drift.steps: - s_color = "red" if step.drift_type == "convergent" else ( - "green" if step.drift_type == "divergent" else "yellow" - ) - console.print( - f" #{step.from_index+1} -> #{step.to_index+1}: " - f"[{s_color}]{step.drift_type}[/{s_color}] ({step.similarity:.2f})" - ) - - # Prompt-response alignment. - pra = ia.prompt_response_alignment - if pra.pairs: - a_color = "red" if pra.average_alignment < 0.3 else ( - "yellow" if pra.average_alignment < 0.5 else "green" - ) - console.print( - f"\nPrompt-Response Alignment: [{a_color}]{pra.average_alignment:.2f}[/{a_color}]" - f" ({len(pra.pairs)} pair(s), {pra.low_alignment_count} low)" - ) - for pair in pra.pairs: - p_color = "red" if pair.alignment < 0.3 else ( - "yellow" if pair.alignment < 0.5 else "green" - ) - prompt_short = pair.prompt_text[:50] + "..." 
if len(pair.prompt_text) > 50 else pair.prompt_text - console.print( - f' [dim]#{pair.prompt_index+1}[/dim] "{prompt_short}" ' - f'-> [{p_color}]{pair.alignment:.2f}[/{p_color}]' - ) - - -def _format_cost_report_rich(console, cost_report) -> None: - """Render cost report section using Rich.""" - from rich.table import Table - - console.print("\n[bold]Cost Analysis[/bold]") - - # Cost-weighted TER table - cost_table = Table(show_header=True, show_edge=True) - cost_table.add_column("Metric", style="cyan", width=20) - cost_table.add_column("Value", justify="right", width=16) - - cwter = cost_report.cost_ter - cost_table.add_row("Cost-Weighted TER", f"{cwter.cost_weighted_ter:.4f}") - cost_table.add_row("Raw TER", f"{cwter.raw_ter:.4f}") - cost_table.add_row("Total Cost", f"${cwter.total_cost_usd:.4f}") - cost_table.add_row("Waste Cost", f"${cwter.waste_cost_usd:.4f}") - waste_pct = (cwter.waste_cost_usd / cwter.total_cost_usd * 100) if cwter.total_cost_usd > 0 else 0 - cost_table.add_row("Waste %", f"{waste_pct:.1f}%") - cost_table.add_row("Semantic Density", f"{cost_report.session_density.density_score:.2%}") - cost_table.add_row("Redundancy", f"{cost_report.session_density.redundancy_ratio:.2%}") - - console.print(cost_table) - - # Recommendations - if cost_report.recommendations: - console.print("\n[bold]Recommendations:[/bold]") - for rec in cost_report.recommendations: - console.print(f" • {rec}") - - -def _format_overthinking_rich(console, ot) -> None: - """Render overthinking analysis section using Rich.""" - from rich.table import Table - - console.print("\n[bold]Overthinking Analysis[/bold]") - - # Status - status_color = "red" if ot.is_overthinking else "green" - status_text = "OVERTHINKING DETECTED" if ot.is_overthinking else "Efficient Reasoning" - console.print(f"Status: [{status_color}]{status_text}[/{status_color}]") - - # Overthinking metrics table - ot_table = Table(show_header=True, show_edge=True) - ot_table.add_column("Metric", style="cyan", 
width=20) - ot_table.add_column("Value", justify="right", width=16) - - ot_table.add_row("Total Reasoning", f"{ot.total_reasoning_tokens:,} tokens") - ot_table.add_row("Useful", f"{ot.useful_reasoning_tokens:,} tokens") - ot_table.add_row("Efficiency", f"{ot.reasoning_efficiency:.0%}") - ot_table.add_row("Wasted", f"{ot.wasted_reasoning_tokens:,} tokens") - - if ot.optimal_cutoff_index is not None: - ot_table.add_row("Optimal Cutoff", f"Span {ot.optimal_cutoff_index} (of {len(ot.segments)})") - - ot_table.add_row("Recommended Budget", f"{ot.recommended_budget:,} tokens") - - console.print(ot_table) - - # Explanation - console.print(f"\n{ot.explanation}") - - -# --- Plain text formatting --- - - -def _format_text(result: TERResult) -> str: - """Format TER result as plain text.""" - waste_pct = (result.waste_tokens / result.total_tokens * 100) if result.total_tokens else 0 - sid = result.session_id - if len(sid) > 20: - sid = sid[:8] + "..." - - lines = [ - f"TER Report: {sid}", - "\u2550" * 40, - "", - ] - - # Headline. - cost_str = "" - if result.economics: - cost_str = f" | Cost: ${result.economics.estimated_cost_usd:.2f}" - waste_cost = _compute_waste_cost(result) - if waste_cost > 0: - cost_str += f" | Waste $: ${waste_cost:.2f}" - lines.append(f"TER: {result.aggregate_ter:.2f} | Waste: {waste_pct:.1f}%{cost_str}") - - # Input analysis headline. - ia = result.input_analysis - if ia is not None: - drift = ia.intent_drift - pra = ia.prompt_response_alignment - ps = ia.prompt_similarity - parts = [f"Drift: {drift.overall_trajectory}"] - if pra.pairs: - parts.append(f"Alignment: {pra.average_alignment:.2f}") - if ps.prompt_count >= 2: - parts.append(f"Redundancy: {ps.prompt_redundancy_score:.0%}") - parts.append(f"User: {ia.token_breakdown.user_ratio:.0%}") - lines.append(" | ".join(parts)) - - lines.append("") - - # Phases. 
- lines.append("Phases: Reasoning Tool Use Generation") - lines.append( - f" {result.phase_scores.get('reasoning', 0):.2f}" - f" {result.phase_scores.get('tool_use', 0):.2f}" - f" {result.phase_scores.get('generation', 0):.2f}" - ) - lines.append("") - - # Tokens. - lines.append(f"Output Tokens: {result.total_tokens:,} (aligned: {result.aligned_tokens:,} waste: {result.waste_tokens:,})") - - # Economics. - if result.economics is not None: - econ = result.economics - cache_pct = econ.cache_hit_rate * 100 - pos = econ.positional - g = econ.input_growth - - lines.extend([ - "", - f"Input: {econ.total_input_tokens:,} Cache Read: {econ.total_cache_read_tokens:,} Cache Hit: {cache_pct:.1f}%", - f"Context Growth: {g.growth_rate:.1f}x over {len(g.turn_input_tokens)} turns" - + (" [BLOAT]" if g.context_bloat_detected else (" [WATCH]" if g.is_superlinear else "")), - f"Positional TER: {pos.early_ter:.2f} (early) / {pos.mid_ter:.2f} (mid) / {pos.late_ter:.2f} (late)", - ]) - - # Waste breakdown. - rows = _build_waste_breakdown(result) - if rows: - total_waste = sum(t for _, t, _, _ in rows) - cm = result.economics.cost_model if result.economics else CostModel() - lines.extend(["", "Waste Breakdown:"]) - lines.append(f" {'Source':<24} {'Tokens':>10} {'%':>5} {'Cost':>10} {'Count':>6}") - for label, tokens, count, kind in rows: - pct = (tokens / total_waste * 100) if total_waste > 0 else 0 - rate = cm.output_rate if kind == "output" else cm.input_rate - row_cost = float(tokens) * rate / 1_000_000 - lines.append( - f" {label:<24} {tokens:>10,} {pct:>4.0f}% ${row_cost:>8.4f} {count:>6}" - ) - total_cost = _compute_waste_cost(result) - lines.append(f" {'Total':<24} {total_waste:>10,} 100% ${total_cost:>8.4f}") - - # Input analysis. 
- if result.input_analysis is not None: - lines.extend(_format_input_analysis_text(result.input_analysis)) - - return "\n".join(lines) - - -def _format_comparison_text(results: list[TERResult]) -> str: - """Format comparison as a plain text table.""" - lines = [ - "TER Comparison", - "\u2550" * 40, - "", - f" {'#':<3} {'Session':<12} {'TER':<6} {'Waste%':<8} {'Cache%':<8} {'Cost':<10} {'Waste $':<10} {'Patterns':<8}", - ] - - for i, r in enumerate(results, 1): - sid = r.session_id[:12] if len(r.session_id) <= 12 else r.session_id[:8] + "..." - pattern_count = len(r.waste_patterns) if r.waste_patterns else 0 - waste_pct = (r.waste_tokens / r.total_tokens * 100) if r.total_tokens else 0 - cache_str = "" - cost_str = "" - waste_cost_str = "" - if r.economics: - cache_pct = r.economics.cache_hit_rate * 100 - cache_str = f"{cache_pct:.0f}%" - cost_str = f"${r.economics.estimated_cost_usd:.2f}" - wc = _compute_waste_cost(r) - waste_cost_str = f"${wc:.2f}" - lines.append( - f" {i:<3} {sid:<12} {r.aggregate_ter:<6.2f} " - f"{waste_pct:<8.1f} {cache_str:<8} {cost_str:<10} {waste_cost_str:<10} {pattern_count:<8}" - ) - - if results: - avg_ter = sum(r.aggregate_ter for r in results) / len(results) - total_cost = sum(r.economics.estimated_cost_usd for r in results if r.economics) - total_waste_cost = sum(_compute_waste_cost(r) for r in results) - lines.extend(["", f"Average TER: {avg_ter:.2f} | Total Cost: ${total_cost:.2f} | Total Waste: ${total_waste_cost:.2f}"]) - - return "\n".join(lines) - - -def _format_input_analysis_text(ia: InputAnalysis) -> list[str]: - """Format input analysis as plain text lines.""" - bd = ia.token_breakdown - ps = ia.prompt_similarity - - lines = [ - "", - "Input Analysis:", - f" User Tokens: {bd.total_user_tokens:,} (prompt: {bd.user_input_tokens:,}, tool results: {bd.user_result_tokens:,})", - f" Model Tokens: {bd.total_model_tokens:,} (reasoning: {bd.model_reasoning_tokens:,}, tool: {bd.model_tool_tokens:,}, generation: 
{bd.model_generation_tokens:,})", - f" User Ratio: {bd.user_ratio:.1%}", - ] - - if ps.prompt_count >= 2: - lines.append(f" Prompt Redundancy: {ps.prompt_redundancy_score:.0%} ({ps.prompt_count} prompts, {len(ps.similar_pairs)} similar pair(s))") - for pair in ps.similar_pairs[:5]: - a_text = pair.prompt_a_text[:40] + "..." if len(pair.prompt_a_text) > 40 else pair.prompt_a_text - b_text = pair.prompt_b_text[:40] + "..." if len(pair.prompt_b_text) > 40 else pair.prompt_b_text - lines.append(f' #{pair.prompt_a_index+1} "{a_text}" ~ #{pair.prompt_b_index+1} "{b_text}" ({pair.similarity:.2f})') - - # Intent drift. - drift = ia.intent_drift - if drift.steps: - lines.append(f" Intent Drift: {drift.overall_trajectory} (avg similarity: {drift.average_drift:.2f})") - for step in drift.steps: - lines.append(f" #{step.from_index+1} -> #{step.to_index+1}: {step.drift_type} ({step.similarity:.2f})") - - # Prompt-response alignment. - pra = ia.prompt_response_alignment - if pra.pairs: - lines.append(f" Prompt-Response Alignment: {pra.average_alignment:.2f} ({len(pra.pairs)} pair(s), {pra.low_alignment_count} low)") - for pair in pra.pairs: - prompt_short = pair.prompt_text[:50] + "..." 
if len(pair.prompt_text) > 50 else pair.prompt_text - marker = " [LOW]" if pair.alignment < 0.3 else "" - lines.append(f' #{pair.prompt_index+1} "{prompt_short}" -> {pair.alignment:.2f}{marker}') - - return lines - - -# --- JSON formatting --- - - -def _format_json(result: TERResult) -> str: - """Format TER result as JSON.""" - data = _ter_result_to_dict(result) - return json.dumps(data, indent=2) - - -def _format_comparison_json(results: list[TERResult]) -> str: - """Format comparison as JSON.""" - data = { - "sessions": [_ter_result_to_dict(r) for r in results], - "average_ter": round( - sum(r.aggregate_ter for r in results) / len(results), 4 - ) if results else 0.0, - } - return json.dumps(data, indent=2) - - -def _ter_result_to_dict(result: TERResult) -> dict: - """Convert TERResult to a JSON-serializable dict.""" - data: dict = { - "session_id": result.session_id, - "aggregate_ter": result.aggregate_ter, - "raw_ratio": result.raw_ratio, - "phase_scores": result.phase_scores, - "total_tokens": result.total_tokens, - "aligned_tokens": result.aligned_tokens, - "waste_tokens": result.waste_tokens, - } - if result.intent: - data["intent_confidence"] = result.intent.confidence - if result.waste_patterns: - data["waste_patterns"] = [ - { - "type": wp.pattern_type, - "start_position": wp.start_position, - "end_position": wp.end_position, - "spans_involved": wp.spans_involved, - "tokens_wasted": wp.tokens_wasted, - "description": wp.description, - } - for wp in result.waste_patterns - ] - if result.classified_spans: - summary = summarize_waste(result.classified_spans, result.waste_patterns or []) - data["waste_summary"] = { - "total_waste_tokens": summary["total_waste_tokens"], - "waste_by_category": summary["waste_by_category"], - "waste_by_phase": summary["waste_by_phase"], - "top_patterns": summary["top_patterns"], - "explanation": summary["explanation"], - } - rows = _build_waste_breakdown(result) - if rows: - total_waste = sum(t for _, t, _, _ in rows) - cm = 
result.economics.cost_model if result.economics else CostModel() - sources = [] - for label, tokens, count, kind in rows: - rate = cm.output_rate if kind == "output" else cm.input_rate - row_cost = float(tokens) * rate / 1_000_000 - sources.append({ - "source": label, - "tokens": tokens, - "percentage": round(tokens / total_waste * 100, 1) if total_waste > 0 else 0, - "cost_usd": round(row_cost, 6), - "count": count, - "pricing": kind, - }) - data["waste_breakdown"] = { - "sources": sources, - "total_tokens": total_waste, - "total_cost_usd": round(_compute_waste_cost(result), 6), - } - if result.economics is not None: - econ = result.economics - data["economics"] = { - "total_input_tokens": econ.total_input_tokens, - "total_output_tokens": econ.total_output_tokens, - "total_cache_creation_tokens": econ.total_cache_creation_tokens, - "total_cache_read_tokens": econ.total_cache_read_tokens, - "input_output_ratio": econ.input_output_ratio, - "cache_hit_rate": econ.cache_hit_rate, - "estimated_cost_usd": econ.estimated_cost_usd, - "estimated_waste_cost_usd": econ.estimated_waste_cost_usd, - "cost_model": { - "input_rate": econ.cost_model.input_rate, - "output_rate": econ.cost_model.output_rate, - "cache_read_rate": econ.cost_model.cache_read_rate, - "cache_write_rate": econ.cost_model.cache_write_rate, - }, - "positional": { - "early_ter": econ.positional.early_ter, - "mid_ter": econ.positional.mid_ter, - "late_ter": econ.positional.late_ter, - "early_span_count": econ.positional.early_span_count, - "mid_span_count": econ.positional.mid_span_count, - "late_span_count": econ.positional.late_span_count, - }, - "input_growth": { - "turn_input_tokens": econ.input_growth.turn_input_tokens, - "growth_rate": econ.input_growth.growth_rate, - "is_superlinear": econ.input_growth.is_superlinear, - "context_bloat_detected": econ.input_growth.context_bloat_detected, - }, - } - if result.input_analysis is not None: - ia = result.input_analysis - bd = ia.token_breakdown - ps = 
ia.prompt_similarity - data["input_analysis"] = { - "token_breakdown": { - "user_input_tokens": bd.user_input_tokens, - "user_result_tokens": bd.user_result_tokens, - "model_reasoning_tokens": bd.model_reasoning_tokens, - "model_tool_tokens": bd.model_tool_tokens, - "model_generation_tokens": bd.model_generation_tokens, - "total_user_tokens": bd.total_user_tokens, - "total_model_tokens": bd.total_model_tokens, - "user_ratio": bd.user_ratio, - }, - "prompt_similarity": { - "prompt_count": ps.prompt_count, - "prompt_redundancy_score": ps.prompt_redundancy_score, - "similar_pairs": [ - { - "prompt_a_index": p.prompt_a_index, - "prompt_b_index": p.prompt_b_index, - "similarity": p.similarity, - "prompt_a_text": p.prompt_a_text, - "prompt_b_text": p.prompt_b_text, - } - for p in ps.similar_pairs - ], - }, - "intent_drift": { - "overall_trajectory": ia.intent_drift.overall_trajectory, - "average_drift": ia.intent_drift.average_drift, - "steps": [ - { - "from_index": s.from_index, - "to_index": s.to_index, - "similarity": s.similarity, - "drift_type": s.drift_type, - } - for s in ia.intent_drift.steps - ], - }, - "prompt_response_alignment": { - "average_alignment": ia.prompt_response_alignment.average_alignment, - "low_alignment_count": ia.prompt_response_alignment.low_alignment_count, - "pairs": [ - { - "prompt_index": p.prompt_index, - "prompt_text": p.prompt_text, - "response_text": p.response_text, - "alignment": p.alignment, - } - for p in ia.prompt_response_alignment.pairs - ], - }, - } - if result.cost_report is not None: - cr = result.cost_report - data["cost_report"] = { - "cost_weighted_ter": cr.cost_ter.cost_weighted_ter, - "raw_ter": cr.cost_ter.raw_ter, - "total_cost_usd": cr.cost_ter.total_cost_usd, - "waste_cost_usd": cr.cost_ter.waste_cost_usd, - "savings_if_perfect": cr.cost_ter.savings_if_perfect, - "semantic_density": { - "density_score": cr.session_density.density_score, - "vocabulary_richness": cr.session_density.vocabulary_richness, - 
"information_entropy": cr.session_density.information_entropy, - "redundancy_ratio": cr.session_density.redundancy_ratio, - }, - "recommendations": cr.recommendations, - "model_tier": cr.model_tier, - } - if result.overthinking_result is not None: - ot = result.overthinking_result - data["overthinking_analysis"] = { - "is_overthinking": ot.is_overthinking, - "total_reasoning_tokens": ot.total_reasoning_tokens, - "useful_reasoning_tokens": ot.useful_reasoning_tokens, - "wasted_reasoning_tokens": ot.wasted_reasoning_tokens, - "reasoning_efficiency": ot.reasoning_efficiency, - "optimal_cutoff_index": ot.optimal_cutoff_index, - "recommended_budget": ot.recommended_budget, - "explanation": ot.explanation, - } - return data - - def _format_waste_pattern(wp: WastePattern) -> str: - """Format a single waste pattern for text display.""" pos = ( f"spans {wp.start_position}-{wp.end_position}" if wp.start_position != wp.end_position diff --git a/src/ter_calculator/formatter_json.py b/src/ter_calculator/formatter_json.py new file mode 100644 index 0000000..b02a388 --- /dev/null +++ b/src/ter_calculator/formatter_json.py @@ -0,0 +1,221 @@ +"""JSON formatting for TER results.""" + +from __future__ import annotations + +import json + +from .models import TERResult +from .waste import summarize_waste + + +def format_json(result: TERResult) -> str: + data = ter_result_to_dict(result) + return json.dumps(data, indent=2) + + +def format_comparison_json(results: list[TERResult]) -> str: + data = { + "sessions": [ter_result_to_dict(r) for r in results], + "average_ter": round( + sum(r.aggregate_ter for r in results) / len(results), 4 + ) if results else 0.0, + } + return json.dumps(data, indent=2) + + +def format_grouped_json( + parent_result: TERResult, + subagent_results: list[TERResult], +) -> str: + from .formatter import _compute_group_aggregates + + all_results = [parent_result] + subagent_results + agg = _compute_group_aggregates(all_results) + + data = { + "group": { + 
"parent_session_id": parent_result.session_id, + "subagent_count": len(subagent_results), + **agg, + }, + "parent": ter_result_to_dict(parent_result), + "subagents": [ter_result_to_dict(r) for r in subagent_results], + } + return json.dumps(data, indent=2) + + +def ter_result_to_dict(result: TERResult) -> dict: + from .formatter import _build_waste_breakdown, _compute_waste_cost + from .models import CostModel + + data: dict = { + "session_id": result.session_id, + "aggregate_ter": result.aggregate_ter, + "raw_ratio": result.raw_ratio, + "phase_scores": result.phase_scores, + "total_tokens": result.total_tokens, + "aligned_tokens": result.aligned_tokens, + "waste_tokens": result.waste_tokens, + } + if result.intent: + data["intent_confidence"] = result.intent.confidence + if result.waste_patterns: + data["waste_patterns"] = [ + { + "type": wp.pattern_type, + "start_position": wp.start_position, + "end_position": wp.end_position, + "spans_involved": wp.spans_involved, + "tokens_wasted": wp.tokens_wasted, + "description": wp.description, + } + for wp in result.waste_patterns + ] + if result.classified_spans: + summary = summarize_waste(result.classified_spans, result.waste_patterns or []) + data["waste_summary"] = { + "total_waste_tokens": summary["total_waste_tokens"], + "waste_by_category": summary["waste_by_category"], + "waste_by_phase": summary["waste_by_phase"], + "top_patterns": summary["top_patterns"], + "explanation": summary["explanation"], + } + rows = _build_waste_breakdown(result) + if rows: + total_waste = sum(t for _, t, _, _ in rows) + cm = result.economics.cost_model if result.economics else CostModel() + sources = [] + for label, tokens, count, kind in rows: + rate = cm.output_rate if kind == "output" else cm.input_rate + row_cost = float(tokens) * rate / 1_000_000 + sources.append({ + "source": label, + "tokens": tokens, + "percentage": round(tokens / total_waste * 100, 1) if total_waste > 0 else 0, + "cost_usd": round(row_cost, 6), + "count": 
count, + "pricing": kind, + }) + data["waste_breakdown"] = { + "sources": sources, + "total_tokens": total_waste, + "total_cost_usd": round(_compute_waste_cost(result), 6), + } + if result.economics is not None: + econ = result.economics + data["economics"] = { + "total_input_tokens": econ.total_input_tokens, + "total_output_tokens": econ.total_output_tokens, + "total_cache_creation_tokens": econ.total_cache_creation_tokens, + "total_cache_read_tokens": econ.total_cache_read_tokens, + "input_output_ratio": econ.input_output_ratio, + "cache_hit_rate": econ.cache_hit_rate, + "estimated_cost_usd": econ.estimated_cost_usd, + "estimated_waste_cost_usd": econ.estimated_waste_cost_usd, + "cost_model": { + "input_rate": econ.cost_model.input_rate, + "output_rate": econ.cost_model.output_rate, + "cache_read_rate": econ.cost_model.cache_read_rate, + "cache_write_rate": econ.cost_model.cache_write_rate, + }, + "positional": { + "early_ter": econ.positional.early_ter, + "mid_ter": econ.positional.mid_ter, + "late_ter": econ.positional.late_ter, + "early_span_count": econ.positional.early_span_count, + "mid_span_count": econ.positional.mid_span_count, + "late_span_count": econ.positional.late_span_count, + }, + "input_growth": { + "turn_input_tokens": econ.input_growth.turn_input_tokens, + "growth_rate": econ.input_growth.growth_rate, + "is_superlinear": econ.input_growth.is_superlinear, + "context_bloat_detected": econ.input_growth.context_bloat_detected, + }, + } + if result.input_analysis is not None: + ia = result.input_analysis + bd = ia.token_breakdown + ps = ia.prompt_similarity + data["input_analysis"] = { + "token_breakdown": { + "user_input_tokens": bd.user_input_tokens, + "user_result_tokens": bd.user_result_tokens, + "model_reasoning_tokens": bd.model_reasoning_tokens, + "model_tool_tokens": bd.model_tool_tokens, + "model_generation_tokens": bd.model_generation_tokens, + "total_user_tokens": bd.total_user_tokens, + "total_model_tokens": bd.total_model_tokens, + 
"user_ratio": bd.user_ratio, + }, + "prompt_similarity": { + "prompt_count": ps.prompt_count, + "prompt_redundancy_score": ps.prompt_redundancy_score, + "similar_pairs": [ + { + "prompt_a_index": p.prompt_a_index, + "prompt_b_index": p.prompt_b_index, + "similarity": p.similarity, + "prompt_a_text": p.prompt_a_text, + "prompt_b_text": p.prompt_b_text, + } + for p in ps.similar_pairs + ], + }, + "intent_drift": { + "overall_trajectory": ia.intent_drift.overall_trajectory, + "average_drift": ia.intent_drift.average_drift, + "steps": [ + { + "from_index": s.from_index, + "to_index": s.to_index, + "similarity": s.similarity, + "drift_type": s.drift_type, + } + for s in ia.intent_drift.steps + ], + }, + "prompt_response_alignment": { + "average_alignment": ia.prompt_response_alignment.average_alignment, + "low_alignment_count": ia.prompt_response_alignment.low_alignment_count, + "pairs": [ + { + "prompt_index": p.prompt_index, + "prompt_text": p.prompt_text, + "response_text": p.response_text, + "alignment": p.alignment, + } + for p in ia.prompt_response_alignment.pairs + ], + }, + } + if result.cost_report is not None: + cr = result.cost_report + data["cost_report"] = { + "cost_weighted_ter": cr.cost_ter.cost_weighted_ter, + "raw_ter": cr.cost_ter.raw_ter, + "total_cost_usd": cr.cost_ter.total_cost_usd, + "waste_cost_usd": cr.cost_ter.waste_cost_usd, + "savings_if_perfect": cr.cost_ter.savings_if_perfect, + "semantic_density": { + "density_score": cr.session_density.density_score, + "vocabulary_richness": cr.session_density.vocabulary_richness, + "information_entropy": cr.session_density.information_entropy, + "redundancy_ratio": cr.session_density.redundancy_ratio, + }, + "recommendations": cr.recommendations, + "model_tier": cr.model_tier, + } + if result.overthinking_result is not None: + ot = result.overthinking_result + data["overthinking_analysis"] = { + "is_overthinking": ot.is_overthinking, + "total_reasoning_tokens": ot.total_reasoning_tokens, + 
"useful_reasoning_tokens": ot.useful_reasoning_tokens, + "wasted_reasoning_tokens": ot.wasted_reasoning_tokens, + "reasoning_efficiency": ot.reasoning_efficiency, + "optimal_cutoff_index": ot.optimal_cutoff_index, + "recommended_budget": ot.recommended_budget, + "explanation": ot.explanation, + } + return data diff --git a/src/ter_calculator/formatter_rich.py b/src/ter_calculator/formatter_rich.py new file mode 100644 index 0000000..8db3464 --- /dev/null +++ b/src/ter_calculator/formatter_rich.py @@ -0,0 +1,476 @@ +"""Rich terminal formatting for TER results.""" + +from __future__ import annotations + +import io + +from .models import CostModel, InputAnalysis, TERResult +from .rich_components import ter_color as _ter_color + + +def format_rich(result: TERResult) -> str: + """Format TER result using Rich library.""" + from rich.console import Console + from rich.panel import Panel + from rich.table import Table + from rich.text import Text + + from .formatter import _build_waste_breakdown, _compute_waste_cost + + buf = io.StringIO() + console = Console(file=buf, force_terminal=True, width=72) + + ter_text = Text(f"{result.aggregate_ter:.2f}", style=_ter_color(result.aggregate_ter)) + waste_pct = (result.waste_tokens / result.total_tokens * 100) if result.total_tokens else 0 + sid = result.session_id + if len(sid) > 20: + sid = sid[:8] + "..." 
+ + line1_parts: list = [("TER: ", "bold"), ter_text] + line1_parts.append((" | ", "")) + line1_parts.append((f"Waste: {waste_pct:.1f}%", "red" if waste_pct > 10 else "")) + if result.economics: + line1_parts.append((" | ", "")) + line1_parts.append((f"Cost: ${result.economics.estimated_cost_usd:.2f}", "")) + waste_cost = _compute_waste_cost(result) + if waste_cost > 0: + line1_parts.append((" | ", "")) + line1_parts.append((f"Waste $: ${waste_cost:.2f}", "red")) + + ia = result.input_analysis + if ia is not None: + drift = ia.intent_drift + pra = ia.prompt_response_alignment + ps = ia.prompt_similarity + bd = ia.token_breakdown + + drift_colors = { + "convergent": "red", "divergent": "green", + "stable": "green", "mixed": "yellow", + } + d_color = drift_colors.get(drift.overall_trajectory, "") + a_color = "red" if pra.average_alignment < 0.3 else ( + "yellow" if pra.average_alignment < 0.5 else "green" + ) + r_color = "red" if ps.prompt_redundancy_score > 0.5 else ( + "yellow" if ps.prompt_redundancy_score > 0 else "green" + ) + + line2_parts: list = [ + ("Drift: ", "bold"), + (f"{drift.overall_trajectory}", d_color), + ] + if pra.pairs: + line2_parts.append((" | ", "")) + line2_parts.append(("Alignment: ", "bold")) + line2_parts.append((f"{pra.average_alignment:.2f}", a_color)) + if ps.prompt_count >= 2: + line2_parts.append((" | ", "")) + line2_parts.append(("Redundancy: ", "bold")) + line2_parts.append((f"{ps.prompt_redundancy_score:.0%}", r_color)) + line2_parts.append((" | ", "")) + line2_parts.append((f"User: {bd.user_ratio:.0%}", "dim")) + + header = Text.assemble( + *line1_parts, ("\n", ""), *line2_parts, + ) + else: + header = Text.assemble(*line1_parts) + + console.print(Panel(header, title=sid, expand=False)) + + table = Table(show_header=True, show_edge=True) + table.add_column("Phase", style="bold", width=12) + table.add_column("TER", justify="right", width=6) + table.add_column("", width=3) + table.add_column("Metric", style="bold", width=14) + 
table.add_column("Value", justify="right", width=12) + + phases = [ + ("Reasoning", result.phase_scores.get("reasoning", 0)), + ("Tool Use", result.phase_scores.get("tool_use", 0)), + ("Generation", result.phase_scores.get("generation", 0)), + ] + right_rows = [ + ("Output Tokens", f"{result.total_tokens:,}"), + ("Aligned", f"{result.aligned_tokens:,}"), + ("Waste", f"{result.waste_tokens:,}"), + ] + + for i in range(3): + p_name, p_score = phases[i] + p_color = _ter_color(p_score) + r_label, r_value = right_rows[i] + table.add_row( + p_name, + f"[{p_color}]{p_score:.2f}[/{p_color}]", + "", + r_label, + r_value, + ) + console.print(table) + + if result.economics is not None: + econ = result.economics + cache_pct = econ.cache_hit_rate * 100 + cache_color = "green" if cache_pct >= 50 else "yellow" if cache_pct >= 20 else "red" + + econ_table = Table(show_header=True, show_edge=True) + econ_table.add_column("Economics", style="bold", width=18) + econ_table.add_column("", justify="right", width=12) + econ_table.add_column("", width=3) + econ_table.add_column("Context", style="bold", width=14) + econ_table.add_column("", justify="right", width=12) + + pos = econ.positional + g = econ.input_growth + bloat_str = "[red]YES[/red]" if g.context_bloat_detected else ( + "[yellow]WATCH[/yellow]" if g.is_superlinear else "[green]NO[/green]" + ) + + left_rows = [ + ("Input Tokens", f"{econ.total_input_tokens:,}"), + ("Cache Read", f"{econ.total_cache_read_tokens:,}"), + ("Cache Hit Rate", f"[{cache_color}]{cache_pct:.1f}%[/{cache_color}]"), + ] + right_rows_e = [ + ("Growth", f"{g.growth_rate:.1f}x ({len(g.turn_input_tokens)} turns)"), + ("Bloat", bloat_str), + ("Positional", f"{pos.early_ter:.2f} / {pos.mid_ter:.2f} / {pos.late_ter:.2f}"), + ] + + for i in range(3): + l_label, l_value = left_rows[i] + r_label, r_value = right_rows_e[i] + econ_table.add_row(l_label, l_value, "", r_label, r_value) + console.print(econ_table) + + _format_waste_breakdown_rich(console, result) + + if 
result.input_analysis is not None: + _format_input_analysis_rich(console, result.input_analysis) + + if result.cost_report is not None: + _format_cost_report_rich(console, result.cost_report) + + if result.overthinking_result is not None: + _format_overthinking_rich(console, result.overthinking_result) + + return buf.getvalue().rstrip() + + +def _format_waste_breakdown_rich(console, result: TERResult) -> None: + from rich.table import Table + from .formatter import _build_waste_breakdown, _compute_waste_cost + + rows = _build_waste_breakdown(result) + if not rows: + return + + total_waste = sum(t for _, t, _, _ in rows) + cm = result.economics.cost_model if result.economics else CostModel() + + table = Table(show_header=True, show_edge=True, title="Waste Breakdown") + table.add_column("Source", style="bold", width=22) + table.add_column("Tokens", justify="right", width=10) + table.add_column("%", justify="right", width=6) + table.add_column("Cost", justify="right", width=8) + table.add_column("Count", justify="right", width=6, style="dim") + + for label, tokens, count, kind in rows: + pct = (tokens / total_waste * 100) if total_waste > 0 else 0 + rate = cm.output_rate if kind == "output" else cm.input_rate + row_cost = float(tokens) * rate / 1_000_000 + table.add_row( + label, + f"{tokens:,}", + f"{pct:.0f}%", + f"${row_cost:.4f}", + str(count), + ) + + table.add_section() + total_cost = _compute_waste_cost(result) + table.add_row( + "[bold]Total[/bold]", + f"[bold]{total_waste:,}[/bold]", + "[bold]100%[/bold]", + f"[bold]${total_cost:.4f}[/bold]", + "", + ) + console.print(table) + + +def format_comparison_rich(results: list[TERResult]) -> str: + from rich.console import Console + from rich.table import Table + from .formatter import _compute_waste_cost + + buf = io.StringIO() + console = Console(file=buf, force_terminal=True, width=90) + + table = Table(title="TER Comparison", show_header=True) + table.add_column("#", justify="right", style="dim") + 
table.add_column("Session", style="bold") + table.add_column("TER", justify="right") + table.add_column("Waste%", justify="right") + table.add_column("Cache%", justify="right") + table.add_column("Cost", justify="right") + table.add_column("Waste $", justify="right") + table.add_column("Patterns", justify="right") + + for i, r in enumerate(results, 1): + color = _ter_color(r.aggregate_ter) + pattern_count = len(r.waste_patterns) if r.waste_patterns else 0 + waste_pct = (r.waste_tokens / r.total_tokens * 100) if r.total_tokens else 0 + cache_str = "" + cost_str = "" + waste_cost_str = "" + if r.economics: + cache_pct = r.economics.cache_hit_rate * 100 + cache_str = f"{cache_pct:.0f}%" + cost_str = f"${r.economics.estimated_cost_usd:.2f}" + wc = _compute_waste_cost(r) + waste_cost_str = f"[red]${wc:.2f}[/red]" + sid = r.session_id + if len(sid) > 20: + sid = sid[:8] + "..." + table.add_row( + str(i), + sid, + f"[{color}]{r.aggregate_ter:.2f}[/{color}]", + f"{waste_pct:.1f}%", + cache_str, + cost_str, + waste_cost_str, + str(pattern_count), + ) + + console.print(table) + + if results: + avg_ter = sum(r.aggregate_ter for r in results) / len(results) + total_cost = sum(r.economics.estimated_cost_usd for r in results if r.economics) + total_waste_cost = sum(_compute_waste_cost(r) for r in results) + color = _ter_color(avg_ter) + console.print(f"\nAverage TER: [{color}]{avg_ter:.2f}[/{color}] | Total Cost: ${total_cost:.2f} | Total Waste: [red]${total_waste_cost:.2f}[/red]") + + return buf.getvalue().rstrip() + + +def format_grouped_rich( + parent_result: TERResult, + subagent_results: list[TERResult], +) -> str: + from rich.console import Console + from rich.panel import Panel + from rich.table import Table + from rich.text import Text + from .formatter import _compute_group_aggregates, _compute_waste_cost + + all_results = [parent_result] + subagent_results + agg = _compute_group_aggregates(all_results) + + buf = io.StringIO() + console = Console(file=buf, 
force_terminal=True, width=90) + + sid = parent_result.session_id + if len(sid) > 20: + sid = sid[:8] + "..." + ter_text = Text(f"{agg['weighted_ter']:.2f}", style=_ter_color(agg["weighted_ter"])) + + header = Text.assemble( + ("TER: ", "bold"), ter_text, + (" | ", ""), + (f"Waste: {agg['waste_pct']:.1f}%", "red" if agg["waste_pct"] > 10 else ""), + (" | ", ""), + (f"Cost: ${agg['total_cost_usd']:.2f}", ""), + (" | ", ""), + (f"Waste $: ${agg['total_waste_cost_usd']:.2f}", "red"), + ("\n", ""), + (f"Sessions: 1 parent + {len(subagent_results)} subagent(s)", "dim"), + (" | ", ""), + (f"Tokens: {agg['total_tokens']:,}", "dim"), + ) + console.print(Panel(header, title=f"Group: {sid}", expand=False)) + + table = Table(show_header=True, title="Session Breakdown") + table.add_column("Role", width=10) + table.add_column("Session", width=14) + table.add_column("TER", justify="right", width=6) + table.add_column("Waste%", justify="right", width=7) + table.add_column("Tokens", justify="right", width=10) + table.add_column("Cost", justify="right", width=8) + table.add_column("Waste $", justify="right", width=8) + table.add_column("Patterns", justify="right", width=8) + + def _add_session_row(r: TERResult, role: str): + color = _ter_color(r.aggregate_ter) + waste_pct = (r.waste_tokens / r.total_tokens * 100) if r.total_tokens else 0 + cost_str = f"${r.economics.estimated_cost_usd:.2f}" if r.economics else "" + wc = _compute_waste_cost(r) + waste_str = f"[red]${wc:.2f}[/red]" if wc > 0 else "" + pattern_count = len(r.waste_patterns) if r.waste_patterns else 0 + rsid = r.session_id + if len(rsid) > 14: + rsid = rsid[:8] + "..." 
def _rich_styled(text: str, style: str) -> str:
    """Wrap *text* in Rich markup for *style*; return it unchanged when *style* is empty.

    An empty style would otherwise produce ``[]text[/]``, whose bare closing
    tag has no opening tag to close.
    """
    return f"[{style}]{text}[/{style}]" if style else text


def _rich_preview(text: str, limit: int) -> str:
    """Cut *text* to *limit* characters (appending "...") and escape it for Rich markup."""
    from rich.markup import escape

    shortened = text[:limit] + "..." if len(text) > limit else text
    # Escape after truncating so an escape sequence is never cut in half.
    return escape(shortened)


def _format_input_analysis_rich(console, ia: InputAnalysis) -> None:
    """Print the input-analysis section to *console*.

    Always prints a token-breakdown table. Then, each only when its data is
    present: prompt redundancy (two or more prompts), intent drift (at least
    one step) and prompt-response alignment (at least one pair).

    Args:
        console: Object with a Rich-style ``print`` method.
        ia: Input analysis whose ``token_breakdown``, ``prompt_similarity``,
            ``intent_drift`` and ``prompt_response_alignment`` are read.
    """
    from rich.markup import escape
    from rich.table import Table

    bd = ia.token_breakdown
    ps = ia.prompt_similarity

    console.print("\n[bold]Input Analysis[/bold]")
    tb = Table(show_header=True, show_edge=True)
    tb.add_column("Origin", style="bold", width=14)
    tb.add_column("Category", width=16)
    tb.add_column("Tokens", justify="right", width=10)

    tb.add_row("User", "Prompt Text", f"{bd.user_input_tokens:,}")
    tb.add_row("User", "Tool Results", f"{bd.user_result_tokens:,}")
    tb.add_row("Model", "Reasoning", f"{bd.model_reasoning_tokens:,}")
    tb.add_row("Model", "Tool Calls", f"{bd.model_tool_tokens:,}")
    tb.add_row("Model", "Generation", f"{bd.model_generation_tokens:,}")
    tb.add_section()
    tb.add_row("[bold]User Total[/bold]", "", f"[bold]{bd.total_user_tokens:,}[/bold]")
    tb.add_row("[bold]Model Total[/bold]", "", f"[bold]{bd.total_model_tokens:,}[/bold]")
    tb.add_row("User Ratio", "", f"{bd.user_ratio:.1%}")
    console.print(tb)

    if ps.prompt_count >= 2:
        r_color = "red" if ps.prompt_redundancy_score > 0.5 else (
            "yellow" if ps.prompt_redundancy_score > 0 else "green"
        )
        console.print(
            f"\nPrompt Redundancy: [{r_color}]{ps.prompt_redundancy_score:.0%}[/{r_color}]"
            f" ({ps.prompt_count} prompts, {len(ps.similar_pairs)} similar pair(s))"
        )
        # Only the first five similar pairs are listed.
        for pair in ps.similar_pairs[:5]:
            # Prompt text is free-form and may contain "[...]" sequences that
            # Rich would parse as markup tags, so it is escaped before printing.
            a_text = _rich_preview(pair.prompt_a_text, 40)
            b_text = _rich_preview(pair.prompt_b_text, 40)
            console.print(
                f' [dim]#{pair.prompt_a_index+1}[/dim] "{a_text}" '
                f'[dim]~[/dim] [dim]#{pair.prompt_b_index+1}[/dim] "{b_text}" '
                f'[yellow]({pair.similarity:.2f})[/yellow]'
            )

    drift = ia.intent_drift
    if drift.steps:
        _drift_colors = {
            "convergent": "red", "divergent": "green",
            "stable": "green", "mixed": "yellow",
        }
        # Unknown trajectories have no colour; print them unstyled instead of
        # emitting an empty tag pair.
        t_color = _drift_colors.get(drift.overall_trajectory, "")
        trajectory = _rich_styled(escape(f"{drift.overall_trajectory}"), t_color)
        console.print(
            f"\nIntent Drift: {trajectory}"
            f" (avg similarity: {drift.average_drift:.2f})"
        )
        for step in drift.steps:
            s_color = "red" if step.drift_type == "convergent" else (
                "green" if step.drift_type == "divergent" else "yellow"
            )
            drift_type = escape(f"{step.drift_type}")
            console.print(
                f" #{step.from_index+1} -> #{step.to_index+1}: "
                f"[{s_color}]{drift_type}[/{s_color}] ({step.similarity:.2f})"
            )

    pra = ia.prompt_response_alignment
    if pra.pairs:
        a_color = "red" if pra.average_alignment < 0.3 else (
            "yellow" if pra.average_alignment < 0.5 else "green"
        )
        console.print(
            f"\nPrompt-Response Alignment: [{a_color}]{pra.average_alignment:.2f}[/{a_color}]"
            f" ({len(pra.pairs)} pair(s), {pra.low_alignment_count} low)"
        )
        for pair in pra.pairs:
            p_color = "red" if pair.alignment < 0.3 else (
                "yellow" if pair.alignment < 0.5 else "green"
            )
            prompt_short = _rich_preview(pair.prompt_text, 50)
            console.print(
                f' [dim]#{pair.prompt_index+1}[/dim] "{prompt_short}" '
                f'-> [{p_color}]{pair.alignment:.2f}[/{p_color}]'
            )


def _format_cost_report_rich(console, cost_report) -> None:
    """Print the cost-analysis table and any recommendations to *console*.

    Args:
        console: Object with a Rich-style ``print`` method.
        cost_report: Report whose ``cost_ter``, ``session_density`` and
            ``recommendations`` are read.
    """
    from rich.markup import escape
    from rich.table import Table

    console.print("\n[bold]Cost Analysis[/bold]")

    cost_table = Table(show_header=True, show_edge=True)
    cost_table.add_column("Metric", style="cyan", width=20)
    cost_table.add_column("Value", justify="right", width=16)

    cwter = cost_report.cost_ter
    cost_table.add_row("Cost-Weighted TER", f"{cwter.cost_weighted_ter:.4f}")
    cost_table.add_row("Raw TER", f"{cwter.raw_ter:.4f}")
    cost_table.add_row("Total Cost", f"${cwter.total_cost_usd:.4f}")
    cost_table.add_row("Waste Cost", f"${cwter.waste_cost_usd:.4f}")
    # Guard the division: a zero total cost reports 0% waste.
    waste_pct = (cwter.waste_cost_usd / cwter.total_cost_usd * 100) if cwter.total_cost_usd > 0 else 0
    cost_table.add_row("Waste %", f"{waste_pct:.1f}%")
    cost_table.add_row("Semantic Density", f"{cost_report.session_density.density_score:.2%}")
    cost_table.add_row("Redundancy", f"{cost_report.session_density.redundancy_ratio:.2%}")

    console.print(cost_table)

    if cost_report.recommendations:
        console.print("\n[bold]Recommendations:[/bold]")
        for rec in cost_report.recommendations:
            # Recommendation text is data, not markup: print it literally.
            rec_text = escape(f"{rec}")
            console.print(f" • {rec_text}")


def _format_overthinking_rich(console, ot) -> None:
    """Print the overthinking-analysis status, metrics table and explanation.

    Args:
        console: Object with a Rich-style ``print`` method.
        ot: Overthinking result; its token counts, ``reasoning_efficiency``,
            ``optimal_cutoff_index``, ``segments``, ``recommended_budget`` and
            ``explanation`` are read.
    """
    from rich.markup import escape
    from rich.table import Table

    console.print("\n[bold]Overthinking Analysis[/bold]")

    status_color = "red" if ot.is_overthinking else "green"
    status_text = "OVERTHINKING DETECTED" if ot.is_overthinking else "Efficient Reasoning"
    console.print(f"Status: [{status_color}]{status_text}[/{status_color}]")

    ot_table = Table(show_header=True, show_edge=True)
    ot_table.add_column("Metric", style="cyan", width=20)
    ot_table.add_column("Value", justify="right", width=16)

    ot_table.add_row("Total Reasoning", f"{ot.total_reasoning_tokens:,} tokens")
    ot_table.add_row("Useful", f"{ot.useful_reasoning_tokens:,} tokens")
    ot_table.add_row("Efficiency", f"{ot.reasoning_efficiency:.0%}")
    ot_table.add_row("Wasted", f"{ot.wasted_reasoning_tokens:,} tokens")

    # The cutoff row is shown only when a cutoff was determined.
    if ot.optimal_cutoff_index is not None:
        ot_table.add_row("Optimal Cutoff", f"Span {ot.optimal_cutoff_index} (of {len(ot.segments)})")

    ot_table.add_row("Recommended Budget", f"{ot.recommended_budget:,} tokens")

    console.print(ot_table)
    # The explanation is free text; escape it so brackets are shown verbatim.
    explanation = escape(f"{ot.explanation}")
    console.print(f"\n{explanation}")
+ + lines = [ + f"TER Report: {sid}", + "═" * 40, + "", + ] + + cost_str = "" + if result.economics: + cost_str = f" | Cost: ${result.economics.estimated_cost_usd:.2f}" + waste_cost = _compute_waste_cost(result) + if waste_cost > 0: + cost_str += f" | Waste $: ${waste_cost:.2f}" + lines.append(f"TER: {result.aggregate_ter:.2f} | Waste: {waste_pct:.1f}%{cost_str}") + + ia = result.input_analysis + if ia is not None: + drift = ia.intent_drift + pra = ia.prompt_response_alignment + ps = ia.prompt_similarity + parts = [f"Drift: {drift.overall_trajectory}"] + if pra.pairs: + parts.append(f"Alignment: {pra.average_alignment:.2f}") + if ps.prompt_count >= 2: + parts.append(f"Redundancy: {ps.prompt_redundancy_score:.0%}") + parts.append(f"User: {ia.token_breakdown.user_ratio:.0%}") + lines.append(" | ".join(parts)) + + lines.append("") + + lines.append("Phases: Reasoning Tool Use Generation") + lines.append( + f" {result.phase_scores.get('reasoning', 0):.2f}" + f" {result.phase_scores.get('tool_use', 0):.2f}" + f" {result.phase_scores.get('generation', 0):.2f}" + ) + lines.append("") + + lines.append(f"Output Tokens: {result.total_tokens:,} (aligned: {result.aligned_tokens:,} waste: {result.waste_tokens:,})") + + if result.economics is not None: + econ = result.economics + cache_pct = econ.cache_hit_rate * 100 + pos = econ.positional + g = econ.input_growth + + lines.extend([ + "", + f"Input: {econ.total_input_tokens:,} Cache Read: {econ.total_cache_read_tokens:,} Cache Hit: {cache_pct:.1f}%", + f"Context Growth: {g.growth_rate:.1f}x over {len(g.turn_input_tokens)} turns" + + (" [BLOAT]" if g.context_bloat_detected else (" [WATCH]" if g.is_superlinear else "")), + f"Positional TER: {pos.early_ter:.2f} (early) / {pos.mid_ter:.2f} (mid) / {pos.late_ter:.2f} (late)", + ]) + + rows = _build_waste_breakdown(result) + if rows: + total_waste = sum(t for _, t, _, _ in rows) + cm = result.economics.cost_model if result.economics else CostModel() + lines.extend(["", "Waste 
def format_comparison_text(results: list[TERResult]) -> str:
    """Render a plain-text comparison table with one row per session.

    Each row shows the position, a shortened session id, TER, waste
    percentage, cache hit rate, cost, waste cost and waste-pattern count.
    Sessions without economics leave the cache, cost and waste-cost cells
    blank but still count toward the total waste figure. When *results* is
    non-empty, a summary line with the average TER and cost totals is added.

    Args:
        results: Sessions to compare, in display order.

    Returns:
        The report as a single newline-joined string.
    """
    from .formatter import _compute_waste_cost

    # Compute each session's waste cost once; the same value feeds both the
    # row and the total. Assumes _compute_waste_cost has no side effects.
    waste_costs = [_compute_waste_cost(r) for r in results]

    lines = [
        "TER Comparison",
        "═" * 40,
        "",
        f" {'#':<3} {'Session':<12} {'TER':<6} {'Waste%':<8} {'Cache%':<8} {'Cost':<10} {'Waste $':<10} {'Patterns':<8}",
    ]

    for i, (r, wc) in enumerate(zip(results, waste_costs), 1):
        # Ids longer than the 12-character column are cut to 8 characters plus "...".
        sid = r.session_id if len(r.session_id) <= 12 else r.session_id[:8] + "..."
        pattern_count = len(r.waste_patterns) if r.waste_patterns else 0
        # Guard the division: a session with no tokens reports 0% waste.
        waste_pct = (r.waste_tokens / r.total_tokens * 100) if r.total_tokens else 0
        cache_str = ""
        cost_str = ""
        waste_cost_str = ""
        if r.economics:
            cache_pct = r.economics.cache_hit_rate * 100
            cache_str = f"{cache_pct:.0f}%"
            cost_str = f"${r.economics.estimated_cost_usd:.2f}"
            waste_cost_str = f"${wc:.2f}"
        lines.append(
            f" {i:<3} {sid:<12} {r.aggregate_ter:<6.2f} "
            f"{waste_pct:<8.1f} {cache_str:<8} {cost_str:<10} {waste_cost_str:<10} {pattern_count:<8}"
        )

    if results:
        avg_ter = sum(r.aggregate_ter for r in results) / len(results)
        total_cost = sum(r.economics.estimated_cost_usd for r in results if r.economics)
        total_waste_cost = sum(waste_costs)
        lines.extend(["", f"Average TER: {avg_ter:.2f} | Total Cost: ${total_cost:.2f} | Total Waste: ${total_waste_cost:.2f}"])

    return "\n".join(lines)
+ waste_pct = (r.waste_tokens / r.total_tokens * 100) if r.total_tokens else 0 + cost_str = f"${r.economics.estimated_cost_usd:.2f}" if r.economics else "" + wc = _compute_waste_cost(r) + waste_str = f"${wc:.2f}" if wc > 0 else "" + pattern_count = len(r.waste_patterns) if r.waste_patterns else 0 + lines.append( + f" {role:<10} {rsid:<14} {r.aggregate_ter:<6.2f} " + f"{waste_pct:<8.1f} {r.total_tokens:<10,} {cost_str:<10} {waste_str:<10} {pattern_count:<8}" + ) + + _add_row(parent_result, "[parent]") + for r in subagent_results: + _add_row(r, "[agent]") + + lines.extend([ + "", + f" {'Total':<10} {'':<14} {agg['weighted_ter']:<6.2f} " + f"{agg['waste_pct']:<8.1f} {agg['total_tokens']:<10,} " + f"${agg['total_cost_usd']:<9.2f} ${agg['total_waste_cost_usd']:<9.2f}", + ]) + + return "\n".join(lines) + + +def _format_input_analysis_text(ia: InputAnalysis) -> list[str]: + bd = ia.token_breakdown + ps = ia.prompt_similarity + + lines = [ + "", + "Input Analysis:", + f" User Tokens: {bd.total_user_tokens:,} (prompt: {bd.user_input_tokens:,}, tool results: {bd.user_result_tokens:,})", + f" Model Tokens: {bd.total_model_tokens:,} (reasoning: {bd.model_reasoning_tokens:,}, tool: {bd.model_tool_tokens:,}, generation: {bd.model_generation_tokens:,})", + f" User Ratio: {bd.user_ratio:.1%}", + ] + + if ps.prompt_count >= 2: + lines.append(f" Prompt Redundancy: {ps.prompt_redundancy_score:.0%} ({ps.prompt_count} prompts, {len(ps.similar_pairs)} similar pair(s))") + for pair in ps.similar_pairs[:5]: + a_text = pair.prompt_a_text[:40] + "..." if len(pair.prompt_a_text) > 40 else pair.prompt_a_text + b_text = pair.prompt_b_text[:40] + "..." 
if len(pair.prompt_b_text) > 40 else pair.prompt_b_text + lines.append(f' #{pair.prompt_a_index+1} "{a_text}" ~ #{pair.prompt_b_index+1} "{b_text}" ({pair.similarity:.2f})') + + drift = ia.intent_drift + if drift.steps: + lines.append(f" Intent Drift: {drift.overall_trajectory} (avg similarity: {drift.average_drift:.2f})") + for step in drift.steps: + lines.append(f" #{step.from_index+1} -> #{step.to_index+1}: {step.drift_type} ({step.similarity:.2f})") + + pra = ia.prompt_response_alignment + if pra.pairs: + lines.append(f" Prompt-Response Alignment: {pra.average_alignment:.2f} ({len(pra.pairs)} pair(s), {pra.low_alignment_count} low)") + for pair in pra.pairs: + prompt_short = pair.prompt_text[:50] + "..." if len(pair.prompt_text) > 50 else pair.prompt_text + marker = " [LOW]" if pair.alignment < 0.3 else "" + lines.append(f' #{pair.prompt_index+1} "{prompt_short}" -> {pair.alignment:.2f}{marker}') + + return lines diff --git a/tests/unit/test_token_counting.py b/tests/unit/test_token_counting.py new file mode 100644 index 0000000..d23adf2 --- /dev/null +++ b/tests/unit/test_token_counting.py @@ -0,0 +1,305 @@ +"""Unit tests for ter_calculator.token_counting module.""" + +from __future__ import annotations + +import pytest + +from ter_calculator.token_counting import ( + CountMethod, + PhaseMultipliers, + TokenCountResult, + _code_density, + calibrate_multiplier, + count_tokens, + estimate_tokens_heuristic, + token_count_confidence, +) + + +# ── _code_density ────────────────────────────────────────────────────────── + + +class TestCodeDensity: + def test_empty_text_returns_zero(self): + assert _code_density("") == 0.0 + + def test_natural_text_low_density(self): + text = "This is a simple natural language sentence with no code." 
+ density = _code_density(text) + assert density < 0.05 + + def test_code_text_higher_density(self): + text = "if (x > 0) { return arr[i]; }" + density = _code_density(text) + assert density > 0.1 + + def test_pure_punctuation_capped_at_one(self): + text = "{}[]();=<>" + density = _code_density(text) + assert density <= 1.0 + + def test_json_like_text(self): + text = '{"key": "value", "list": [1, 2, 3]}' + density = _code_density(text) + assert density > 0.05 + + +# ── estimate_tokens_heuristic ────────────────────────────────────────────── + + +class TestEstimateTokensHeuristic: + def test_empty_text_returns_zero(self): + assert estimate_tokens_heuristic("") == 0 + + def test_normal_text_default_ratio(self): + text = "a" * 40 # 40 chars / 4.0 = 10 tokens + result = estimate_tokens_heuristic(text) + assert result == 10 + + def test_with_reasoning_phase(self): + text = "a" * 40 # ratio 4.0 => 10 tokens + result = estimate_tokens_heuristic(text, phase="reasoning") + assert result == 10 + + def test_with_tool_use_phase(self): + text = "a" * 32 # ratio 3.2 => 10 tokens + result = estimate_tokens_heuristic(text, phase="tool_use") + assert result == 10 + + def test_with_custom_multipliers(self): + custom = PhaseMultipliers(reasoning=2.0, generation=2.0, tool_use=2.0) + text = "a" * 20 # 20 / 2.0 = 10 + result = estimate_tokens_heuristic( + text, phase="reasoning", multipliers=custom + ) + assert result == 10 + + def test_unknown_phase_falls_back_to_default(self): + text = "a" * 40 # default ratio 4.0 => 10 + result = estimate_tokens_heuristic(text, phase="unknown_phase") + assert result == 10 + + def test_no_phase_uses_default_ratio(self): + text = "a" * 100 # 100 / 4.0 = 25 + result = estimate_tokens_heuristic(text) + assert result == 25 + + def test_result_is_non_negative(self): + # Even for very short text, result should be >= 0 + result = estimate_tokens_heuristic("a") + assert result >= 0 + + def test_rounding(self): + # 5 chars / 4.0 = 1.25, rounds to 1 + assert 
estimate_tokens_heuristic("a" * 5) == 1 + # 6 chars / 4.0 = 1.5, rounds to 2 + assert estimate_tokens_heuristic("a" * 6) == 2 + + +# ── calibrate_multiplier ────────────────────────────────────────────────── + + +class TestCalibrateMultiplier: + def test_normal_samples(self): + # If text has 40 chars and known count is 10, multiplier = 40/10 = 4.0 + # OLS formula: m = sum(c*t) / sum(t*t) = (40*10)/(10*10) = 4.0 + samples = [("a" * 40, 10)] + result = calibrate_multiplier(samples) + assert result == pytest.approx(4.0) + + def test_multiple_samples(self): + samples = [ + ("a" * 40, 10), # c=40, t=10 + ("b" * 80, 20), # c=80, t=20 + ] + # sum_ct = 40*10 + 80*20 = 400 + 1600 = 2000 + # sum_tt = 10*10 + 20*20 = 100 + 400 = 500 + # m = 2000/500 = 4.0 + result = calibrate_multiplier(samples) + assert result == pytest.approx(4.0) + + def test_different_ratios(self): + samples = [ + ("a" * 30, 10), # ratio 3.0 + ("b" * 50, 10), # ratio 5.0 + ] + # sum_ct = 30*10 + 50*10 = 300 + 500 = 800 + # sum_tt = 10*10 + 10*10 = 100 + 100 = 200 + # m = 800/200 = 4.0 + result = calibrate_multiplier(samples) + assert result == pytest.approx(4.0) + + def test_empty_samples_raises_value_error(self): + with pytest.raises(ValueError, match="non-empty"): + calibrate_multiplier([]) + + def test_all_zero_token_counts_raises_value_error(self): + samples = [("some text", 0), ("more text", 0)] + with pytest.raises(ValueError, match="zero or negative"): + calibrate_multiplier(samples) + + def test_negative_token_counts_skipped(self): + # Negative counts are skipped; if all are negative => error + samples = [("text", -5)] + with pytest.raises(ValueError, match="zero or negative"): + calibrate_multiplier(samples) + + def test_mixed_valid_and_invalid(self): + samples = [ + ("a" * 40, 10), # valid + ("ignored", 0), # skipped (zero) + ("also ignored", -1), # skipped (negative) + ] + # Only first sample contributes: m = (40*10)/(10*10) = 4.0 + result = calibrate_multiplier(samples) + assert result == 
pytest.approx(4.0) + + +# ── token_count_confidence ───────────────────────────────────────────────── + + +class TestTokenCountConfidence: + def test_api_method_always_1_0(self): + assert token_count_confidence("any text", CountMethod.API) == 1.0 + + def test_api_method_with_code_still_1_0(self): + code_text = "if (x) { return arr[i]; }" + assert token_count_confidence(code_text, CountMethod.API) == 1.0 + + def test_heuristic_normal_text(self): + text = "This is a normal English sentence without code." + conf = token_count_confidence(text, CountMethod.HEURISTIC) + # Base 0.8, low density -> minimal penalty + assert 0.75 <= conf <= 0.80 + + def test_heuristic_code_heavy_lower_confidence(self): + code = "{[()];=<>{[()];=<>}" + conf = token_count_confidence(code, CountMethod.HEURISTIC) + # Code density is high -> larger penalty from base 0.8 + assert conf < 0.80 + + def test_calibrated_normal_text(self): + text = "Normal text for calibrated counting." + conf = token_count_confidence(text, CountMethod.CALIBRATED) + # Base 0.9, low density -> near 0.9 + assert 0.85 <= conf <= 0.90 + + def test_calibrated_code_text_penalized(self): + code = "function() { return {}; }" + conf = token_count_confidence(code, CountMethod.CALIBRATED) + assert conf < 0.90 + + def test_confidence_never_exceeds_one(self): + conf = token_count_confidence("hello", CountMethod.API) + assert conf <= 1.0 + + def test_confidence_never_below_zero(self): + # Even with maximum code density, confidence >= 0 + extreme = "{" * 1000 + conf = token_count_confidence(extreme, CountMethod.HEURISTIC) + assert conf >= 0.0 + + def test_empty_text_heuristic(self): + conf = token_count_confidence("", CountMethod.HEURISTIC) + # Empty text -> density 0.0 -> no penalty -> base 0.8 + assert conf == pytest.approx(0.8) + + +# ── count_tokens ─────────────────────────────────────────────────────────── + + +class TestCountTokens: + def test_empty_text(self): + result = count_tokens("") + assert result.estimated_tokens == 0 + 
assert result.confidence == 1.0 + assert result.method_used is CountMethod.HEURISTIC + + def test_default_heuristic(self): + text = "a" * 40 + result = count_tokens(text) + assert result.estimated_tokens == 10 + assert result.method_used is CountMethod.HEURISTIC + assert 0.0 <= result.confidence <= 1.0 + + def test_with_phase(self): + text = "a" * 32 + result = count_tokens(text, phase="tool_use") + assert result.estimated_tokens == 10 + assert result.method_used is CountMethod.HEURISTIC + + def test_with_calibrated_multiplier(self): + text = "a" * 50 + result = count_tokens(text, calibrated_multiplier=5.0) + assert result.estimated_tokens == 10 # 50 / 5.0 + assert result.method_used is CountMethod.CALIBRATED + assert result.confidence <= 0.9 # calibrated base + + def test_calibrated_takes_precedence_over_heuristic(self): + text = "a" * 40 + result = count_tokens( + text, phase="reasoning", calibrated_multiplier=4.0 + ) + # Calibrated path should be chosen over heuristic + assert result.method_used is CountMethod.CALIBRATED + + def test_zero_calibrated_multiplier_falls_to_heuristic(self): + text = "a" * 40 + result = count_tokens(text, calibrated_multiplier=0.0) + assert result.method_used is CountMethod.HEURISTIC + + def test_negative_calibrated_multiplier_falls_to_heuristic(self): + text = "a" * 40 + result = count_tokens(text, calibrated_multiplier=-1.0) + assert result.method_used is CountMethod.HEURISTIC + + def test_use_api_false_skips_api(self): + text = "some text" + result = count_tokens(text, use_api=False) + assert result.method_used in (CountMethod.HEURISTIC, CountMethod.CALIBRATED) + + def test_custom_multipliers_passed_through(self): + custom = PhaseMultipliers(reasoning=2.0, generation=2.0, tool_use=2.0) + text = "a" * 20 # 20 / 2.0 = 10 + result = count_tokens(text, phase="reasoning", multipliers=custom) + assert result.estimated_tokens == 10 + assert result.method_used is CountMethod.HEURISTIC + + def test_result_is_token_count_result(self): + 
result = count_tokens("hello world") + assert isinstance(result, TokenCountResult) + + def test_result_is_frozen(self): + result = count_tokens("hello world") + with pytest.raises(AttributeError): + result.estimated_tokens = 999 # type: ignore[misc] + + +# ── PhaseMultipliers defaults ────────────────────────────────────────────── + + +class TestPhaseMultipliers: + def test_default_values(self): + pm = PhaseMultipliers() + assert pm.reasoning == 4.0 + assert pm.generation == 4.0 + assert pm.tool_use == 3.2 + + def test_custom_values(self): + pm = PhaseMultipliers(reasoning=3.0, generation=5.0, tool_use=2.5) + assert pm.reasoning == 3.0 + assert pm.generation == 5.0 + assert pm.tool_use == 2.5 + + +# ── CountMethod enum ────────────────────────────────────────────────────── + + +class TestCountMethod: + def test_values(self): + assert CountMethod.API.value == "api" + assert CountMethod.CALIBRATED.value == "calibrated" + assert CountMethod.HEURISTIC.value == "heuristic" + + def test_members_count(self): + assert len(CountMethod) == 3 diff --git a/tests/unit/test_validation.py b/tests/unit/test_validation.py new file mode 100644 index 0000000..e3f2c81 --- /dev/null +++ b/tests/unit/test_validation.py @@ -0,0 +1,935 @@ +"""Tests for JSONL session validation.""" + +import json + +import pytest + +from ter_calculator.validation import ( + CompletenessAssessment, + ContentDistribution, + FileValidationResult, + HealthReport, + SessionValidationResult, + ValidationResult, + assess_completeness, + generate_health_report, + validate_jsonl_file, + validate_jsonl_line, + validate_session, +) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _make_line( + *, + line_type="user", + role="user", + content="Hello", + uuid="u1", + session_id="s1", + extra_top=None, + extra_msg=None, + omit_top=None, + omit_msg=None, +): + """Build a valid JSONL dict, 
then serialise it.""" + top = { + "type": line_type, + "uuid": uuid, + "sessionId": session_id, + "message": { + "role": role, + "content": content, + }, + } + if extra_top: + top.update(extra_top) + if extra_msg: + top["message"].update(extra_msg) + if omit_top: + for k in omit_top: + top.pop(k, None) + if omit_msg: + for k in omit_msg: + top["message"].pop(k, None) + return json.dumps(top) + + +def _make_assistant_line( + content=None, + uuid="a1", + stop_reason="end_turn", + usage=None, + timestamp=None, +): + """Build an assistant JSONL dict.""" + if content is None: + content = [{"type": "text", "text": "Hi there!"}] + msg = {"role": "assistant", "content": content, "stop_reason": stop_reason} + if usage: + msg["usage"] = usage + entry = { + "type": "assistant", + "uuid": uuid, + "sessionId": "s1", + "message": msg, + } + if timestamp: + entry["timestamp"] = timestamp + return entry + + +def _make_user_line(content="Hello", uuid="u1", timestamp=None): + """Build a user JSONL dict.""" + entry = { + "type": "user", + "uuid": uuid, + "sessionId": "s1", + "message": {"role": "user", "content": content}, + } + if timestamp: + entry["timestamp"] = timestamp + return entry + + +# --------------------------------------------------------------------------- +# 1. 
validate_jsonl_line +# --------------------------------------------------------------------------- + + +class TestValidateJsonlLine: + """Tests for single-line JSONL validation.""" + + def test_valid_user_line(self): + line = _make_line(role="user", content="Hello") + result = validate_jsonl_line(line, line_number=1) + assert result.valid is True + assert result.errors == [] + + def test_valid_assistant_line_with_text_block(self): + content = [{"type": "text", "text": "Response"}] + line = _make_line( + line_type="assistant", role="assistant", content=content + ) + result = validate_jsonl_line(line, line_number=1) + assert result.valid is True + assert result.errors == [] + + def test_invalid_json(self): + result = validate_jsonl_line("{not valid json", line_number=5) + assert result.valid is False + assert len(result.errors) == 1 + assert "Invalid JSON" in result.errors[0] + assert result.line_number == 5 + + def test_non_object_json(self): + result = validate_jsonl_line(json.dumps([1, 2, 3]), line_number=2) + assert result.valid is False + assert "Expected a JSON object" in result.errors[0] + + def test_empty_line(self): + result = validate_jsonl_line("", line_number=1) + assert result.valid is True + assert any("Empty line" in w for w in result.warnings) + + def test_whitespace_only_line(self): + result = validate_jsonl_line(" \t ", line_number=1) + assert result.valid is True + assert any("Empty line" in w for w in result.warnings) + + def test_missing_required_top_level_fields(self): + line = _make_line(omit_top=["type", "uuid"]) + result = validate_jsonl_line(line, line_number=3) + assert result.valid is False + assert any("Missing required top-level fields" in e for e in result.errors) + assert "type" in result.errors[0] + assert "uuid" in result.errors[0] + + def test_missing_message_field(self): + line = _make_line(omit_top=["message"]) + result = validate_jsonl_line(line, line_number=1) + assert result.valid is False + assert any("Missing required 
top-level fields" in e for e in result.errors) + + def test_missing_required_message_fields(self): + line = _make_line(omit_msg=["role", "content"]) + result = validate_jsonl_line(line, line_number=4) + assert result.valid is False + assert any("Missing required message fields" in e for e in result.errors) + + def test_message_not_dict(self): + raw = json.dumps({ + "type": "user", + "uuid": "u1", + "sessionId": "s1", + "message": "not a dict", + }) + result = validate_jsonl_line(raw, line_number=1) + assert result.valid is False + assert any("'message' must be a dict" in e for e in result.errors) + + def test_unexpected_role_warning(self): + line = _make_line(role="system") + result = validate_jsonl_line(line, line_number=1) + assert result.valid is True + assert any("Unexpected role" in w for w in result.warnings) + + def test_unknown_block_type_warning(self): + content = [{"type": "image_url", "url": "http://example.com"}] + line = _make_line(role="assistant", content=content) + result = validate_jsonl_line(line, line_number=1) + assert result.valid is True + assert any("unknown block type" in w for w in result.warnings) + + def test_content_block_missing_type(self): + content = [{"text": "no type field here"}] + line = _make_line(role="assistant", content=content) + result = validate_jsonl_line(line, line_number=1) + assert result.valid is False + assert any("missing 'type' field" in e for e in result.errors) + + def test_text_block_missing_text_field(self): + content = [{"type": "text"}] + line = _make_line(role="assistant", content=content) + result = validate_jsonl_line(line, line_number=1) + assert result.valid is False + assert any("missing 'text' field" in e for e in result.errors) + + def test_text_block_text_not_string(self): + content = [{"type": "text", "text": 42}] + line = _make_line(role="assistant", content=content) + result = validate_jsonl_line(line, line_number=1) + assert result.valid is False + assert any("'text' must be a string" in e for e 
in result.errors) + + def test_thinking_block_valid(self): + content = [{"type": "thinking", "thinking": "Let me think..."}] + line = _make_line(role="assistant", content=content) + result = validate_jsonl_line(line, line_number=1) + assert result.valid is True + + def test_thinking_block_with_text_key(self): + content = [{"type": "thinking", "text": "Thinking via text key"}] + line = _make_line(role="assistant", content=content) + result = validate_jsonl_line(line, line_number=1) + assert result.valid is True + assert result.warnings == [] + + def test_thinking_block_missing_both_fields(self): + content = [{"type": "thinking"}] + line = _make_line(role="assistant", content=content) + result = validate_jsonl_line(line, line_number=1) + assert result.valid is True # warning, not an error + assert any("missing" in w and "thinking" in w for w in result.warnings) + + def test_tool_use_block_valid(self): + content = [ + {"type": "tool_use", "id": "t1", "name": "bash", "input": {"cmd": "ls"}} + ] + line = _make_line(role="assistant", content=content) + result = validate_jsonl_line(line, line_number=1) + assert result.valid is True + + def test_tool_use_block_missing_name(self): + content = [{"type": "tool_use", "id": "t1", "input": {}}] + line = _make_line(role="assistant", content=content) + result = validate_jsonl_line(line, line_number=1) + assert result.valid is False + assert any("missing 'name'" in e for e in result.errors) + + def test_tool_use_block_missing_id(self): + content = [{"type": "tool_use", "name": "bash", "input": {}}] + line = _make_line(role="assistant", content=content) + result = validate_jsonl_line(line, line_number=1) + assert result.valid is False + assert any("missing 'id'" in e for e in result.errors) + + def test_tool_result_block_valid(self): + content = [ + {"type": "tool_result", "tool_use_id": "t1", "content": "OK"} + ] + line = _make_line(role="user", content=content) + result = validate_jsonl_line(line, line_number=1) + assert 
result.valid is True + + def test_tool_result_block_missing_tool_use_id(self): + content = [{"type": "tool_result", "content": "OK"}] + line = _make_line(role="user", content=content) + result = validate_jsonl_line(line, line_number=1) + assert result.valid is False + assert any("missing 'tool_use_id'" in e for e in result.errors) + + def test_content_not_string_or_list(self): + line = _make_line(content=42) + result = validate_jsonl_line(line, line_number=1) + assert result.valid is False + assert any("'content' must be a string or list" in e for e in result.errors) + + def test_content_block_not_dict_warning(self): + content = ["just a string in the list"] + line = _make_line(role="assistant", content=content) + result = validate_jsonl_line(line, line_number=1) + assert result.valid is True + assert any("is not a dict" in w for w in result.warnings) + + @pytest.mark.parametrize( + "meta_type", + [ + "attachment", + "file-history-snapshot", + "last-prompt", + "permission-mode", + "progress", + "queue-operation", + "summary", + "system", + ], + ) + def test_meta_line_types_are_valid(self, meta_type): + raw = json.dumps({"type": meta_type, "data": "something"}) + result = validate_jsonl_line(raw, line_number=1) + assert result.valid is True + assert result.errors == [] + assert result.warnings == [] + + def test_line_number_is_preserved(self): + result = validate_jsonl_line("{bad json", line_number=99) + assert result.line_number == 99 + + def test_string_content_is_valid(self): + line = _make_line(role="user", content="plain text content") + result = validate_jsonl_line(line, line_number=1) + assert result.valid is True + + +# --------------------------------------------------------------------------- +# 2. 
validate_session +# --------------------------------------------------------------------------- + + +class TestValidateSession: + """Tests for full session validation.""" + + def test_empty_session(self): + result = validate_session([]) + assert result.valid is False + assert result.message_count == 0 + assert any("no user messages" in e for e in result.errors) + assert any("no assistant messages" in e for e in result.errors) + + def test_single_user_message(self): + entries = [_make_user_line()] + result = validate_session(entries) + assert result.valid is False + assert result.message_count == 1 + assert any("no assistant messages" in e for e in result.errors) + + def test_single_assistant_message(self): + entries = [_make_assistant_line()] + result = validate_session(entries) + assert result.valid is False + assert result.message_count == 1 + assert any("no user messages" in e for e in result.errors) + + def test_valid_user_assistant_pair(self): + entries = [ + _make_user_line(timestamp="2026-04-01T10:00:00.000Z"), + _make_assistant_line(timestamp="2026-04-01T10:00:01.000Z"), + ] + result = validate_session(entries) + assert result.valid is True + assert result.message_count == 2 + assert result.errors == [] + + def test_timestamp_out_of_order(self): + entries = [ + _make_user_line( + uuid="u1", timestamp="2026-04-01T10:00:05.000Z" + ), + _make_assistant_line( + uuid="a1", timestamp="2026-04-01T10:00:01.000Z" + ), + ] + result = validate_session(entries) + assert result.valid is False + assert any("Timestamp out of order" in e for e in result.errors) + + def test_negative_token_count(self): + entries = [ + _make_user_line(), + _make_assistant_line( + usage={ + "input_tokens": -5, + "output_tokens": 10, + "cache_creation_input_tokens": 0, + "cache_read_input_tokens": 0, + } + ), + ] + result = validate_session(entries) + assert result.valid is False + assert any("Negative token count" in e for e in result.errors) + + def 
test_tool_result_without_matching_tool_use(self): + entries = [ + _make_user_line( + content=[ + {"type": "tool_result", "tool_use_id": "orphan_id", "content": "ok"} + ] + ), + _make_assistant_line(), + ] + result = validate_session(entries) + assert result.valid is False + assert any("non-existent tool_use ids" in e for e in result.errors) + + def test_tool_use_without_matching_tool_result_warning(self): + entries = [ + _make_user_line(), + _make_assistant_line( + content=[ + { + "type": "tool_use", + "id": "t_orphan", + "name": "bash", + "input": {}, + }, + {"type": "text", "text": "done"}, + ] + ), + ] + result = validate_session(entries) + assert any("without matching tool_result" in w for w in result.warnings) + + def test_matched_tool_use_and_result(self): + entries = [ + _make_user_line(), + _make_assistant_line( + content=[ + { + "type": "tool_use", + "id": "t1", + "name": "bash", + "input": {"cmd": "ls"}, + } + ] + ), + _make_user_line( + uuid="u2", + content=[ + {"type": "tool_result", "tool_use_id": "t1", "content": "file.txt"} + ], + ), + _make_assistant_line(uuid="a2"), + ] + result = validate_session(entries) + assert result.valid is True + assert result.errors == [] + assert result.warnings == [] + + def test_meta_lines_are_skipped(self): + entries = [ + {"type": "system", "data": "init"}, + _make_user_line(), + {"type": "summary", "text": "summary"}, + _make_assistant_line(), + ] + result = validate_session(entries) + assert result.valid is True + assert result.message_count == 2 + + def test_string_content_counts_as_block(self): + entries = [ + _make_user_line(content="simple string"), + _make_assistant_line(), + ] + result = validate_session(entries) + assert result.content_block_count >= 2 + + def test_non_dict_entries_skipped(self): + entries = [ + "not a dict", + 42, + None, + _make_user_line(), + _make_assistant_line(), + ] + result = validate_session(entries) + assert result.valid is True + assert result.message_count == 2 + + +# 
--------------------------------------------------------------------------- +# 3. validate_jsonl_file +# --------------------------------------------------------------------------- + + +class TestValidateJsonlFile: + """Tests for whole-file JSONL validation.""" + + def test_valid_file(self, tmp_path): + f = tmp_path / "valid.jsonl" + lines = [ + _make_line(role="user", content="Hi"), + _make_line( + line_type="assistant", + role="assistant", + content=[{"type": "text", "text": "Hello"}], + uuid="a1", + ), + ] + f.write_text("\n".join(lines), encoding="utf-8") + + result = validate_jsonl_file(str(f)) + assert result.valid is True + assert result.total_lines == 2 + assert result.valid_lines == 2 + assert result.error_lines == [] + assert result.errors == [] + + def test_nonexistent_file(self): + with pytest.raises(FileNotFoundError, match="JSONL file not found"): + validate_jsonl_file("/tmp/does_not_exist_abc123.jsonl") + + def test_file_with_invalid_lines(self, tmp_path): + f = tmp_path / "mixed.jsonl" + valid = _make_line(role="user", content="Hello") + invalid = "{bad json" + f.write_text(f"{valid}\n{invalid}\n", encoding="utf-8") + + result = validate_jsonl_file(str(f)) + assert result.valid is False + assert result.total_lines == 2 + assert result.valid_lines == 1 + assert 2 in result.error_lines + assert len(result.errors) == 1 + assert "Invalid JSON" in result.errors[0] + + def test_file_with_empty_lines(self, tmp_path): + f = tmp_path / "empties.jsonl" + valid = _make_line(role="user", content="Hello") + f.write_text(f"{valid}\n\n\n", encoding="utf-8") + + result = validate_jsonl_file(str(f)) + assert result.valid is True + assert any("Empty line" in w for w in result.warnings) + + def test_file_with_meta_lines(self, tmp_path): + f = tmp_path / "meta.jsonl" + meta = json.dumps({"type": "permission-mode", "mode": "default"}) + valid = _make_line(role="user", content="Hello") + f.write_text(f"{meta}\n{valid}\n", encoding="utf-8") + + result = 
validate_jsonl_file(str(f)) + assert result.valid is True + assert result.total_lines == 2 + assert result.valid_lines == 2 + + def test_file_all_invalid(self, tmp_path): + f = tmp_path / "bad.jsonl" + f.write_text("{bad\n{also bad\n", encoding="utf-8") + + result = validate_jsonl_file(str(f)) + assert result.valid is False + assert result.total_lines == 2 + assert result.valid_lines == 0 + assert result.error_lines == [1, 2] + assert len(result.errors) == 2 + + def test_file_accepts_path_object(self, tmp_path): + f = tmp_path / "path_obj.jsonl" + f.write_text(_make_line(role="user", content="Hi"), encoding="utf-8") + result = validate_jsonl_file(f) # Pass Path object, not str + assert result.valid is True + + +# --------------------------------------------------------------------------- +# 4. assess_completeness +# --------------------------------------------------------------------------- + + +class TestAssessCompleteness: + """Tests for session completeness assessment.""" + + def test_complete_session(self): + entries = [ + _make_user_line(), + _make_assistant_line(stop_reason="end_turn"), + ] + result = assess_completeness(entries) + assert result.is_complete is True + assert result.completeness_score == 1.0 + assert result.issues == [] + + def test_no_assistant_messages(self): + entries = [_make_user_line()] + result = assess_completeness(entries) + assert result.is_complete is False + assert result.completeness_score < 1.0 + assert any("No assistant messages" in i for i in result.issues) + + def test_wrong_stop_reason(self): + entries = [ + _make_user_line(), + _make_assistant_line(stop_reason="max_tokens"), + ] + result = assess_completeness(entries) + assert result.is_complete is False + assert result.completeness_score < 1.0 + assert any("stop_reason" in i for i in result.issues) + + def test_none_stop_reason(self): + entries = [ + _make_user_line(), + _make_assistant_line(stop_reason=None), + ] + result = assess_completeness(entries) + assert 
result.is_complete is False + + def test_unresolved_tool_use(self): + entries = [ + _make_user_line(), + _make_assistant_line( + content=[ + { + "type": "tool_use", + "id": "t1", + "name": "bash", + "input": {}, + }, + {"type": "text", "text": "done"}, + ], + stop_reason="end_turn", + ), + ] + result = assess_completeness(entries) + assert result.is_complete is False + assert any("unresolved tool_use" in i for i in result.issues) + + def test_session_ends_mid_tool_use(self): + entries = [ + _make_user_line(), + _make_assistant_line( + content=[ + { + "type": "tool_use", + "id": "t1", + "name": "bash", + "input": {}, + } + ], + stop_reason="tool_use", + ), + ] + result = assess_completeness(entries) + assert result.is_complete is False + assert any("ends mid-tool-use" in i for i in result.issues) + + def test_completeness_score_clamped_to_zero(self): + result = assess_completeness([]) + assert result.completeness_score >= 0.0 + + def test_resolved_tool_use_is_complete(self): + entries = [ + _make_user_line(), + _make_assistant_line( + uuid="a1", + content=[ + { + "type": "tool_use", + "id": "t1", + "name": "bash", + "input": {}, + } + ], + stop_reason="tool_use", + ), + _make_user_line( + uuid="u2", + content=[ + {"type": "tool_result", "tool_use_id": "t1", "content": "done"} + ], + ), + _make_assistant_line( + uuid="a2", + content=[{"type": "text", "text": "Finished."}], + stop_reason="end_turn", + ), + ] + result = assess_completeness(entries) + assert result.is_complete is True + assert result.completeness_score == 1.0 + + def test_meta_lines_ignored(self): + entries = [ + {"type": "system", "data": "init"}, + _make_user_line(), + _make_assistant_line(stop_reason="end_turn"), + ] + result = assess_completeness(entries) + assert result.is_complete is True + + +# --------------------------------------------------------------------------- +# 5. 
generate_health_report +# --------------------------------------------------------------------------- + + +class TestGenerateHealthReport: + """Tests for the pre-analysis health report.""" + + def test_basic_health_report(self): + entries = [ + _make_user_line(content="Hello"), + _make_assistant_line( + content=[{"type": "text", "text": "Hi there!"}] + ), + ] + report = generate_health_report(entries) + assert report.user_message_count == 1 + assert report.assistant_message_count == 1 + assert report.content_distribution.text_count >= 2 + assert report.generation_tokens > 0 + assert report.parsing_warnings == [] + + def test_empty_session_report(self): + report = generate_health_report([]) + assert report.user_message_count == 0 + assert report.assistant_message_count == 0 + assert report.estimated_total_tokens == 0 + assert report.content_distribution.total == 0 + + def test_content_distribution_counts(self): + entries = [ + _make_user_line(), + _make_assistant_line( + content=[ + {"type": "thinking", "thinking": "Let me think..."}, + {"type": "text", "text": "Here is the answer"}, + { + "type": "tool_use", + "id": "t1", + "name": "bash", + "input": {"cmd": "ls"}, + }, + ] + ), + _make_user_line( + uuid="u2", + content=[ + {"type": "tool_result", "tool_use_id": "t1", "content": "file.txt"} + ], + ), + ] + report = generate_health_report(entries) + dist = report.content_distribution + assert dist.thinking_count == 1 + assert dist.text_count >= 1 + assert dist.tool_use_count == 1 + assert dist.tool_result_count == 1 + assert dist.total >= 4 + + def test_reasoning_tokens_from_thinking_blocks(self): + entries = [ + _make_user_line(), + _make_assistant_line( + content=[ + {"type": "thinking", "thinking": "Deep reasoning here " * 20}, + {"type": "text", "text": "Answer"}, + ] + ), + ] + report = generate_health_report(entries) + assert report.reasoning_tokens > 0 + assert report.generation_tokens > 0 + + def test_tool_use_tokens(self): + entries = [ + _make_user_line(), 
+ _make_assistant_line( + content=[ + { + "type": "tool_use", + "id": "t1", + "name": "bash", + "input": {"cmd": "echo hello world"}, + }, + {"type": "text", "text": "Done"}, + ] + ), + _make_user_line( + uuid="u2", + content=[ + { + "type": "tool_result", + "tool_use_id": "t1", + "content": "hello world", + } + ], + ), + ] + report = generate_health_report(entries) + assert report.tool_use_tokens > 0 + + def test_api_tokens_used_when_available(self): + entries = [ + _make_user_line(), + _make_assistant_line( + usage={ + "input_tokens": 100, + "output_tokens": 50, + "cache_creation_input_tokens": 10, + "cache_read_input_tokens": 5, + } + ), + ] + report = generate_health_report(entries) + assert report.estimated_total_tokens == 165 # 100 + 50 + 10 + 5 + + def test_fallback_to_estimated_tokens(self): + entries = [ + _make_user_line(content="Hello world"), + _make_assistant_line( + content=[{"type": "text", "text": "Goodbye world"}] + ), + ] + report = generate_health_report(entries) + assert report.estimated_total_tokens > 0 + assert report.estimated_total_tokens == ( + report.reasoning_tokens + report.tool_use_tokens + report.generation_tokens + ) + + def test_non_dict_entry_produces_warning(self): + entries = ["not a dict", _make_user_line(), _make_assistant_line()] + report = generate_health_report(entries) + assert len(report.parsing_warnings) == 1 + assert "Non-dict" in report.parsing_warnings[0] + + def test_meta_lines_skipped(self): + entries = [ + {"type": "system", "data": "init"}, + {"type": "summary", "text": "blah"}, + _make_user_line(), + _make_assistant_line(), + ] + report = generate_health_report(entries) + assert report.user_message_count == 1 + assert report.assistant_message_count == 1 + + def test_estimated_analysis_seconds(self): + entries = [ + _make_user_line(content="Hello"), + _make_assistant_line( + content=[ + {"type": "text", "text": "A"}, + {"type": "text", "text": "B"}, + {"type": "text", "text": "C"}, + ] + ), + ] + report = 
generate_health_report(entries) + # 1 span for user string + 3 spans for assistant blocks = 4 spans + # 4 * 0.0005 = 0.002 + assert report.estimated_analysis_seconds == pytest.approx(0.002) + + def test_string_content_counted_as_text(self): + entries = [ + _make_user_line(content="Plain text user message"), + _make_assistant_line(), + ] + report = generate_health_report(entries) + assert report.content_distribution.text_count >= 1 + assert report.generation_tokens > 0 + + +# --------------------------------------------------------------------------- +# 6. ContentDistribution properties +# --------------------------------------------------------------------------- + + +class TestContentDistribution: + """Tests for ContentDistribution percentage calculations.""" + + def test_total(self): + cd = ContentDistribution( + text_count=5, + tool_use_count=3, + tool_result_count=3, + thinking_count=2, + other_count=1, + ) + assert cd.total == 14 + + def test_percentages(self): + cd = ContentDistribution( + text_count=50, + tool_use_count=25, + tool_result_count=15, + thinking_count=10, + other_count=0, + ) + assert cd.text_pct == 50.0 + assert cd.tool_use_pct == 25.0 + assert cd.tool_result_pct == 15.0 + assert cd.thinking_pct == 10.0 + assert cd.other_pct == 0.0 + + def test_zero_total_returns_zero_pct(self): + cd = ContentDistribution() + assert cd.total == 0 + assert cd.text_pct == 0.0 + assert cd.tool_use_pct == 0.0 + assert cd.tool_result_pct == 0.0 + assert cd.thinking_pct == 0.0 + assert cd.other_pct == 0.0 + + +# --------------------------------------------------------------------------- +# 7. 
Dataclass defaults +# --------------------------------------------------------------------------- + + +class TestDataclassDefaults: + """Tests for dataclass default values.""" + + def test_validation_result_defaults(self): + vr = ValidationResult(valid=True) + assert vr.errors == [] + assert vr.warnings == [] + assert vr.line_number == 0 + + def test_session_validation_result_defaults(self): + svr = SessionValidationResult(valid=True) + assert svr.errors == [] + assert svr.warnings == [] + assert svr.message_count == 0 + assert svr.content_block_count == 0 + + def test_file_validation_result_defaults(self): + fvr = FileValidationResult(valid=True, total_lines=0, valid_lines=0) + assert fvr.error_lines == [] + assert fvr.errors == [] + assert fvr.warnings == [] + + def test_health_report_defaults(self): + hr = HealthReport( + user_message_count=0, + assistant_message_count=0, + estimated_total_tokens=0, + content_distribution=ContentDistribution(), + reasoning_tokens=0, + tool_use_tokens=0, + generation_tokens=0, + ) + assert hr.parsing_warnings == [] + assert hr.estimated_analysis_seconds == 0.0 + + def test_completeness_assessment_defaults(self): + ca = CompletenessAssessment( + is_complete=True, completeness_score=1.0 + ) + assert ca.issues == [] diff --git a/tests/unit/test_waste_detectors.py b/tests/unit/test_waste_detectors.py new file mode 100644 index 0000000..cd2a9cb --- /dev/null +++ b/tests/unit/test_waste_detectors.py @@ -0,0 +1,954 @@ +"""Tests for extended waste pattern detectors (waste_detectors.py).""" + +import pytest + +from ter_calculator.models import ( + ClassifiedSpan, + SpanLabel, + SpanPhase, + TokenSpan, +) +from ter_calculator.waste_detectors import ( + ExtendedWasteType, + detect_abandoned_approaches, + detect_all_extended, + detect_error_retry_spirals, + detect_over_reading, + detect_permission_loops, + detect_verbose_thinking, +) + + +# --------------------------------------------------------------------------- +# Helper factories +# 
# ---------------------------------------------------------------------------


def _make_cs(
    phase: SpanPhase,
    text: str = "test",
    position: int = 0,
    token_count: int = 50,
    block_type: str = "",
    label: SpanLabel = SpanLabel.ALIGNED_TOOL_CALL,
) -> ClassifiedSpan:
    """Create a ClassifiedSpan fixture wrapping a freshly built TokenSpan.

    When ``block_type`` is empty it is derived from ``phase``: ``"tool_use"``
    for ``SpanPhase.TOOL_USE``, ``"thinking"`` for ``SpanPhase.REASONING`` and
    ``"text"`` for anything else. The source message uuid, confidence and
    cosine similarity are fixed constants because the tests in this module
    never vary them.
    """
    resolved_block_type = block_type or (
        "tool_use"
        if phase == SpanPhase.TOOL_USE
        else "thinking"
        if phase == SpanPhase.REASONING
        else "text"
    )
    return ClassifiedSpan(
        span=TokenSpan(
            text=text,
            phase=phase,
            position=position,
            token_count=token_count,
            source_message_uuid="msg-1",
            block_type=resolved_block_type,
        ),
        label=label,
        confidence=0.9,
        cosine_similarity=0.5,
    )


def _tool_use(text: str, position: int, token_count: int = 50) -> ClassifiedSpan:
    """Return a TOOL_USE-phase span tagged with block type ``"tool_use"``."""
    return _make_cs(
        phase=SpanPhase.TOOL_USE,
        block_type="tool_use",
        text=text,
        position=position,
        token_count=token_count,
    )


def _tool_result(text: str, position: int, token_count: int = 20) -> ClassifiedSpan:
    """Return a TOOL_USE-phase span tagged with block type ``"tool_result"``."""
    return _make_cs(
        phase=SpanPhase.TOOL_USE,
        block_type="tool_result",
        text=text,
        position=position,
        token_count=token_count,
    )


def _reasoning(text: str, position: int, token_count: int = 100) -> ClassifiedSpan:
    """Return a REASONING-phase span labelled ALIGNED_REASONING."""
    return _make_cs(
        phase=SpanPhase.REASONING,
        label=SpanLabel.ALIGNED_REASONING,
        text=text,
        position=position,
        token_count=token_count,
    )


def _generation(text: str, position: int, token_count: int = 50) -> ClassifiedSpan:
    """Return a GENERATION-phase span labelled ALIGNED_RESPONSE."""
    return _make_cs(
        phase=SpanPhase.GENERATION,
        label=SpanLabel.ALIGNED_RESPONSE,
        text=text,
        position=position,
        token_count=token_count,
    )


# ===================================================================
# 1.
# detect_permission_loops
# ===================================================================


class TestDetectPermissionLoops:
    """Tests for detect_permission_loops (repeated calls after a denial)."""

    def test_empty_input(self):
        """An empty span list yields an empty pattern list."""
        assert detect_permission_loops([]) == []

    def test_no_tool_spans_returns_empty(self):
        """Non-tool spans should produce no permission-loop patterns."""
        spans = [
            _reasoning("thinking about it", position=0),
            _generation("some output", position=1),
        ]
        assert detect_permission_loops(spans) == []

    def test_no_permission_issues(self):
        """Tool calls that succeed should produce no patterns."""
        spans = [
            _tool_use('Bash {"command":"ls"}', position=0),
            _tool_result("file1.py file2.py", position=1),
            _tool_use('Bash {"command":"cat file1.py"}', position=2),
            _tool_result("contents...", position=3),
        ]
        assert detect_permission_loops(spans) == []

    def test_detects_permission_loop_default_min_retries(self):
        """Three identical calls with denial results between them = 2 retries."""
        spans = [
            _tool_use('Bash {"command":"rm /etc/passwd"}', position=0),
            _tool_result("permission denied", position=1),
            _tool_use('Bash {"command":"rm /etc/passwd"}', position=2),
            _tool_result("permission denied", position=3),
            _tool_use('Bash {"command":"rm /etc/passwd"}', position=4),
        ]
        patterns = detect_permission_loops(spans)
        assert len(patterns) == 1
        p = patterns[0]
        assert p.pattern_type == ExtendedWasteType.PERMISSION_LOOP.value
        assert p.details["tool_name"] == "Bash"
        assert p.details["retries"] == 2
        assert p.start_position == 0
        assert p.end_position == 4
        assert p.spans_involved == 3
        # Wasted tokens = token_count of the 2 retries (positions 2 and 4)
        assert p.tokens_wasted == 100

    def test_below_min_retries_threshold(self):
        """Only 1 retry (2 calls total) with default min_retries=2 -- not flagged."""
        spans = [
            _tool_use('Bash {"command":"rm /root/x"}', position=0),
            _tool_result("access denied", position=1),
            _tool_use('Bash {"command":"rm /root/x"}', position=2),
        ]
        assert detect_permission_loops(spans) == []

    def test_custom_min_retries_1(self):
        """Lowering min_retries=1 should flag a single retry."""
        spans = [
            _tool_use('Bash {"command":"rm /root/x"}', position=0),
            _tool_result("access denied", position=1),
            _tool_use('Bash {"command":"rm /root/x"}', position=2),
        ]
        patterns = detect_permission_loops(spans, min_retries=1)
        assert len(patterns) == 1
        assert patterns[0].details["retries"] == 1

    def test_high_min_retries_not_flagged(self):
        """Raising min_retries above actual retries prevents detection."""
        spans = [
            _tool_use('Write {"file_path":"/etc/secret"}', position=0),
            _tool_result("permission denied", position=1),
            _tool_use('Write {"file_path":"/etc/secret"}', position=2),
            _tool_result("permission denied", position=3),
            _tool_use('Write {"file_path":"/etc/secret"}', position=4),
        ]
        # 2 retries, but we require 3
        patterns = detect_permission_loops(spans, min_retries=3)
        assert patterns == []

    def test_different_tool_breaks_chain(self):
        """Switching to a different tool between denied calls breaks the chain."""
        spans = [
            _tool_use('Bash {"command":"rm /root/x"}', position=0),
            _tool_result("permission denied", position=1),
            _tool_use('Read {"file_path":"/root/x"}', position=2),
            _tool_result("permission denied", position=3),
            _tool_use('Bash {"command":"rm /root/x"}', position=4),
        ]
        # The chain for Bash is broken by the intervening Read tool_use
        assert detect_permission_loops(spans) == []

    # Parametrized rather than looped so that every keyword is reported as its
    # own test case and one failing keyword cannot mask the others.
    @pytest.mark.parametrize(
        "keyword",
        [
            "permission denied",
            "not allowed",
            "access denied",
            "EACCES: operation not permitted",
            "unauthorized request",
        ],
    )
    def test_all_permission_keywords(self, keyword):
        """Each denial keyword should be recognised (case-insensitive)."""
        spans = [
            _tool_use('Write {"file_path":"/etc/secret"}', position=0),
            _tool_result(keyword, position=1),
            _tool_use('Write {"file_path":"/etc/secret"}', position=2),
            _tool_result(keyword, position=3),
            _tool_use('Write {"file_path":"/etc/secret"}', position=4),
        ]
        patterns = detect_permission_loops(spans)
        assert len(patterns) == 1, f"Failed for keyword: {keyword}"

    def test_intervening_reasoning_does_not_break_chain(self):
        """Reasoning spans between tool_use spans should not affect detection."""
        spans = [
            _tool_use('Bash {"command":"sudo rm"}', position=0),
            _tool_result("permission denied", position=1),
            _reasoning("Let me try again", position=2),
            _tool_use('Bash {"command":"sudo rm"}', position=3),
            _tool_result("permission denied", position=4),
            _reasoning("Still denied, trying once more", position=5),
            _tool_use('Bash {"command":"sudo rm"}', position=6),
        ]
        patterns = detect_permission_loops(spans)
        assert len(patterns) == 1
        assert patterns[0].details["retries"] == 2

    def test_no_denial_result_between_calls(self):
        """If the result between two identical calls is not a denial, no pattern."""
        spans = [
            _tool_use('Bash {"command":"make"}', position=0),
            _tool_result("build succeeded", position=1),
            _tool_use('Bash {"command":"make"}', position=2),
            _tool_result("build succeeded", position=3),
            _tool_use('Bash {"command":"make"}', position=4),
        ]
        assert detect_permission_loops(spans) == []


# ===================================================================
# 2.
# detect_error_retry_spirals
# ===================================================================


class TestDetectErrorRetrySpirals:
    """Tests for detect_error_retry_spirals (similar calls retried after errors)."""

    def test_empty_input(self):
        """An empty span list yields an empty pattern list."""
        assert detect_error_retry_spirals([]) == []

    def test_no_errors(self):
        """Successful tool calls should produce no patterns."""
        spans = [
            _tool_use('Bash {"command":"ls"}', position=0),
            _tool_result("file1.py", position=1),
            _tool_use('Bash {"command":"cat file1.py"}', position=2),
            _tool_result("content", position=3),
        ]
        assert detect_error_retry_spirals(spans) == []

    def test_detects_error_spiral_default_min_3(self):
        """4 identical calls with error results between them = 3 retries."""
        spans = [
            _tool_use('Bash {"command":"python run.py --flag=val"}', position=0),
            _tool_result("error: ModuleNotFoundError", position=1),
            _tool_use('Bash {"command":"python run.py --flag=val"}', position=2),
            _tool_result("error: ModuleNotFoundError", position=3),
            _tool_use('Bash {"command":"python run.py --flag=val"}', position=4),
            _tool_result("error: ModuleNotFoundError", position=5),
            _tool_use('Bash {"command":"python run.py --flag=val"}', position=6),
        ]
        patterns = detect_error_retry_spirals(spans)
        assert len(patterns) == 1
        p = patterns[0]
        assert p.pattern_type == ExtendedWasteType.ERROR_RETRY_SPIRAL.value
        assert p.details["tool_name"] == "Bash"
        assert p.details["retries"] == 3
        assert p.spans_involved == 4

    def test_below_min_retries_threshold(self):
        """2 retries with default min_retries=3 should not be flagged."""
        spans = [
            _tool_use('Bash {"command":"make build"}', position=0),
            _tool_result("error: compilation failed", position=1),
            _tool_use('Bash {"command":"make build"}', position=2),
            _tool_result("error: compilation failed", position=3),
            _tool_use('Bash {"command":"make build"}', position=4),
        ]
        assert detect_error_retry_spirals(spans) == []

    def test_custom_min_retries_lower(self):
        """With min_retries=2, two retries should be flagged."""
        spans = [
            _tool_use('Bash {"command":"make build"}', position=0),
            _tool_result("error: compilation failed", position=1),
            _tool_use('Bash {"command":"make build"}', position=2),
            _tool_result("error: compilation failed", position=3),
            _tool_use('Bash {"command":"make build"}', position=4),
        ]
        patterns = detect_error_retry_spirals(spans, min_retries=2)
        assert len(patterns) == 1
        assert patterns[0].details["retries"] == 2

    def test_significantly_different_params_break_chain(self):
        """Completely different params should have low similarity and break chain."""
        spans = [
            _tool_use(
                'Bash {"command":"python run.py --mode=fast --verbose"}',
                position=0,
            ),
            _tool_result("error: failed to parse", position=1),
            _tool_use(
                'Bash {"command":"node server.js --port=3000 --host=localhost"}',
                position=2,
            ),
            _tool_result("error: failed to start", position=3),
            _tool_use(
                'Bash {"command":"cargo build --release --target=x86_64"}',
                position=4,
            ),
            _tool_result("error: missing dependency", position=5),
            _tool_use(
                'Bash {"command":"go run main.go --config=/etc/app.yaml"}',
                position=6,
            ),
        ]
        # Same tool name but very different params -> low similarity -> no chain
        patterns = detect_error_retry_spirals(spans, min_retries=2)
        assert len(patterns) == 0

    def test_different_tool_breaks_chain(self):
        """Switching tool names should break the chain."""
        spans = [
            _tool_use('Bash {"command":"ls"}', position=0),
            _tool_result("error: no such file", position=1),
            _tool_use('Read {"file_path":"x.py"}', position=2),
            _tool_result("error: file not found", position=3),
            _tool_use('Bash {"command":"ls"}', position=4),
        ]
        patterns = detect_error_retry_spirals(spans, min_retries=1)
        assert len(patterns) == 0

    # Parametrized rather than looped so that every keyword is reported as its
    # own test case and one failing keyword cannot mask the others.
    @pytest.mark.parametrize(
        "keyword",
        [
            "Error occurred",
            "FAILED to execute",
            "Exception raised",
            "Traceback (most recent call last)",
        ],
    )
    def test_error_keywords_case_insensitive(self, keyword):
        """Error keywords are matched case-insensitively."""
        spans = [
            _tool_use('Bash {"command":"test"}', position=0),
            _tool_result(keyword, position=1),
            _tool_use('Bash {"command":"test"}', position=2),
            _tool_result(keyword, position=3),
            _tool_use('Bash {"command":"test"}', position=4),
            _tool_result(keyword, position=5),
            _tool_use('Bash {"command":"test"}', position=6),
        ]
        patterns = detect_error_retry_spirals(spans)
        assert len(patterns) == 1, f"Failed for keyword: {keyword}"

    def test_custom_similarity_threshold(self):
        """A lower similarity threshold allows more variation in params."""
        # Params vary slightly each time
        spans = [
            _tool_use('Bash {"command":"python test.py --flag=a"}', position=0),
            _tool_result("error: test failed", position=1),
            _tool_use('Bash {"command":"python test.py --flag=b"}', position=2),
            _tool_result("error: test failed", position=3),
            _tool_use('Bash {"command":"python test.py --flag=c"}', position=4),
            _tool_result("error: test failed", position=5),
            _tool_use('Bash {"command":"python test.py --flag=d"}', position=6),
        ]
        # With a very strict threshold these may not chain; with relaxed they will
        patterns_strict = detect_error_retry_spirals(
            spans, similarity_threshold=0.99
        )
        patterns_relaxed = detect_error_retry_spirals(
            spans, similarity_threshold=0.50
        )
        # Relaxed should find at least as many patterns as strict
        assert len(patterns_relaxed) >= len(patterns_strict)

    def test_wasted_tokens_excludes_first_call(self):
        """tokens_wasted should only count retry calls, not the original."""
        spans = [
            _tool_use('Bash {"command":"test"}', position=0, token_count=100),
            _tool_result("error: fail", position=1),
            _tool_use('Bash {"command":"test"}', position=2, token_count=100),
            _tool_result("error: fail", position=3),
            _tool_use('Bash {"command":"test"}', position=4, token_count=100),
            _tool_result("error: fail", position=5),
            _tool_use('Bash {"command":"test"}', position=6, token_count=100),
        ]
        patterns = detect_error_retry_spirals(spans)
        assert len(patterns) == 1
        # 3 retries x 100 tokens = 300 wasted (first call excluded)
        assert patterns[0].tokens_wasted == 300


# ===================================================================
# 3. detect_over_reading
# ===================================================================


class TestDetectOverReading:
    """Tests for detect_over_reading (the same file read repeatedly)."""

    def test_empty_input(self):
        """An empty span list yields an empty pattern list."""
        assert detect_over_reading([]) == []

    def test_single_read_no_pattern(self):
        """A single read should never produce a pattern."""
        spans = [
            _tool_use('Read {"file_path":"src/main.py"}', position=0),
            _tool_result("def main(): pass", position=1),
        ]
        assert detect_over_reading(spans) == []

    def test_two_reads_no_pattern_default_min(self):
        """Two reads total = 1 redundant read, below default min_reads=2."""
        spans = [
            _tool_use('Read {"file_path":"src/main.py"}', position=0),
            _tool_result("content", position=1),
            _tool_use('Read {"file_path":"src/main.py"}', position=2),
            _tool_result("content", position=3),
        ]
        assert detect_over_reading(spans) == []

    def test_three_reads_detected(self):
        """Three reads of the same file = 2 redundant, triggers default min_reads=2."""
        spans = [
            _tool_use('Read {"file_path":"src/main.py"}', position=0),
            _tool_result("content", position=1),
            _tool_use('Read {"file_path":"src/main.py"}', position=2),
            _tool_result("content", position=3),
            _tool_use('Read {"file_path":"src/main.py"}', position=4),
            _tool_result("content", position=5),
        ]
        patterns = detect_over_reading(spans)
        assert len(patterns) == 1
        p = patterns[0]
        assert p.pattern_type == ExtendedWasteType.OVER_READING.value
        assert p.details["file_path"] == "src/main.py"
        assert p.details["read_count"] == 3
        assert p.details["redundant_reads"] == 2
        # Wasted tokens = token_count of the 2 redundant reads
        assert p.tokens_wasted == 100  # 2 x 50

    def test_edit_resets_read_count(self):
        """An intervening Edit to the same file resets the read chain."""
        spans = [
            _tool_use('Read {"file_path":"src/main.py"}', position=0),
            _tool_result("content", position=1),
            _tool_use('Read {"file_path":"src/main.py"}', position=2),
            _tool_result("content", position=3),
            _tool_use('Edit {"file_path":"src/main.py"}', position=4),
            _tool_result("ok", position=5),
            _tool_use('Read {"file_path":"src/main.py"}', position=6),
            _tool_result("content", position=7),
        ]
        # After Edit at position 4, tracker resets. Only 1 read post-edit.
        assert detect_over_reading(spans) == []

    def test_write_resets_read_count(self):
        """Write tool also resets the read chain."""
        spans = [
            _tool_use('Read {"file_path":"x.py"}', position=0),
            _tool_use('Read {"file_path":"x.py"}', position=1),
            _tool_use('Read {"file_path":"x.py"}', position=2),
            _tool_use('Write {"file_path":"x.py"}', position=3),
            _tool_use('Read {"file_path":"x.py"}', position=4),
        ]
        # Write at position 3 resets; only 1 read afterwards
        assert detect_over_reading(spans) == []

    def test_different_files_tracked_independently(self):
        """Reads of different files should be tracked separately."""
        spans = [
            _tool_use('Read {"file_path":"a.py"}', position=0),
            _tool_use('Read {"file_path":"b.py"}', position=1),
            _tool_use('Read {"file_path":"a.py"}', position=2),
            _tool_use('Read {"file_path":"b.py"}', position=3),
        ]
        # Each file read twice = 1 redundant read each, below min_reads=2
        assert detect_over_reading(spans) == []

    def test_custom_min_reads_1(self):
        """Lowering min_reads=1 flags files read just twice."""
        spans = [
            _tool_use('Read {"file_path":"src/main.py"}', position=0),
            _tool_result("content", position=1),
            _tool_use('Read {"file_path":"src/main.py"}', position=2),
            _tool_result("content", position=3),
        ]
        patterns = detect_over_reading(spans, min_reads=1)
        assert len(patterns) == 1
        assert patterns[0].details["redundant_reads"] == 1

    def test_cat_tool_recognised_as_read(self):
        """The 'cat' tool name should also be treated as a read."""
        spans = [
            _tool_use('cat {"file_path":"src/main.py"}', position=0),
            _tool_use('cat {"file_path":"src/main.py"}', position=1),
            _tool_use('cat {"file_path":"src/main.py"}', position=2),
        ]
        patterns = detect_over_reading(spans)
        assert len(patterns) == 1

    def test_results_sorted_by_wasted_tokens_descending(self):
        """Multiple over-read files should be sorted by tokens_wasted descending."""
        spans = [
            # a.py read 3 times at 50 tokens each => 100 wasted
            _tool_use('Read {"file_path":"a.py"}', position=0, token_count=50),
            _tool_use('Read {"file_path":"a.py"}', position=1, token_count=50),
            _tool_use('Read {"file_path":"a.py"}', position=2, token_count=50),
            # b.py read 3 times at 200 tokens each => 400 wasted
            _tool_use('Read {"file_path":"b.py"}', position=3, token_count=200),
            _tool_use('Read {"file_path":"b.py"}', position=4, token_count=200),
            _tool_use('Read {"file_path":"b.py"}', position=5, token_count=200),
        ]
        patterns = detect_over_reading(spans)
        assert len(patterns) == 2
        assert patterns[0].details["file_path"] == "b.py"
        assert patterns[1].details["file_path"] == "a.py"

    def test_path_key_fallback(self):
        """When 'file_path' is absent, 'path' key should be used."""
        spans = [
            _tool_use('Read {"path":"src/utils.py"}', position=0),
            _tool_use('Read {"path":"src/utils.py"}', position=1),
            _tool_use('Read {"path":"src/utils.py"}', position=2),
        ]
        patterns = detect_over_reading(spans)
        assert len(patterns) == 1
        assert patterns[0].details["file_path"] == "src/utils.py"

    def test_no_file_path_spans_ignored(self):
        """Tool calls without parseable file paths are skipped."""
        spans = [
            _tool_use("Bash {}", position=0),
            _tool_use("Bash {}", position=1),
            _tool_use("Bash {}", position=2),
        ]
        assert detect_over_reading(spans) == []

    def test_non_tool_use_spans_ignored(self):
        """Reasoning / generation spans should not affect over-reading detection."""
        spans = [
            _tool_use('Read {"file_path":"x.py"}', position=0),
            _reasoning("thinking", position=1),
            _tool_use('Read {"file_path":"x.py"}', position=2),
            _generation("output", position=3),
            _tool_use('Read {"file_path":"x.py"}', position=4),
        ]
        patterns = detect_over_reading(spans)
        assert len(patterns) == 1


# ===================================================================
# 4. detect_abandoned_approaches
# ===================================================================


class TestDetectAbandonedApproaches:
    """Tests for detect_abandoned_approaches (files edited, then never revisited)."""

    def test_empty_input(self):
        """An empty span list yields an empty pattern list."""
        assert detect_abandoned_approaches([]) == []

    def test_no_abandonment_when_file_revisited(self):
        """File edited and then touched again later -- not abandoned."""
        spans = [
            _tool_use('Edit {"file_path":"src/a.py"}', position=0),
            _tool_result("ok", position=1),
            _tool_use('Read {"file_path":"src/b.py"}', position=2),
            _tool_result("content", position=3),
            _tool_use('Read {"file_path":"src/a.py"}', position=4),
            _tool_result("content", position=5),
        ]
        assert detect_abandoned_approaches(spans) == []

    def test_detects_abandoned_file(self):
        """File edited, then agent moves to different file and never returns."""
        spans = [
            _tool_use(
                'Edit {"file_path":"src/attempt1.py"}',
                position=0,
                token_count=80,
            ),
            _tool_result("ok", position=1),
            _tool_use(
                'Edit {"file_path":"src/attempt2.py"}',
                position=2,
                token_count=60,
            ),
            _tool_result("ok", position=3),
        ]
        patterns = detect_abandoned_approaches(spans)
        assert len(patterns) == 1
        p = patterns[0]
        assert p.pattern_type == ExtendedWasteType.ABANDONED_APPROACH.value
        assert p.details["file_path"] == "src/attempt1.py"
        assert "attempt1.py" in p.description

    def test_last_file_not_abandoned(self):
        """The last file touched should not be flagged (no subsequent work)."""
        spans = [
            _tool_use('Edit {"file_path":"src/only.py"}', position=0),
            _tool_result("ok", position=1),
        ]
        assert detect_abandoned_approaches(spans) == []

    def test_file_revisited_later_not_abandoned(self):
        """A file touched again after other work is not abandoned.

        The file edited in between and never revisited is still flagged.
        """
        spans = [
            _tool_use('Edit {"file_path":"src/a.py"}', position=0),
            _tool_use('Edit {"file_path":"src/b.py"}', position=2),
            _tool_use('Edit {"file_path":"src/a.py"}', position=4),
        ]
        # a.py revisited at position 4, so not abandoned.
        # b.py: last touch is 2, agent works on a.py at 4 -> b.py abandoned.
        patterns = detect_abandoned_approaches(spans)
        assert any(p.details["file_path"] == "src/b.py" for p in patterns)
        assert not any(p.details["file_path"] == "src/a.py" for p in patterns)

    def test_write_tool_also_counts(self):
        """Write tool should be recognised same as Edit for abandonment."""
        spans = [
            _tool_use('Write {"file_path":"src/temp.py"}', position=0),
            _tool_result("ok", position=1),
            _tool_use('Edit {"file_path":"src/main.py"}', position=2),
            _tool_result("ok", position=3),
        ]
        patterns = detect_abandoned_approaches(spans)
        assert len(patterns) == 1
        assert patterns[0].details["file_path"] == "src/temp.py"

    def test_only_reads_no_abandonment(self):
        """Reading files (not editing) should not produce abandoned-approach patterns."""
        spans = [
            _tool_use('Read {"file_path":"src/a.py"}', position=0),
            _tool_result("content", position=1),
            _tool_use('Read {"file_path":"src/b.py"}', position=2),
            _tool_result("content", position=3),
        ]
        assert detect_abandoned_approaches(spans) == []

    def test_no_file_path_spans_ignored(self):
        """Tool calls without parseable file paths should be skipped."""
        spans = [
            _tool_use("Bash {}", position=0),
            _tool_result("ok", position=1),
        ]
        assert detect_abandoned_approaches(spans) == []

    def test_multiple_abandoned_files(self):
        """Multiple files can be flagged as abandoned."""
        spans = [
            _tool_use('Edit {"file_path":"src/a.py"}', position=0, token_count=100),
            _tool_use('Edit {"file_path":"src/b.py"}', position=1, token_count=200),
            _tool_use('Edit {"file_path":"src/final.py"}', position=2, token_count=50),
        ]
        patterns = detect_abandoned_approaches(spans)
        abandoned_files = {p.details["file_path"] for p in patterns}
        assert "src/a.py" in abandoned_files
        assert "src/b.py" in abandoned_files
        # final.py is the last file -- not abandoned
        assert "src/final.py" not in abandoned_files

    def test_results_sorted_by_wasted_tokens_descending(self):
        """Patterns should be sorted by tokens_wasted descending."""
        spans = [
            _tool_use('Edit {"file_path":"src/small.py"}', position=0, token_count=50),
            _tool_use('Edit {"file_path":"src/large.py"}', position=1, token_count=500),
            _tool_use('Edit {"file_path":"src/final.py"}', position=2, token_count=10),
        ]
        patterns = detect_abandoned_approaches(spans)
        assert len(patterns) == 2
        assert patterns[0].tokens_wasted >= patterns[1].tokens_wasted

    def test_duplicate_file_not_reported_twice(self):
        """Same file edited multiple times then abandoned should only appear once."""
        spans = [
            _tool_use('Edit {"file_path":"src/dup.py"}', position=0, token_count=100),
            _tool_use('Edit {"file_path":"src/dup.py"}', position=1, token_count=100),
            _tool_use('Edit {"file_path":"src/other.py"}', position=2, token_count=50),
        ]
        patterns = detect_abandoned_approaches(spans)
        dup_patterns = [p for p in patterns if p.details["file_path"] == "src/dup.py"]
        assert len(dup_patterns) == 1


# ===================================================================
# 5.
# detect_verbose_thinking
# ===================================================================


class TestDetectVerboseThinking:
    """Tests for detect_verbose_thinking (reasoning far larger than its action)."""

    def test_empty_input(self):
        """No spans in, no patterns out."""
        flagged = detect_verbose_thinking([])
        assert flagged == []

    def test_no_thinking_spans(self):
        """A timeline without any reasoning span is never flagged."""
        timeline = [
            _generation("output text", position=0),
            _tool_use('Bash {"command":"ls"}', position=1),
        ]
        flagged = detect_verbose_thinking(timeline)
        assert flagged == []

    def test_proportional_thinking_not_flagged(self):
        """200 thinking tokens before a 50-token action is ratio 4.0 (< 10.0)."""
        timeline = [
            _reasoning("Let me think about this...", position=0, token_count=200),
            _tool_use('Bash {"command":"ls"}', position=1, token_count=50),
        ]
        flagged = detect_verbose_thinking(timeline)
        assert flagged == []

    def test_below_min_thinking_tokens_not_flagged(self):
        """Ratio 20.0 exceeds the default, but 100 tokens is under the 500 minimum."""
        timeline = [
            _reasoning("Short thought", position=0, token_count=100),
            _tool_use('Bash {"command":"ls"}', position=1, token_count=5),
        ]
        flagged = detect_verbose_thinking(timeline)
        assert flagged == []

    def test_detects_verbose_thinking(self):
        """6000 thinking tokens before a 50-token action is reported in full detail."""
        timeline = [
            _reasoning("Very long reasoning...", position=0, token_count=6000),
            _tool_use('Bash {"command":"ls"}', position=1, token_count=50),
        ]
        flagged = detect_verbose_thinking(timeline)
        assert len(flagged) == 1
        hit = flagged[0]
        assert hit.pattern_type == ExtendedWasteType.VERBOSE_THINKING.value
        # ratio = 6000 / 50 = 120.0, above the 10.0 default; 6000 > 500 minimum
        assert hit.details["thinking_tokens"] == 6000
        assert hit.details["action_tokens"] == 50
        assert hit.details["ratio"] == 120.0
        assert hit.start_position == 0
        assert hit.end_position == 1
        # excess = 6000 - (50 * 10)
        assert hit.tokens_wasted == 5500

    def test_thinking_with_no_subsequent_action(self):
        """A trailing reasoning span with nothing after it is wasted entirely."""
        timeline = [
            _reasoning("Final rumination...", position=0, token_count=1000),
        ]
        flagged = detect_verbose_thinking(timeline)
        assert len(flagged) == 1
        hit = flagged[0]
        assert hit.details["action_tokens"] == 0
        assert hit.tokens_wasted == 1000

    def test_thinking_followed_by_zero_token_action(self):
        """A zero-token action gives an infinite ratio and is still flagged."""
        timeline = [
            _reasoning("Thinking...", position=0, token_count=600),
            _tool_use('Bash {"command":""}', position=1, token_count=0),
        ]
        flagged = detect_verbose_thinking(timeline)
        assert len(flagged) == 1
        assert flagged[0].details["ratio"] == float("inf")

    def test_custom_ratio_threshold(self):
        """ratio_threshold moves the cut-off: ratio 12.0 passes 15.0, fails 5.0."""
        timeline = [
            _reasoning("Moderate thinking", position=0, token_count=600),
            _tool_use('Bash {"command":"ls"}', position=1, token_count=50),
        ]
        lenient = detect_verbose_thinking(timeline, ratio_threshold=15.0)
        assert lenient == []
        strict = detect_verbose_thinking(timeline, ratio_threshold=5.0)
        assert len(strict) == 1

    def test_custom_min_thinking_tokens(self):
        """min_thinking_tokens gates detection independently of the ratio."""
        timeline = [
            _reasoning("Some thinking", position=0, token_count=200),
            _tool_use('Bash {"command":"ls"}', position=1, token_count=10),
        ]
        # ratio 20.0 exceeds 10.0, but 200 tokens is under the default 500 floor
        with_default_floor = detect_verbose_thinking(timeline)
        assert with_default_floor == []
        # dropping the floor to 100 lets the same span through
        with_low_floor = detect_verbose_thinking(timeline, min_thinking_tokens=100)
        assert len(with_low_floor) == 1

    def test_skips_reasoning_to_find_next_action(self):
        """Each of two consecutive reasoning spans is paired with the later action."""
        timeline = [
            _reasoning("First thought", position=0, token_count=2000),
            _reasoning("Second thought", position=1, token_count=1000),
            _tool_use('Bash {"command":"ls"}', position=2, token_count=50),
        ]
        # first span:  2000 / 50 = 40.0 against the tool_use at position 2
        # second span: 1000 / 50 = 20.0 against the same tool_use
        flagged = detect_verbose_thinking(timeline)
        assert len(flagged) == 2

    def test_generation_as_action(self):
        """A generation span counts as the action following the reasoning."""
        timeline = [
            _reasoning("Deep thinking...", position=0, token_count=600),
            _generation("Here is the answer", position=1, token_count=50),
        ]
        # ratio = 600 / 50 = 12.0, above the 10.0 default
        flagged = detect_verbose_thinking(timeline)
        assert len(flagged) == 1
        assert flagged[0].details["action_tokens"] == 50

    def test_excess_calculation(self):
        """tokens_wasted equals thinking_tokens - (action_tokens * threshold)."""
        timeline = [
            _reasoning("Long thought", position=0, token_count=1000),
            _tool_use("Bash {}", position=1, token_count=50),
        ]
        # ratio = 1000 / 50 = 20.0; excess = 1000 - (50 * 10) = 500
        flagged = detect_verbose_thinking(timeline)
        assert len(flagged) == 1
        assert flagged[0].tokens_wasted == 500


# ===================================================================
# 6.
# detect_all_extended
# ===================================================================


class TestDetectAllExtended:
    """Tests for detect_all_extended, the aggregate of the extended detectors."""

    @staticmethod
    def _types_in(found):
        """Collect the distinct pattern_type values present in ``found``."""
        return {entry.pattern_type for entry in found}

    def test_empty_input(self):
        """No spans in, no patterns out."""
        found = detect_all_extended([])
        assert found == []

    def test_returns_list(self):
        """The result is a list even when nothing is detectable."""
        timeline = [
            _reasoning("thinking", position=0),
            _generation("output", position=1),
        ]
        found = detect_all_extended(timeline)
        assert isinstance(found, list)

    def test_combines_multiple_detectors(self):
        """Patterns from several detectors show up in one combined result."""
        timeline = [
            # Permission loop: 3 calls with denial
            _tool_use('Bash {"command":"rm /root"}', position=0, token_count=30),
            _tool_result("permission denied", position=1),
            _tool_use('Bash {"command":"rm /root"}', position=2, token_count=30),
            _tool_result("permission denied", position=3),
            _tool_use('Bash {"command":"rm /root"}', position=4, token_count=30),
            # Over-reading: same file read 3 times
            _tool_use(
                'Read {"file_path":"config.yaml"}', position=10, token_count=40
            ),
            _tool_use(
                'Read {"file_path":"config.yaml"}', position=11, token_count=40
            ),
            _tool_use(
                'Read {"file_path":"config.yaml"}', position=12, token_count=40
            ),
            # Verbose thinking
            _reasoning("Lots of thinking...", position=20, token_count=5000),
            _tool_use('Bash {"command":"echo hi"}', position=21, token_count=10),
        ]

        seen_types = self._types_in(detect_all_extended(timeline))
        assert ExtendedWasteType.PERMISSION_LOOP.value in seen_types
        assert ExtendedWasteType.OVER_READING.value in seen_types
        assert ExtendedWasteType.VERBOSE_THINKING.value in seen_types

    def test_sorted_by_start_position(self):
        """The combined result is ordered by ascending start_position."""
        timeline = [
            # Verbose thinking at position 20
            _reasoning("Lots of thinking...", position=20, token_count=5000),
            _tool_use('Bash {"command":"echo hi"}', position=21, token_count=10),
            # Permission loop at position 0
            _tool_use('Bash {"command":"rm /root"}', position=0, token_count=30),
            _tool_result("permission denied", position=1),
            _tool_use('Bash {"command":"rm /root"}', position=2, token_count=30),
            _tool_result("permission denied", position=3),
            _tool_use('Bash {"command":"rm /root"}', position=4, token_count=30),
        ]
        found = detect_all_extended(timeline)
        starts = [entry.start_position for entry in found]
        assert starts == sorted(starts)

    def test_forwards_permission_min_retries(self):
        """permission_min_retries reaches the permission-loop detector."""
        timeline = [
            _tool_use('Bash {"command":"rm /root"}', position=0, token_count=30),
            _tool_result("permission denied", position=1),
            _tool_use('Bash {"command":"rm /root"}', position=2, token_count=30),
            _tool_result("permission denied", position=3),
            _tool_use('Bash {"command":"rm /root"}', position=4, token_count=30),
        ]
        loop_type = ExtendedWasteType.PERMISSION_LOOP.value

        # Two retries satisfy the default min_retries=2
        assert loop_type in self._types_in(detect_all_extended(timeline))

        # Requiring three retries suppresses the pattern
        strict = detect_all_extended(timeline, permission_min_retries=3)
        assert loop_type not in self._types_in(strict)

    def test_forwards_verbose_thinking_params(self):
        """verbose_min_thinking_tokens reaches the verbose-thinking detector."""
        timeline = [
            _reasoning("Thinking...", position=0, token_count=300),
            _tool_use("Bash {}", position=1, token_count=10),
        ]
        verbose_type = ExtendedWasteType.VERBOSE_THINKING.value

        # 300 tokens is under the default 500-token floor
        assert verbose_type not in self._types_in(detect_all_extended(timeline))

        # Lowering the floor to 100 lets the span through
        lowered = detect_all_extended(timeline, verbose_min_thinking_tokens=100)
        assert verbose_type in self._types_in(lowered)

    def test_forwards_over_reading_min_reads(self):
        """over_reading_min_reads reaches the over-reading detector."""
        timeline = [
            _tool_use('Read {"file_path":"x.py"}', position=0),
            _tool_use('Read {"file_path":"x.py"}', position=1),
        ]
        over_read_type = ExtendedWasteType.OVER_READING.value

        # Default min_reads=2 needs three reads in total
        assert over_read_type not in self._types_in(detect_all_extended(timeline))

        # With min_reads=1 two reads are enough
        lowered = detect_all_extended(timeline, over_reading_min_reads=1)
        assert over_read_type in self._types_in(lowered)