diff --git a/src/review_classification/cli/app.py b/src/review_classification/cli/app.py index f9764bd..461704a 100644 --- a/src/review_classification/cli/app.py +++ b/src/review_classification/cli/app.py @@ -8,6 +8,7 @@ import typer from .config import RepoConfig, parse_config_file +from .output import RepoClassifyResult, format_combined_results from .parser import GitHubRepo app = typer.Typer(help="Identify PR review outliers in GitHub repositories") @@ -98,16 +99,12 @@ def _detect_single( repo_name: str, threshold: float, min_samples: int, - output_format: str, verbose: bool, classify_start: str | None, classify_end: str | None, exclude_primary_merged: bool = False, -) -> tuple[bool, str]: - """Run outlier detection for a single repository. - - Returns (success, output_or_error_message). - """ +) -> RepoClassifyResult: + """Run outlier detection for a single repository.""" repo = GitHubRepo.from_string(repo_name) full_name = f"{repo.owner}/{repo.name}" @@ -148,7 +145,6 @@ def _detect_single( from ..features.engineering import create_pr_features from ..sqlite.database import get_session, init_db, save_pr_features from ..sqlite.models import PullRequest - from .output import format_outlier_results init_db() session = get_session() @@ -161,7 +157,10 @@ def _detect_single( prs = list(session.exec(statement).all()) if len(prs) == 0: - return False, f"{full_name}: no PRs found — run 'fetch' command first." + return RepoClassifyResult( + repo_name=full_name, + error="No PRs found — run 'fetch' command first.", + ) for pr in prs: if pr.id is not None: @@ -201,28 +200,32 @@ def _detect_single( f"({len(outliers) / len(results) * 100:.1f}%)" ) - output = format_outlier_results(results, output_format, repo_name=full_name) # type: ignore - return True, output + return RepoClassifyResult( + repo_name=full_name, + results=results, + total_prs=len(results), + ) except InsufficientDataError as e: - return False, f"{full_name}: insufficient data — {e}" + msg = str(e) + # e.g. "Repository x has only 18 merged PRs, need at least 30" + short = msg.split("has only")[-1].strip() if "has only" in msg else msg + return RepoClassifyResult( + repo_name=full_name, + error=f"Sample limit not reached — {short}", + ) finally: session.close() def _print_detect_results( - successes: list[str], failures: list[str], raise_on_failure: bool = True + repo_results: list[RepoClassifyResult], + output_format: str, ) -> None: - """Print deferred results: all successful outputs first, then error summaries.""" - for output in successes: - typer.echo(output) - - if failures: - typer.echo("\nRepositories that could not be classified:", err=True) - for msg in failures: - typer.echo(f" - {msg}", err=True) - if raise_on_failure: - raise typer.Exit(code=1) + """Print combined results for all repositories; exit 1 if any failed.""" + typer.echo(format_combined_results(repo_results, output_format)) # type: ignore + if any(not r.success for r in repo_results): + raise typer.Exit(code=1) # --------------------------------------------------------------------------- @@ -512,26 +515,20 @@ def classify( default_end=end, ) - successes: list[str] = [] - failures: list[str] = [] - - for target in targets: - ok, message = _detect_single( + repo_results = [ + _detect_single( target.name, target.threshold if target.threshold is not None else threshold, target.min_samples if target.min_samples is not None else min_samples, - output_format, verbose, target.start, target.end, exclude_primary_merged=exclude_primary_merged, ) - if ok: - successes.append(message) - else: - failures.append(message) + for target in targets + ] - _print_detect_results(successes, failures) + _print_detect_results(repo_results, output_format) except (FileNotFoundError, ValueError) as e: typer.echo(f"Error: {e}", err=True) @@ -685,26 +682,20 @@ def fetch_and_classify( verbose, ) - successes: list[str] = [] - failures: list[str] = [] - - for target in targets: - ok, message = _detect_single( + repo_results = [ + _detect_single( target.name, target.threshold if target.threshold is not None else threshold, target.min_samples if target.min_samples is not None else min_samples, - output_format, verbose, target.start, target.end, exclude_primary_merged=exclude_primary_merged, ) - if ok: - successes.append(message) - else: - failures.append(message) + for target in targets + ] - _print_detect_results(successes, failures) + _print_detect_results(repo_results, output_format) except (FileNotFoundError, ValueError) as e: typer.echo(f"Error: {e}", err=True) diff --git a/src/review_classification/cli/output.py b/src/review_classification/cli/output.py index e06c522..a91831f 100644 --- a/src/review_classification/cli/output.py +++ b/src/review_classification/cli/output.py @@ -1,133 +1,215 @@ """Output formatting for outlier detection results.""" import json +from dataclasses import dataclass, field from typing import Literal from ..analysis.outlier_detector import OutlierResult +# Per-column maximum display widths (characters before truncation) +_MAX_AUTHOR = 20 +_MAX_FEATURES = 40 +_MAX_TITLE = 50 +_MAX_NOTE = 65 # error/note text in the features cell -def format_outlier_results( - results: list[OutlierResult], + +@dataclass +class RepoClassifyResult: + """Classification results for a single repository.""" + + repo_name: str + results: list[OutlierResult] = field(default_factory=list) + total_prs: int = 0 + error: str | None = None + + @property + def success(self) -> bool: + return self.error is None + + +def format_combined_results( + repo_results: list[RepoClassifyResult], format_type: Literal["table", "json", "csv"] = "table", - repo_name: str | None = None, ) -> str: - """Format outlier detection results. + """Format classification results for one or more repositories. Args: - results: List of OutlierResult from detection - format_type: Output format (table, json, or csv) + repo_results: Results for each repository, in any order. + format_type: Output format (table, json, or csv). Returns: - Formatted string output + Formatted string output. """ - outliers = [r for r in results if r.is_outlier] + sorted_repos = sorted(repo_results, key=lambda r: r.repo_name) if format_type == "json": - return _format_json(outliers) + return _format_combined_json(sorted_repos) elif format_type == "csv": - return _format_csv(outliers) + return _format_combined_csv(sorted_repos) else: - return _format_table(outliers, total_prs=len(results), repo_name=repo_name) + return _format_combined_table(sorted_repos) -def _format_table( - outliers: list[OutlierResult], total_prs: int, repo_name: str | None = None -) -> str: - """Format as ASCII table.""" - if not outliers: - return f"No outliers detected out of {total_prs} PRs analyzed." - - lines = [] - if repo_name: - lines.append(f"\nRepository: {repo_name}") - lines.append("\nOutlier Pull Requests (ordered by most recently merged)") - lines.append("=" * 150) - lines.append( - f"{'PR #':<8} {'Merged':<12} {'Author':<20} {'Max |Z|':<10} " - f"{'Outlier Features':<30} {'Title':<70}" - ) - lines.append("-" * 150) +# --------------------------------------------------------------------------- +# Table helpers +# --------------------------------------------------------------------------- - # Sort by merged_at descending (most recent first) - sorted_outliers = sorted( - outliers, - key=lambda x: x.merged_at if x.merged_at else "", - reverse=True, - ) - for outlier in sorted_outliers: - features_str = ", ".join(outlier.outlier_features) - if len(features_str) > 28: - features_str = features_str[:25] + "..." +def _truncate(text: str, max_len: int) -> str: + """Truncate text to max_len, appending … if shortened.""" + return text if len(text) <= max_len else text[: max_len - 1] + "\u2026" - # Format merge date - merged_date = ( - outlier.merged_at.strftime("%Y-%m-%d") if outlier.merged_at else "N/A" - ) - # Truncate author and title if too long - author = outlier.author[:18] if len(outlier.author) > 18 else outlier.author - title = outlier.title[:68] if len(outlier.title) > 68 else outlier.title +def _fmt_row(cells: list[str], widths: list[int]) -> str: + """Render one markdown table row with cells padded to column widths.""" + padded = (cell.ljust(widths[i]) for i, cell in enumerate(cells)) + return "| " + " | ".join(padded) + " |" - lines.append( - f"#{outlier.pr_number:<7} {merged_date:<12} {author:<20} " - f"{outlier.max_abs_z_score:<10.2f} {features_str:<30} {title}" - ) - lines.append("-" * 150) - lines.append( - f"Total outliers: {len(outliers)} out of {total_prs} PRs " - f"({len(outliers) / total_prs * 100:.1f}%)" - ) +def _separator(widths: list[int]) -> str: + """Render the markdown header-separator row.""" + return "|" + "|".join("-" * (w + 2) for w in widths) + "|" - return "\n".join(lines) +# --------------------------------------------------------------------------- +# Table output +# --------------------------------------------------------------------------- -def _format_json(outliers: list[OutlierResult]) -> str: - """Format as JSON.""" - # Sort by merged_at descending (most recent first) - sorted_outliers = sorted( - outliers, - key=lambda x: x.merged_at if x.merged_at else "", - reverse=True, - ) - data = [ - { - "pr_number": o.pr_number, - "title": o.title, - "author": o.author, - "merged_at": o.merged_at.isoformat() if o.merged_at else None, - "is_outlier": o.is_outlier, - "max_abs_z_score": o.max_abs_z_score, - "outlier_features": o.outlier_features, - "z_scores": {k: v for k, v in o.z_scores.items() if v is not None}, - } - for o in sorted_outliers - ] - return json.dumps(data, indent=2) +def _format_combined_table(repo_results: list[RepoClassifyResult]) -> str: + """Render a single padded markdown/plain-text table ordered by repository. + Each cell is padded to the maximum width in its column so the table + aligns in both a terminal and a markdown renderer. + """ + headers = [ + "Repository", + "PR #", + "Merged", + "Author", + "Max Z", + "Outlier Features", + "Title", + ] -def _format_csv(outliers: list[OutlierResult]) -> str: - """Format as CSV.""" - lines = ["pr_number,merged_at,author,title,max_abs_z_score,outlier_features"] + # Build raw cell data for every row + rows: list[list[str]] = [] + for repo in repo_results: + if not repo.success: + note = _truncate(repo.error or "Could not be classified", _MAX_NOTE) + rows.append([f"`{repo.repo_name}`", "—", "—", "—", "—", note, "—"]) + continue + + outliers = sorted( + (r for r in repo.results if r.is_outlier), + key=lambda x: x.merged_at if x.merged_at else "", + reverse=True, + ) + for o in outliers: + merged = o.merged_at.strftime("%Y-%m-%d") if o.merged_at else "—" + rows.append( + [ + f"`{repo.repo_name}`", + f"#{o.pr_number}", + merged, + _truncate(o.author, _MAX_AUTHOR), + f"{o.max_abs_z_score:.2f}", + _truncate(", ".join(o.outlier_features), _MAX_FEATURES), + _truncate(o.title, _MAX_TITLE), + ] + ) + + # Column widths = max(header width, max cell width in that column) + widths = [len(h) for h in headers] + for row in rows: + for i, cell in enumerate(row): + widths[i] = max(widths[i], len(cell)) + + lines: list[str] = [ + _fmt_row(headers, widths), + _separator(widths), + *(_fmt_row(row, widths) for row in rows), + "", + ] - # Sort by merged_at descending (most recent first) - sorted_outliers = sorted( - outliers, - key=lambda x: x.merged_at if x.merged_at else "", - reverse=True, + # Summary footer + successful = [r for r in repo_results if r.success] + failed = [r for r in repo_results if not r.success] + total_outliers = sum( + sum(1 for r in repo.results if r.is_outlier) for repo in successful ) + total_prs = sum(r.total_prs for r in successful) - for outlier in sorted_outliers: - features_str = ";".join(outlier.outlier_features) - merged_date = outlier.merged_at.isoformat() if outlier.merged_at else "" - # Escape fields that might contain commas - title = f'"{outlier.title}"' if "," in outlier.title else outlier.title - author = f'"{outlier.author}"' if "," in outlier.author else outlier.author + if successful: lines.append( - f"{outlier.pr_number},{merged_date},{author},{title}," - f"{outlier.max_abs_z_score:.4f},{features_str}" + f"**{total_outliers} outlier(s)** found across {total_prs} PRs " + f"in {len(successful)} classified repo(s)." ) + if failed: + lines.append(f"**{len(failed)} repo(s)** could not be classified (see above).") + + return "\n".join(lines) + +# --------------------------------------------------------------------------- +# JSON output +# --------------------------------------------------------------------------- + + +def _format_combined_json(repo_results: list[RepoClassifyResult]) -> str: + """Format as a JSON array of outlier records, each tagged with repository.""" + records = [] + for repo in repo_results: + if not repo.success: + continue + for o in sorted( + (r for r in repo.results if r.is_outlier), + key=lambda x: x.merged_at if x.merged_at else "", + reverse=True, + ): + records.append( + { + "repository": repo.repo_name, + "pr_number": o.pr_number, + "title": o.title, + "author": o.author, + "merged_at": o.merged_at.isoformat() if o.merged_at else None, + "is_outlier": o.is_outlier, + "max_abs_z_score": o.max_abs_z_score, + "outlier_features": o.outlier_features, + "z_scores": {k: v for k, v in o.z_scores.items() if v is not None}, + } + ) + return json.dumps(records, indent=2) + + +# --------------------------------------------------------------------------- +# CSV output +# --------------------------------------------------------------------------- + + +def _format_combined_csv(repo_results: list[RepoClassifyResult]) -> str: + """Format as CSV with a leading repository column.""" + lines = [ + "repository,pr_number,merged_at,author,title,max_abs_z_score,outlier_features" + ] + for repo in repo_results: + if not repo.success: + continue + for o in sorted( + (r for r in repo.results if r.is_outlier), + key=lambda x: x.merged_at if x.merged_at else "", + reverse=True, + ): + features_str = ";".join(o.outlier_features) + merged_date = o.merged_at.isoformat() if o.merged_at else "" + title = f'"{o.title}"' if "," in o.title else o.title + author = f'"{o.author}"' if "," in o.author else o.author + repo_col = ( + f'"{repo.repo_name}"' if "," in repo.repo_name else repo.repo_name + ) + lines.append( + f"{repo_col},{o.pr_number},{merged_date},{author},{title}," + f"{o.max_abs_z_score:.4f},{features_str}" + ) return "\n".join(lines)