diff --git a/src/semble/cli.py b/src/semble/cli.py index 008f875..39286ae 100644 --- a/src/semble/cli.py +++ b/src/semble/cli.py @@ -115,7 +115,7 @@ def _load_index(path: str, content: list[ContentType]) -> SembleIndex: def _run_search(path: str, query: str, top_k: int, content: list[ContentType], max_snippet_lines: int | None) -> None: """Handle the `search` subcommand.""" index = _load_index(path, content) - results = index.search(query, top_k=top_k) + results = index.search(query, top_k=top_k, max_snippet_lines=max_snippet_lines) out = format_results(query, results, max_snippet_lines) if results else {"error": "No results found."} print(json.dumps(out)) _maybe_save_index(index, path) @@ -130,7 +130,7 @@ def _run_find_related( if chunk is None: print(f"No chunk found at {file_path}:{line}.", file=sys.stderr) sys.exit(1) - results = index.find_related(chunk, top_k=top_k) + results = index.find_related(chunk, top_k=top_k, max_snippet_lines=max_snippet_lines) label = f"Chunks related to {file_path}:{line}" out = ( format_results(label, results, max_snippet_lines) diff --git a/src/semble/index/index.py b/src/semble/index/index.py index 126cecc..f51eac8 100644 --- a/src/semble/index/index.py +++ b/src/semble/index/index.py @@ -224,18 +224,21 @@ def from_git( content=normalized, ) - def find_related(self, source: Chunk | SearchResult, *, top_k: int = 5) -> list[SearchResult]: + def find_related( + self, source: Chunk | SearchResult, *, top_k: int = 5, max_snippet_lines: int | None = None + ) -> list[SearchResult]: """Return chunks semantically similar to the given chunk or search result. :param source: A SearchResult or Chunk to use as the seed. :param top_k: Number of similar chunks to return. + :param max_snippet_lines: Lines of content to count for savings stats. None = full chunk. :return: Ranked list of SearchResult objects, most similar first. """ target = source.chunk if isinstance(source, SearchResult) else source selector = self._get_selector_vector(filter_languages=[target.language]) if target.language else None results = _search_semantic(target.content, self.model, self._semantic_index, self.chunks, top_k + 1, selector) results = [r for r in results if r.chunk != target][:top_k] - save_search_stats(results, CallType.FIND_RELATED, self._file_sizes) + save_search_stats(results, CallType.FIND_RELATED, self._file_sizes, max_snippet_lines) return results def _get_selector_vector( @@ -258,6 +261,7 @@ def search( filter_languages: list[str] | None = None, filter_paths: list[str] | None = None, rerank: bool | None = None, + max_snippet_lines: int | None = None, ) -> list[SearchResult]: """Search the index and return the top-k most relevant chunks. @@ -271,6 +275,7 @@ def search( chunks from these files are returned. :param rerank: Apply code-tuned reranking (file boost, identifier boost, path penalties). Defaults to True when ContentType.CODE was indexed. + :param max_snippet_lines: Lines of content to count for savings stats. None = full chunk. :return: Ranked list of SearchResult objects, best match first. """ if not self.chunks or not query.strip(): @@ -290,7 +295,7 @@ def search( selector=selector, rerank=resolved_rerank, ) - save_search_stats(results, CallType.SEARCH, self._file_sizes) + save_search_stats(results, CallType.SEARCH, self._file_sizes, max_snippet_lines) return results @classmethod diff --git a/src/semble/mcp.py b/src/semble/mcp.py index 5fb6683..85a8526 100644 --- a/src/semble/mcp.py +++ b/src/semble/mcp.py @@ -78,6 +78,7 @@ async def search( "If the snippet does not contain enough context to confirm you have the right location, " "call again with max_snippet_lines=None." ), + ge=0, ), ] = 10, ) -> str: @@ -91,7 +92,7 @@ async def search( index = await _get_index(repo, default_source, cache) except ValueError as exc: return str(exc) - results = index.search(query, top_k=top_k) + results = index.search(query, top_k=top_k, max_snippet_lines=max_snippet_lines) if not results: return json.dumps({"error": "No results found."}) return json.dumps(format_results(query, results, max_snippet_lines)) @@ -111,7 +112,8 @@ async def find_related( description=( "Lines of source per result. " "Default 10 = signature + first body lines. 0 = location only. None = full chunk." - ) + ), + ge=0, ), ] = 10, ) -> str: @@ -131,7 +133,7 @@ async def find_related( f"No chunk found at {file_path}:{line}. " "Make sure the file is indexed and the line number is within a known chunk." ) - results = index.find_related(chunk, top_k=top_k) + results = index.find_related(chunk, top_k=top_k, max_snippet_lines=max_snippet_lines) if not results: return json.dumps({"error": f"No related chunks found for {file_path}:{line}."}) label = f"Chunks related to {file_path}:{line}" diff --git a/src/semble/stats.py b/src/semble/stats.py index 31c2753..3c6288c 100644 --- a/src/semble/stats.py +++ b/src/semble/stats.py @@ -65,10 +65,18 @@ def save_search_stats( results: list[SearchResult], call_type: CallType, file_sizes: dict[str, int], + max_snippet_lines: int | None = None, ) -> None: """Save stats about a search or find_related call to the stats file.""" try: - snippet_chars = sum(len(result.chunk.content) for result in results) + snippet_chars = sum( + len("\n".join(result.chunk.content.splitlines()[:max_snippet_lines])) + if max_snippet_lines and max_snippet_lines > 0 + else 0 + if max_snippet_lines == 0 + else len(result.chunk.content) + for result in results + ) file_chars = sum( file_sizes[path] for path in {result.chunk.file_path for result in results} if path in file_sizes )