code-next-ai/main.py at main · github-hc/code-next-ai · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368

import sys
import argparse
import shutil
import os
os.environ["ANONYMIZED_TELEMETRY"] = "false"
import json
from pathlib import Path

def load_settings():
    """
    Load optional runtime settings from ``settings.json`` in the current
    working directory.

    Returns:
        dict: The parsed settings. An empty dict is returned when the file is
        missing, cannot be read or parsed, or when its top-level JSON value is
        not an object -- callers use ``.get()`` on the result, so it must
        always be a dict.
    """
    settings_path = Path("settings.json")
    try:
        with open(settings_path, "r", encoding="utf-8") as f:
            loaded = json.load(f)
    except Exception:
        # Settings are optional: a missing, unreadable or malformed file
        # simply means "use the defaults".
        return {}
    # Valid JSON that is not an object (e.g. a list) cannot act as settings.
    return loaded if isinstance(loaded, dict) else {}
# Settings are read once, at import time, from settings.json in the current working directory.
SETTINGS = load_settings()
# Logging switches; each one falls back to False when its key is absent from SETTINGS.
LOG_CHUNKS     = SETTINGS.get("log_chunks", False)      # gates writes to chunks_log.txt in build_index
LOG_AST_PARSER = SETTINGS.get("log_ast_parser", False)  # gates the ASTParser symbol dump to log.txt
LOG_PARSER     = SETTINGS.get("log_parser", False)      # gates writes to parser_log.txt in build_index

from backend.retrieval.scanner import FileScanner
from backend.retrieval.parser import ASTParser, TreeSitterParser
from backend.retrieval.chunker import ChunkBuilder
from backend.retrieval.embeddings import OllamaEmbedder
from backend.retrieval.vector_store import WeaviateVectorStore, QdrantVectorStore

from backend.generation.ollama_llm import OllamaLLM

def _parse_file_worker(file_path_str: str) -> tuple:
    """
    Parse one source file; intended to run inside a worker process.

    Files with a ``.py`` suffix go through ``ASTParser`` (with its own logging
    switched off); every other file goes through ``TreeSitterParser``.

    Returns:
        tuple: ``(file_path_str, parser_label, symbols)`` where ``parser_label``
        is ``"ASTParser"`` or ``"TreeSitterParser"`` and ``symbols`` is whatever
        the chosen parser's ``parse_file`` returned.
    """
    # Imported inside the function, as in the original worker.
    from pathlib import Path
    from backend.retrieval.parser import ASTParser, TreeSitterParser

    source_path = Path(file_path_str)

    if source_path.suffix == '.py':
        label, file_parser = "ASTParser", ASTParser(log_ast_parser=False)
    else:
        label, file_parser = "TreeSitterParser", TreeSitterParser()

    return file_path_str, label, file_parser.parse_file(source_path)


def _init_parser_log(repo_root, file_count):
    """Create (or truncate) parser_log.txt and write its header and legend."""
    with open("parser_log.txt", "w", encoding="utf-8") as f:
        f.write(f"=== PARSER LOG for {repo_root} ===\n")
        f.write(f"Total files to parse: {file_count}\n")
        f.write("\nSections per file:\n")
        f.write("  [PARSER]       -> which parser handled the file (ASTParser / TreeSitterParser)\n")
        f.write("  [symbol-N]     -> each extracted symbol with full metadata\n")
        f.write("  imports        -> file-level import statements seen by the parser\n")
        f.write("  function_calls -> calls found inside this symbol's body\n")
        f.write("\n" + "=" * 70 + "\n")


def _init_chunk_log(repo_root):
    """Create (or truncate) chunks_log.txt and write its header and legend."""
    with open("chunks_log.txt", "w", encoding="utf-8") as f:
        f.write(f"=== CHUNK LOG for {repo_root} ===\n\n")
        f.write("Legend:\n")
        f.write("  [chunk-N]            -> chunk was CREATED by the chunker\n")
        f.write("  [chunk-N embedded ✓] -> embedding was GENERATED and chunk was stored\n")
        f.write("  [chunk-N SKIPPED]    -> embedding was empty; chunk was NOT stored\n")
        f.write("\n" + "=" * 60 + "\n\n")


def _log_parsed_symbols(file, repo_root, parser_label, symbols):
    """Append the raw parser output for one file to parser_log.txt."""
    try:
        rel = file.relative_to(repo_root) if file.is_absolute() else file
    except ValueError:
        # File lies outside the repository root: log the path unchanged rather
        # than abort indexing over a log line.
        rel = file

    with open("parser_log.txt", "a", encoding="utf-8") as f:
        sep = "═" * 70
        f.write(f"\n{sep}\n")
        f.write(f"FILE    : {rel}\n")
        f.write(f"PARSER  : {parser_label}\n")
        f.write(f"SYMBOLS : {len(symbols)}\n")
        f.write(f"{sep}\n")
        if not symbols:
            f.write("  (no symbols extracted)\n")
        for s_idx, sym in enumerate(symbols, 1):
            f.write(f"\n  [symbol-{s_idx}]\n")
            f.write(f"    Name          : {sym.get('name', '?')}\n")
            f.write(f"    Type          : {sym.get('type', '?')}\n")
            f.write(f"    Lines         : {sym.get('start_line', '?')}-{sym.get('end_line', '?')}\n")
            docstr = (sym.get('docstring') or '').strip().replace('\n', ' ')
            f.write(f"    Docstring     : {docstr[:120]}\n")
            imports = sym.get('imports', [])
            f.write(f"    Imports ({len(imports):3d}) :\n")
            for imp in imports:
                f.write(f"      - {imp}\n")
            calls = sym.get('function_calls', [])
            f.write(f"    Calls   ({len(calls):3d}) :\n")
            for call in calls:
                f.write(f"      - {call}\n")
            f.write("    Code          :\n")
            for line in (sym.get('code') or '').splitlines():
                f.write(f"      {line}\n")


def _log_created_chunks(file, chunks, first_index):
    """Append every chunk built for one file to chunks_log.txt, numbered from first_index."""
    with open("chunks_log.txt", "a", encoding="utf-8") as f:
        f.write(f"\n{'─'*60}\n")
        f.write(f"FILE: {file.name}  ({len(chunks)} chunks)\n")
        f.write(f"{'─'*60}\n")
        for chunk_no, chunk in enumerate(chunks, first_index):
            label = f"[chunk-{chunk_no}]"
            f.write(
                f"{label}\n"
                f"  Symbol   : {chunk.symbol_name}\n"
                f"  Type     : {chunk.chunk_type}\n"
                f"  File     : {chunk.file_path}\n"
                f"  Lines    : {chunk.start_line}-{chunk.end_line}\n"
                f"  Docstring: {(chunk.docstring or '').strip()[:80]}\n"
                f"  Imports  ({len(chunk.imports):3d}) :\n"
            )
            for imp in chunk.imports:
                f.write(f"    - {imp}\n")
            f.write(f"  Calls    ({len(chunk.function_calls):3d}) :\n")
            for call in chunk.function_calls:
                f.write(f"    - {call}\n")
            f.write("  Code     :\n")
            for line in chunk.code.splitlines():
                f.write(f"    {line}\n")
            f.write("\n")


def _build_graph_index(graph_records):
    """Best-effort Neo4j graph build; any failure is reported and only disables graph search."""
    try:
        from backend.retrieval.graph_store import Neo4jGraphStore
        from backend.retrieval.graph_retrieval import GraphIndexBuilder

        print("[*] Attempting to build graph index in Neo4j...")
        with Neo4jGraphStore() as graph_store:
            if graph_records:
                builder = GraphIndexBuilder(graph_store)
                builder.build_index(graph_records)
                print(f"[✓] Graph index built with {len(graph_records)} chunks")
            else:
                print("[!] No chunks found for graph indexing")
    except Exception as e:
        print(f"[*] Graph indexing skipped: {e}")
        print("[*] Graph search will not be available. Vector search only.")


def build_index(repo_path: str):
    """
    Scans the repository, extracts symbols, generates embeddings, and stores them
    in the local Qdrant vector store (no Docker / server required).

    Steps, in order:
      1. Scan ``repo_path`` for source files with ``FileScanner``.
      2. Delete and recreate ``./qdrant_storage`` so no stale data survives.
      3. Parse all files in a process pool (``_parse_file_worker``).
      4. For each file: build chunks, embed their code in one batch, and hand
         the chunks to ``QdrantVectorStore.add_chunks``. Optional log files are
         written when the ``LOG_*`` settings are enabled.
      5. Try to build a Neo4j graph index from the same chunks; failures there
         are reported and otherwise ignored.

    Args:
        repo_path: Path to the repository; ``~`` is expanded and the path resolved.

    Returns:
        int: Number of chunks passed to the vector store.
    """
    repo_path_obj = Path(repo_path).expanduser().resolve()
    print(f"[*] Scanning repository: {repo_path_obj} ...")

    scanner = FileScanner(repo_path_obj)
    files = scanner.scan()
    print(f"[*] Found {len(files)} valid source files.")

    chunker = ChunkBuilder()
    embedder = OllamaEmbedder()

    total_chunks = 0
    global_chunk_counter = 0  # running index across all files for [chunk-N] labels
    # Plain-dict chunk records for the graph index, collected during the vector
    # pass so the repository is not chunked a second time.
    graph_records = []

    # ── Wipe the entire Qdrant storage directory so no stale data survives ──
    qdrant_storage_path = Path("./qdrant_storage")
    if qdrant_storage_path.exists():
        print("[*] Clearing previous Qdrant storage from disk...")
        shutil.rmtree(qdrant_storage_path)
    qdrant_storage_path.mkdir(parents=True, exist_ok=True)

    # ── Parse files in parallel using multiprocessing ───────────────────────
    # NOTE(review): an exception raised in a worker propagates from
    # future.result() and aborts indexing after the storage was already wiped.
    print("[*] Parsing files concurrently using multiprocessing...")
    from concurrent.futures import ProcessPoolExecutor
    file_paths_str = [str(f) for f in files]
    parsed_results = []

    with ProcessPoolExecutor() as executor:
        futures = [executor.submit(_parse_file_worker, fp) for fp in file_paths_str]
        # Results are consumed in submission order so progress output is stable.
        for position, future in enumerate(futures, 1):
            file_path_str, parser_label, symbols = future.result()
            file_path = Path(file_path_str)
            print(f"  [{position}/{len(files)}] Parsed {file_path.name}")
            parsed_results.append((file_path, parser_label, symbols))

    with QdrantVectorStore() as vector_store:
        print("[*] Initialising fresh Qdrant index...")

        # ── Initialise log files ────────────────────────────────────
        if LOG_PARSER:
            _init_parser_log(repo_path_obj, len(files))
        if LOG_CHUNKS:
            _init_chunk_log(repo_path_obj)

        for file, parser_label, symbols in parsed_results:
            if parser_label == "ASTParser" and LOG_AST_PARSER:
                with open("log.txt", "a", encoding="utf-8") as log_f:
                    log_f.write(f"--- Symbols for {file.name} ---\n")
                    log_f.write(json.dumps(symbols, indent=4) + "\n\n")

            # ── Parser log: show raw symbol output before chunking ──────────
            if LOG_PARSER:
                _log_parsed_symbols(file, repo_path_obj, parser_label, symbols)

            chunks = chunker.build_chunks(file, symbols)
            if not chunks:
                continue

            # Record the graph view of each chunk now; ids are zero-based and
            # run across all files in processing order.
            for chunk in chunks:
                graph_records.append({
                    'id': f"chunk-{len(graph_records)}",
                    'symbol_name': chunk.symbol_name,
                    'file_path': chunk.file_path,
                    'code': chunk.code,
                    'start_line': chunk.start_line,
                    'end_line': chunk.end_line,
                    'symbol_type': chunk.chunk_type,
                })

            # ── Log every CREATED chunk before embedding ──────────────────
            # Labels start at the next counter value; the counter itself is
            # advanced in the embedding loop so both passes share numbering.
            if LOG_CHUNKS:
                _log_created_chunks(file, chunks, global_chunk_counter + 1)

            # ── Generate embeddings in a single batch request ────────────
            chunk_codes = [chunk.code for chunk in chunks]
            embeddings = embedder.embed_batch(chunk_codes)

            for chunk, embedding in zip(chunks, embeddings):
                chunk.embedding = embedding
                global_chunk_counter += 1
                label = f"[chunk-{global_chunk_counter}]"

                if LOG_CHUNKS:
                    if chunk.embedding:
                        embed_marker = f"{label} embedded ✓  ({len(chunk.embedding)}-dim vector)  — {chunk.symbol_name}\n"
                    else:
                        embed_marker = f"{label} SKIPPED      (empty embedding returned)       — {chunk.symbol_name}\n"
                    print(embed_marker.strip())
                    with open("chunks_log.txt", "a", encoding="utf-8") as f:
                        f.write(embed_marker)

            # NOTE(review): every chunk is handed over, including ones logged
            # as SKIPPED; presumably add_chunks drops empty embeddings -- confirm.
            vector_store.add_chunks(chunks)
            total_chunks += len(chunks)

    print(f"[*] Indexing complete. Indexed {total_chunks} code chunks into Qdrant (local).")

    # Optionally build graph index
    _build_graph_index(graph_records)

    return total_chunks

def query_repo(repo_path: str, query: str, model_name: str = "qwen2.5:7b"):
    """
    Answer ``query`` using the local Qdrant index: hybrid search for 25
    candidates, cross-encoder re-ranking down to 10, then a streamed LLM answer.

    Args:
        repo_path: Repository path; it is expanded and resolved but not used further here.
        query: Natural-language question.
        model_name: Model name passed to ``OllamaLLM``.

    Returns:
        tuple: ``(token_stream, results)`` -- the value returned by
        ``generate_answer_stream`` and the re-ranked result list.
    """
    # Kept from the original even though the value is unused below.
    resolved_repo = Path(repo_path).expanduser().resolve()

    for banner_line in (
        "\n==================================================",
        f"QUERY:\n\"{query}\"\n",
        "RESULTS:",
    ):
        print(banner_line)

    text_embedder = OllamaEmbedder()
    with QdrantVectorStore() as store:
        query_vector = text_embedder.embed_text(query)
        # Over-fetch (25) so the re-ranker has candidates to choose from.
        candidates = store.hybrid_search(query, query_vector, n_results=25)

    # Imported only once the search has run, as in the original.
    from backend.retrieval.reranker import CrossEncoderReranker
    top_results = CrossEncoderReranker().rerank(query, candidates, top_k=10)

    answer_stream = OllamaLLM(model_name=model_name).generate_answer_stream(query, top_results)
    return answer_stream, top_results


def query_repo_graph(repo_path: str, query: str, model_name: str = "qwen2.5:7b"):
    """
    Answer ``query`` using the Neo4j code-relationship graph.

    The lower-cased, whitespace-split query words are passed to
    ``GraphRetriever.search_with_context`` (2 hops, at most 10 results). The
    hits are reshaped into the dict layout used for vector-search results.
    If the graph modules cannot be imported, or anything fails while talking
    to the graph store, the call falls back to ``query_repo``.

    Returns:
        tuple: ``(token_stream, results)`` -- the streamed LLM answer and the
        list of result dicts the answer was generated from.
    """
    # Kept from the original even though the value is unused below.
    resolved_repo = Path(repo_path).expanduser().resolve()

    for banner_line in (
        "\n==================================================",
        f"GRAPH QUERY:\n\"{query}\"\n",
        "RESULTS:",
    ):
        print(banner_line)

    try:
        from backend.retrieval.graph_store import Neo4jGraphStore
        from backend.retrieval.graph_retrieval import GraphRetriever
    except ImportError as import_error:
        print(f"[!] Graph modules not available: {import_error}")
        # Fallback to vector search
        return query_repo(repo_path, query, model_name)

    # Extract query keywords
    keywords = query.lower().split()

    try:
        with Neo4jGraphStore() as store:
            graph_hits = GraphRetriever(store).search_with_context(
                keywords, hop_depth=2, limit=10
            )

            # Reshape graph hits into the vector-search result layout; built
            # while the store is still open, as in the original loop.
            formatted_results = [
                {
                    'id': hit.get('id', f'graph-{position}'),
                    'symbol_name': hit.get('symbol_name', 'Unknown'),
                    'file_path': hit.get('file_path', ''),
                    'code': hit.get('code', ''),
                    'start_line': hit.get('start_line', 0),
                    'end_line': hit.get('end_line', 0),
                    'symbol_type': hit.get('symbol_type', 'unknown'),
                    'score': 0.5,  # fixed placeholder: graph hits carry no relevance score
                }
                for position, hit in enumerate(graph_hits)
            ]
    except Exception as graph_error:
        print(f"[!] Error accessing graph database: {graph_error}")
        print("[*] Make sure Neo4j is running at neo4j://localhost:7687")
        # Fallback to vector search
        return query_repo(repo_path, query, model_name)

    # Generate answer using LLM
    answer_stream = OllamaLLM(model_name=model_name).generate_answer_stream(query, formatted_results)
    return answer_stream, formatted_results

if __name__ == "__main__":
    # Command-line entry point: "index" builds the vector/graph index for a
    # repository, "query" answers a question and prints the referenced chunks.
    cli = argparse.ArgumentParser(description="Local AI Coding Agent MVP")
    cli.add_argument("command", choices=["index", "query"], help="Command to run: 'index' to build DB, 'query' to search.")
    cli.add_argument("--repo", default=".", help="Path to the repository to scan/query (default: current dir)")
    cli.add_argument("--query", type=str, help="The natural language query (required for 'query' command)")

    opts = cli.parse_args()

    if opts.command == "index":
        build_index(opts.repo)
    elif opts.command == "query":
        if not opts.query:
            print("Error: --query argument is required when using the 'query' command.")
            sys.exit(1)
        answer_tokens, references = query_repo(opts.repo, opts.query)

        print("\n=== AI Answer ===")
        # Emit tokens as they arrive so the answer appears typewriter-style.
        for piece in answer_tokens:
            print(piece, end="", flush=True)
        print()
        print("\n=== References ===")

        if references:
            repo_root = Path(opts.repo).expanduser().resolve()
            for position, ref in enumerate(references, 1):
                print(f"{position}.")
                try:
                    shown_path = Path(ref['file_path']).relative_to(repo_root)
                except ValueError:
                    # Not under the repository root: show the stored path as-is.
                    shown_path = ref['file_path']
                print(f"File: {shown_path}")
                print(f"Symbol: {ref['symbol_name']}")
                print(f"Lines: {ref['start_line']}-{ref['end_line']}")
                print(f"Score: {ref['score']}")
                print(f"Code Snippet:\n--------------------------------------------------\n{ref['code']}\n--------------------------------------------------")
                print()
        else:
            print("No relevant code chunks found.")
        # Both branches end with the same closing separator.
        print("==================================================")