Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions codebase_rag/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ class FileAction(StrEnum):
EXT_IXX = ".ixx"
EXT_CPPM = ".cppm"
EXT_CCM = ".ccm"
EXT_C = ".c"
EXT_CS = ".cs"
EXT_PHP = ".php"
EXT_LUA = ".lua"
Expand All @@ -101,6 +102,7 @@ class FileAction(StrEnum):
GO_EXTENSIONS = (EXT_GO,)
SCALA_EXTENSIONS = (EXT_SCALA, EXT_SC)
JAVA_EXTENSIONS = (EXT_JAVA,)
C_EXTENSIONS = (EXT_C,)
CPP_EXTENSIONS = (
EXT_CPP,
EXT_H,
Expand Down Expand Up @@ -444,6 +446,7 @@ class SupportedLanguage(StrEnum):
GO = "go"
SCALA = "scala"
JAVA = "java"
C = "c"
CPP = "cpp"
CSHARP = "c-sharp"
PHP = "php"
Expand Down Expand Up @@ -477,6 +480,11 @@ class LanguageMetadata(NamedTuple):
"Interfaces, type aliases, enums, namespaces, ES6/CommonJS modules",
"TypeScript",
),
SupportedLanguage.C: LanguageMetadata(
LanguageStatus.DEV,
"Functions, structs, unions, enums, preprocessor includes",
"C",
),
SupportedLanguage.CPP: LanguageMetadata(
LanguageStatus.FULL,
"Constructors, destructors, operator overloading, templates, lambdas, C++20 modules, namespaces",
Expand Down Expand Up @@ -742,6 +750,7 @@ class TreeSitterModule(StrEnum):
GO = "tree_sitter_go"
SCALA = "tree_sitter_scala"
JAVA = "tree_sitter_java"
C = "tree_sitter_c"
CPP = "tree_sitter_cpp"
LUA = "tree_sitter_lua"

Expand Down Expand Up @@ -2650,6 +2659,13 @@ class MCPParamName(StrEnum):
TS_ENUM_SPECIFIER,
)

# (H) Derived node types for _c_get_name
# (H) _c_get_name reads the name of these nodes from their name field;
# (H) struct, union and enum specifiers are all listed explicitly
C_NAME_NODE_TYPES = (
TS_STRUCT_SPECIFIER,
TS_UNION_SPECIFIER,
TS_ENUM_SPECIFIER,
)

# (H) LANGUAGE_SPECS node type tuples for Rust
SPEC_RS_FUNCTION_TYPES = (
TS_RS_FUNCTION_ITEM,
Expand Down Expand Up @@ -2746,6 +2762,26 @@ class MCPParamName(StrEnum):
PKG_CONANFILE,
)

# (H) FQN node type tuples for C
# (H) Scope nodes for C: the translation unit plus struct/union/enum specifiers
# (H) NOTE: the translation-unit entry reuses the TS_CPP_* constant; there is no C-specific alias
FQN_C_SCOPE_TYPES = (
TS_CPP_TRANSLATION_UNIT,
TS_STRUCT_SPECIFIER,
TS_UNION_SPECIFIER,
TS_ENUM_SPECIFIER,
)
Comment on lines +2766 to +2771
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

For better clarity and maintainability, consider aliasing the reused C++ tree-sitter node types to C-specific names. For example: TS_C_TRANSLATION_UNIT = TS_CPP_TRANSLATION_UNIT. This would make the C configuration more self-documenting and less prone to confusion with the C++ spec.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added a dedicated C_NAME_NODE_TYPES constant in constants.py instead of aliasing. It includes TS_STRUCT_SPECIFIER, TS_UNION_SPECIFIER, and TS_ENUM_SPECIFIER, and _c_get_name now references it. Commit 74e10f2.

# (H) Function node types for C FQN resolution (reuses the TS_CPP_* function-definition constant)
FQN_C_FUNCTION_TYPES = (TS_CPP_FUNCTION_DEFINITION,)

# (H) LANGUAGE_SPECS node type tuples for C
# (H) Function, module and call entries reuse the TS_CPP_* constants
SPEC_C_FUNCTION_TYPES = (TS_CPP_FUNCTION_DEFINITION,)
# (H) C has no classes; struct, union and enum specifiers fill the class slot
SPEC_C_CLASS_TYPES = (
TS_STRUCT_SPECIFIER,
TS_UNION_SPECIFIER,
TS_ENUM_SPECIFIER,
)
SPEC_C_MODULE_TYPES = (TS_CPP_TRANSLATION_UNIT,)
SPEC_C_CALL_TYPES = (TS_CPP_CALL_EXPRESSION,)
# (H) Passed as package_indicators in the C LanguageSpec
SPEC_C_PACKAGE_INDICATORS = (PKG_CMAKE_LISTS, PKG_MAKEFILE)

# (H) LANGUAGE_SPECS node type tuples for C#
SPEC_CS_FUNCTION_TYPES = (
TS_CS_DESTRUCTOR_DECLARATION,
Expand Down
51 changes: 51 additions & 0 deletions codebase_rag/language_spec.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,27 @@ def _rust_file_to_module(file_path: Path, repo_root: Path) -> list[str]:
return []


def _c_unwrap_declarator(declarator: Node | None) -> Node | None:
    """Peel pointer-declarator wrappers off ``declarator``.

    Follows the declarator field of each pointer declarator until a node of
    any other type (or a missing node) is reached, and returns that node.
    """
    current = declarator
    while True:
        # Stop on a missing node or on the first non-pointer declarator.
        if not current or current.type != cs.CppNodeType.POINTER_DECLARATOR:
            return current
        current = current.child_by_field_name(cs.FIELD_DECLARATOR)


def _c_get_name(node: Node) -> str | None:
    """Return the declared name of a C node.

    Nodes whose type is in ``cs.C_NAME_NODE_TYPES`` take their name from the
    name field. Function definitions take it from the identifier nested in
    their function declarator, after pointer declarators are unwrapped.
    Anything that yields no usable name this way is delegated to
    ``_generic_get_name``.
    """
    kind = node.type
    ident = None

    if kind in cs.C_NAME_NODE_TYPES:
        ident = node.child_by_field_name(cs.FIELD_NAME)
    elif kind == cs.TS_CPP_FUNCTION_DEFINITION:
        inner = _c_unwrap_declarator(node.child_by_field_name(cs.FIELD_DECLARATOR))
        if inner and inner.type == cs.TS_CPP_FUNCTION_DECLARATOR:
            candidate = inner.child_by_field_name(cs.FIELD_DECLARATOR)
            # Only a plain identifier counts as the function's name here.
            if candidate and candidate.type == cs.TS_IDENTIFIER:
                ident = candidate

    if ident and ident.text:
        return ident.text.decode(cs.ENCODING_UTF8)
    return _generic_get_name(node)
Comment on lines +106 to +118
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

union_specifier names silently fall through to _generic_get_name

CPP_NAME_NODE_TYPES is defined in constants.py (lines 2656–2660) as:

CPP_NAME_NODE_TYPES = (
    CppNodeType.CLASS_SPECIFIER,
    TS_STRUCT_SPECIFIER,
    TS_ENUM_SPECIFIER,
)

TS_UNION_SPECIFIER is not in this tuple. Because _c_get_name delegates to _generic_get_name for any node that is neither in CPP_NAME_NODE_TYPES nor a function_definition, union nodes take the generic path instead of the explicit struct/enum path. While _generic_get_name likely resolves the name field correctly in practice, it is fragile and confusing: _c_get_name is using a constant explicitly named for C++ that intentionally omits unions.

A dedicated C_NAME_NODE_TYPES constant should be defined in constants.py that includes TS_UNION_SPECIFIER:

# in constants.py
C_NAME_NODE_TYPES = (
    TS_STRUCT_SPECIFIER,
    TS_UNION_SPECIFIER,
    TS_ENUM_SPECIFIER,
)

and _c_get_name should reference cs.C_NAME_NODE_TYPES instead of cs.CPP_NAME_NODE_TYPES.

Prompt To Fix With AI
This is a comment left during a code review.
Path: codebase_rag/language_spec.py
Line: 107-120

Comment:
**`union_specifier` names silently fall through to `_generic_get_name`**

`CPP_NAME_NODE_TYPES` is defined in `constants.py` (lines 2656–2660) as:

```python
CPP_NAME_NODE_TYPES = (
    CppNodeType.CLASS_SPECIFIER,
    TS_STRUCT_SPECIFIER,
    TS_ENUM_SPECIFIER,
)
```

`TS_UNION_SPECIFIER` is **not** in this tuple. Because `_c_get_name` delegates to `_generic_get_name` for any node that is neither in `CPP_NAME_NODE_TYPES` nor a `function_definition`, union nodes take the generic path instead of the explicit struct/enum path. While `_generic_get_name` likely resolves the `name` field correctly in practice, it is fragile and confusing: `_c_get_name` is using a constant explicitly named for C++ that intentionally omits unions.

A dedicated `C_NAME_NODE_TYPES` constant should be defined in `constants.py` that includes `TS_UNION_SPECIFIER`:

```python
# in constants.py
C_NAME_NODE_TYPES = (
    TS_STRUCT_SPECIFIER,
    TS_UNION_SPECIFIER,
    TS_ENUM_SPECIFIER,
)
```

and `_c_get_name` should reference `cs.C_NAME_NODE_TYPES` instead of `cs.CPP_NAME_NODE_TYPES`.

How can I resolve this? If you propose a fix, please make it concise.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added C_NAME_NODE_TYPES to constants.py with TS_STRUCT_SPECIFIER, TS_UNION_SPECIFIER, and TS_ENUM_SPECIFIER. _c_get_name now uses cs.C_NAME_NODE_TYPES so union nodes are handled explicitly. Commit 74e10f2.



def _cpp_get_name(node: Node) -> str | None:
if node.type in cs.CPP_NAME_NODE_TYPES:
name_node = node.child_by_field_name(cs.FIELD_NAME)
Expand Down Expand Up @@ -154,6 +175,13 @@ def _cpp_get_name(node: Node) -> str | None:
file_to_module_parts=_generic_file_to_module,
)

# (H) FQN spec for C: scope and function node types come from the FQN_C_* constants,
# (H) names are resolved by _c_get_name, and module parts by the generic file-to-module mapping
C_FQN_SPEC = FQNSpec(
scope_node_types=frozenset(cs.FQN_C_SCOPE_TYPES),
function_node_types=frozenset(cs.FQN_C_FUNCTION_TYPES),
get_name=_c_get_name,
file_to_module_parts=_generic_file_to_module,
)

LUA_FQN_SPEC = FQNSpec(
scope_node_types=frozenset(cs.FQN_LUA_SCOPE_TYPES),
function_node_types=frozenset(cs.FQN_LUA_FUNCTION_TYPES),
Expand Down Expand Up @@ -195,6 +223,7 @@ def _cpp_get_name(node: Node) -> str | None:
cs.SupportedLanguage.TS: TS_FQN_SPEC,
cs.SupportedLanguage.RUST: RUST_FQN_SPEC,
cs.SupportedLanguage.JAVA: JAVA_FQN_SPEC,
cs.SupportedLanguage.C: C_FQN_SPEC,
cs.SupportedLanguage.CPP: CPP_FQN_SPEC,
cs.SupportedLanguage.LUA: LUA_FQN_SPEC,
cs.SupportedLanguage.GO: GO_FQN_SPEC,
Expand Down Expand Up @@ -343,6 +372,28 @@ def _cpp_get_name(node: Node) -> str | None:
type: (type_identifier) @name) @call
""",
),
cs.SupportedLanguage.C: LanguageSpec(
language=cs.SupportedLanguage.C,
file_extensions=cs.C_EXTENSIONS,
function_node_types=cs.SPEC_C_FUNCTION_TYPES,
class_node_types=cs.SPEC_C_CLASS_TYPES,
module_node_types=cs.SPEC_C_MODULE_TYPES,
call_node_types=cs.SPEC_C_CALL_TYPES,
import_node_types=cs.IMPORT_NODES_INCLUDE,
import_from_node_types=cs.IMPORT_NODES_INCLUDE,
package_indicators=cs.SPEC_C_PACKAGE_INDICATORS,
function_query="""
(function_definition) @function
""",
class_query="""
(struct_specifier) @class
(union_specifier) @class
(enum_specifier) @class
""",
call_query="""
(call_expression) @call
""",
),
cs.SupportedLanguage.CPP: LanguageSpec(
language=cs.SupportedLanguage.CPP,
file_extensions=cs.CPP_EXTENSIONS,
Expand Down
6 changes: 6 additions & 0 deletions codebase_rag/parser_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,12 @@ def _import_language_loaders() -> dict[cs.SupportedLanguage, LanguageLoader]:
cs.QUERY_LANGUAGE,
cs.SupportedLanguage.JAVA,
),
LanguageImport(
cs.SupportedLanguage.C,
cs.TreeSitterModule.C,
cs.QUERY_LANGUAGE,
cs.SupportedLanguage.C,
),
LanguageImport(
cs.SupportedLanguage.CPP,
cs.TreeSitterModule.CPP,
Expand Down
7 changes: 7 additions & 0 deletions codebase_rag/tests/test_handler_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,11 @@ def test_returns_base_handler_for_php(self) -> None:
assert isinstance(handler, BaseLanguageHandler)
assert type(handler) is BaseLanguageHandler

def test_returns_base_handler_for_c(self) -> None:
    """get_handler(C) must return an object whose exact type is BaseLanguageHandler."""
    resolved = get_handler(SupportedLanguage.C)
    assert isinstance(resolved, BaseLanguageHandler)
    # Exact type check: a subclass instance would not satisfy this.
    assert type(resolved) is BaseLanguageHandler


class TestHandlerCaching:
def test_same_instance_returned_for_same_language(self) -> None:
Expand Down Expand Up @@ -84,6 +89,7 @@ class TestHandlerProtocol:
SupportedLanguage.PYTHON,
SupportedLanguage.GO,
SupportedLanguage.PHP,
SupportedLanguage.C,
],
)
def test_handler_has_all_protocol_methods(
Expand Down Expand Up @@ -114,6 +120,7 @@ def test_handler_has_all_protocol_methods(
SupportedLanguage.JAVA,
SupportedLanguage.LUA,
SupportedLanguage.PYTHON,
SupportedLanguage.C,
],
)
def test_handler_methods_are_callable(self, language: SupportedLanguage) -> None:
Expand Down
3 changes: 3 additions & 0 deletions codebase_rag/tests/test_language_node_coverage.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import pytest

from codebase_rag.constants import (
C_EXTENSIONS,
CPP_EXTENSIONS,
CS_EXTENSIONS,
GO_EXTENSIONS,
Expand Down Expand Up @@ -60,6 +61,7 @@ def test_each_language_has_file_extensions(self, lang: SupportedLanguage) -> Non
(SupportedLanguage.GO, GO_EXTENSIONS),
(SupportedLanguage.SCALA, SCALA_EXTENSIONS),
(SupportedLanguage.JAVA, JAVA_EXTENSIONS),
(SupportedLanguage.C, C_EXTENSIONS),
(SupportedLanguage.CPP, CPP_EXTENSIONS),
(SupportedLanguage.CSHARP, CS_EXTENSIONS),
(SupportedLanguage.PHP, PHP_EXTENSIONS),
Expand Down Expand Up @@ -87,6 +89,7 @@ def test_language_spec_has_correct_extensions(
(".go", SupportedLanguage.GO),
(".scala", SupportedLanguage.SCALA),
(".java", SupportedLanguage.JAVA),
(".c", SupportedLanguage.C),
(".cpp", SupportedLanguage.CPP),
(".h", SupportedLanguage.CPP),
(".hpp", SupportedLanguage.CPP),
Expand Down
4 changes: 2 additions & 2 deletions codebase_rag/tools/semantic_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ async def semantic_search_functions(query: str, top_k: int = 5) -> str:

return response

return Tool(semantic_search_functions, name=td.AgenticToolName.SEMANTIC_SEARCH)
return Tool(semantic_search_functions, name=td.AgenticToolName.SEMANTIC_SEARCH, description=td.SEMANTIC_SEARCH)


def create_get_function_source_tool() -> Tool:
Expand All @@ -153,4 +153,4 @@ async def get_function_source_by_id(node_id: int) -> str:

return cs.MSG_SEMANTIC_SOURCE_FORMAT.format(id=node_id, code=source_code)

return Tool(get_function_source_by_id, name=td.AgenticToolName.GET_FUNCTION_SOURCE)
return Tool(get_function_source_by_id, name=td.AgenticToolName.GET_FUNCTION_SOURCE, description=td.GET_FUNCTION_SOURCE)
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ treesitter-full = [
"tree-sitter-go>=0.23.4",
"tree-sitter-scala>=0.24.0",
"tree-sitter-java>=0.23.5",
"tree-sitter-c>=0.24.1",
"tree-sitter-cpp>=0.23.0",
"tree-sitter-lua>=0.0.19",
]
Expand Down
21 changes: 20 additions & 1 deletion uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.