diff --git a/src/utils/binary_detector.py b/src/utils/binary_detector.py index cdbf2a8..39af0f7 100644 --- a/src/utils/binary_detector.py +++ b/src/utils/binary_detector.py @@ -83,6 +83,11 @@ ".vbs", # VBScript ".reg", # Windows Registry ".desktop", + # Godot + ".godot", + ".gd", + ".gd.uid", + ".tscn", } ) @@ -183,6 +188,9 @@ ".db", ".sqlite", ".sqlite3", + ".pdb", + ".pyd", + ".o", } ) diff --git a/src/workspace/exclusion_manager.py b/src/workspace/exclusion_manager.py new file mode 100644 index 0000000..7e8bbe3 --- /dev/null +++ b/src/workspace/exclusion_manager.py @@ -0,0 +1,177 @@ +"""统一排除管理器 —— 合并 gitignore、用户 ignore、默认排除规则. + +区分两类排除: +- 性能排除: 缓存/构建产物等不影响安全的目录 +- 安全排除: 隐私/凭据文件等不应被 AI 访问的路径 +""" + +from __future__ import annotations + +import os +import re +from pathlib import Path +from typing import ClassVar + +from src.workspace.gitignore_loader import is_ignored_by_gitignore, load_gitignore + + +class ExclusionManager: + """排除规则统一管理器 + + 聚合三类排除源: + 1. 默认排除(内置的缓存/构建/IDE 目录) + 2. .gitignore 规则(如项目中有 .gitignore 文件) + 3. 用户临时 ignore 参数 + + Args: + workspace_root: 工作区根目录 + """ + + # 性能排除 —— 缓存、构建产物、IDE 配置等 + PERFORMANCE_EXCLUSIONS: frozenset[str] = frozenset( + { + ".git", + "__pycache__", + "node_modules", + ".venv", + "venv", + "dist", + "build", + ".idea", + ".vscode", + ".ruff_cache", + ".pytest_cache", + ".mypy_cache", + ".hypothesis", + "htmlcov", + ".coverage", + "*.pyc", + "*.pyo", + ".eggs", + "*.egg-info", + ".tox", + ".nox", + ".svn", + ".hg", + ".bzr", + "target", # Rust build + ".next", # Next.js build + ".nuxt", # Nuxt build + ".output", # Nuxt output + } + ) + + # 安全排除 —— 需要精确匹配的敏感文件正则(与 SECURITY_EXCLUSIONS 合并后的唯一来源) + # 注意: (^|/) 前缀表示匹配路径开始或目录分隔符后; .* 前缀表示匹配任意位置的文件扩展名 + SENSITIVE_FILE_PATTERNS: ClassVar[list[str]] = [ + r"(^|/)\.env$", + r"(^|/)\.env\..+$", + r".*\.pem$", + r"(^|/)credentials\..*$", + r".*\.key$", + r".*\.cert$", + r"(^|/)id_rsa$", + r"(^|/)id_ed25519$", + r".*\.cred$", + r".*\.secret$", + r"(^|/)\.ManualAid[/\\].*\.db$", + ] + + def __init__(self, workspace_root: str | Path): + self._workspace_root = Path(workspace_root).resolve() + # 从 .gitignore 加载 + self._raw_gitignore_patterns: list[str] = [] + self._gitignore_exclude_res: list[re.Pattern] = [] + self._gitignore_negate_res: list[re.Pattern] = [] + + self._reload_gitignore() + + def _reload_gitignore(self) -> None: + """(重新)加载 .gitignore.""" + raw, exclude_res, negate_res = load_gitignore(self._workspace_root) + self._raw_gitignore_patterns = raw + self._gitignore_exclude_res = exclude_res + self._gitignore_negate_res = negate_res + + def _check_performance_exclusion(self, rel_path_str: str) -> bool: + """检查路径是否匹配性能排除规则(基于目录名)""" + # 将路径拆分为各层, 检查每层是否在排除集合中 + parts = rel_path_str.replace(os.sep, "/").split("/") + for part in parts: + # 检查部分匹配: "node_modules" 或通配匹配 + if part in self.PERFORMANCE_EXCLUSIONS: + return True + # 检查 *.xxx 模式 + for exclude in self.PERFORMANCE_EXCLUSIONS: + if exclude.startswith("*.") and part.endswith(exclude[1:]): + return True + return False + + def should_exclude_path(self, path: Path) -> bool: + """检查路径是否应被排除(全面检查) + + 依次检查: 默认排除目录名 → gitignore 规则 → 否定规则 + + Args: + path: 文件的绝对路径 + + Returns: + True 表示应排除 + """ + try: + rel_path = path.relative_to(self._workspace_root) + except ValueError: + # 在工作区外, 不在这里处理(由 PathValidator 处理) + return False + + rel_str = str(rel_path).replace(os.sep, "/") + + # 1. 性能排除: 检查所有父目录 + if self._check_performance_exclusion(rel_str): + return True + + # 2. gitignore 排除 + return is_ignored_by_gitignore(rel_str, self._gitignore_exclude_res, self._gitignore_negate_res) + + def merge_ignore_regexes(self, user_ignore: list[str] | None = None) -> list[re.Pattern]: + """合并默认排除 + gitignore + 用户 ignore 为正则列表 + + 用于 search_content 等需要正则匹配排除的场景 + + 敏感文件由 PathValidator 在写入/读取时拦截,搜索场景不额外过滤 + + Args: + user_ignore: 用户传入的忽略正则列表 + + Returns: + 编译后的正则列表 + """ + result: list[re.Pattern] = [] + + # 默认排除目录名 → 正则 + for excl in self.PERFORMANCE_EXCLUSIONS: + # 处理 *.pyc 类模式 + if excl.startswith("*."): + pat = excl[1:] # .pyc + result.append(re.compile(re.escape(pat) + "$")) + else: + # 匹配路径中的此目录名 + result.append(re.compile(r"(^|/)" + re.escape(excl) + r"(/|$)")) + + # gitignore 排除正则 + result.extend(self._gitignore_exclude_res) + + # 用户传入的 ignore 正则 + if user_ignore: + for ign in user_ignore: + try: + result.append(re.compile(ign)) + except re.error: + continue + + return result + + @property + def excluded_dir_names(self) -> set[str]: + """获取所有排除目录名集合(用于快速 in 检查)""" + return {d for d in self.PERFORMANCE_EXCLUSIONS if not d.startswith("*")} diff --git a/src/workspace/gitignore_loader.py b/src/workspace/gitignore_loader.py new file mode 100644 index 0000000..d1a136e --- /dev/null +++ b/src/workspace/gitignore_loader.py @@ -0,0 +1,178 @@ +"""Parse .gitignore files and convert patterns to regex for exclusion matching. + +已知局限性: +- 不支持嵌套 .gitignore(仅读取根目录下的 .gitignore) +- 不支持行尾 \\ 续行 +- 不支持 gitignore 扩展语法中的字符类(如 [abc] 会被错误转义) +- 否定模式的优先级处理与真实 Git 不一致: 当前实现将所有否定模式提升为最高优先级, + 而真实 Git 按行号顺序逐条处理(后出现的规则覆盖先出现的). + 当前行为对于 AI 工具场景偏安全(宁可少排除), 故保留此简化实现. +""" + +from __future__ import annotations + +import os +import re +from pathlib import Path + + +def _convert_gitignore_to_regex(pattern: str) -> str | None: + """将 .gitignore 模式转换为正则表达式. + + Args: + pattern: .gitignore 模式(如 *.log, build/, /foo) + + Returns: + 对应的正则表达式字符串, 如果模式无效则返回 None + """ + # 保留原始模式用于锚定判断 + original = pattern + is_dir_only = pattern.endswith("/") + if is_dir_only: + pattern = pattern.rstrip("/") + + # 处理否定模式(仅用于判断是否为目录模式, 不处理逻辑) + if pattern.startswith("!"): + pattern = pattern[1:] + + # 转义正则特殊字符, 再处理 gitignore 通配符 + # 先处理 ** (多级通配符) + parts = [] + i = 0 + while i < len(pattern): + if pattern[i : i + 2] == "**": + parts.append(".*") + i += 2 + elif pattern[i] == "*": + # 单级通配符, 不匹配路径分隔符 + parts.append(r"[^/]*") + i += 1 + elif pattern[i] == "?": + parts.append(r"[^/]") + i += 1 + elif pattern[i] in ".+^${}()|[]\\": + parts.append("\\" + pattern[i]) + i += 1 + else: + parts.append(pattern[i]) + i += 1 + + regex_str = "".join(parts) + + # 锚定: / 开头表示从根目录匹配, 否则匹配任意路径 + if original.startswith("/"): + regex_str = "^" + regex_str[1:] # 去掉开头的 / + elif original.startswith("!"): + # 处理否定模式 - 保持锚定逻辑不变 + regex_str = "^" + regex_str[1:] if original[1:].startswith("/") else "(^|/)" + regex_str + else: + regex_str = "(^|/)" + regex_str + + if is_dir_only: + regex_str += "(/.*)?$" + else: + regex_str += "$" + + return regex_str + + +def parse_gitignore(gitignore_path: str | Path) -> list[str]: + """解析 .gitignore 文件, 返回非否定排除模式列表. + + Args: + gitignore_path: .gitignore 文件路径 + + Returns: + 排除模式列表(目录名/通配符等原始 gitignore 格式) + """ + patterns: list[str] = [] + gitignore_path = Path(gitignore_path) + + if not gitignore_path.exists(): + return patterns + + try: + text = gitignore_path.read_text(encoding="utf-8") + except Exception: + return patterns + + for line in text.splitlines(): + stripped = line.strip() + + # 跳过空行和注释 + if not stripped or stripped.startswith("#"): + continue + + # 保留否定模式供外部处理, 返回原始行 + patterns.append(stripped) + + return patterns + + +def compile_gitignore_patterns(patterns: list[str]) -> tuple[list[re.Pattern], list[re.Pattern]]: + """将 gitignore 模式编译为正则表达式. + + Args: + patterns: 原始 gitignore 模式列表 + + Returns: + (排除正则列表, 否定排除正则列表) 的元组 + """ + exclude_res: list[re.Pattern] = [] + negate_res: list[re.Pattern] = [] + + for pattern in patterns: + if pattern.startswith("!"): + # 否定模式: 取消排除 + negate_regex = _convert_gitignore_to_regex(pattern) + if negate_regex: + try: + negate_res.append(re.compile(negate_regex)) + except re.error: + continue + else: + regex = _convert_gitignore_to_regex(pattern) + if regex: + try: + exclude_res.append(re.compile(regex)) + except re.error: + continue + + return exclude_res, negate_res + + +def is_ignored_by_gitignore(path: str | Path, exclude_res: list[re.Pattern], negate_res: list[re.Pattern]) -> bool: + """检查路径是否被 .gitignore 规则忽略. + + Args: + path: 要检查的相对路径(字符串形式) + exclude_res: 排除正则列表 + negate_res: 否定排除正则列表 + + Returns: + 是否应该被忽略 + """ + path_str = str(path).replace(os.sep, "/") + + # 先检查否定模式(优先级更高) + for negate_re in negate_res: + if negate_re.search(path_str): + return False + + # 再检查排除模式 + return any(exclude_re.search(path_str) for exclude_re in exclude_res) + + +def load_gitignore(workspace_root: str | Path) -> tuple[list[str], list[re.Pattern], list[re.Pattern]]: + """从工作区根目录加载 .gitignore. + + Args: + workspace_root: 工作区根目录 + + Returns: + (原始模式列表, 排除正则列表, 否定排除正则列表) + """ + gitignore_path = Path(workspace_root) / ".gitignore" + raw_patterns = parse_gitignore(gitignore_path) + exclude_res, negate_res = compile_gitignore_patterns(raw_patterns) + return raw_patterns, exclude_res, negate_res diff --git a/src/workspace/path_validator.py b/src/workspace/path_validator.py index 2a12d7d..99a4bee 100644 --- a/src/workspace/path_validator.py +++ b/src/workspace/path_validator.py @@ -1,5 +1,9 @@ import os +import re from pathlib import Path +from typing import ClassVar + +from src.workspace.exclusion_manager import ExclusionManager class WorkspaceBoundaryError(Exception): @@ -14,6 +18,12 @@ class PathNotFoundError(Exception): pass +class SensitiveFileError(Exception): + """访问敏感文件时抛出""" + + pass + + class PathValidator: """工作区路径安全校验器,防止路径遍历和符号链接逃逸 @@ -21,12 +31,17 @@ class PathValidator: workspace_root: 工作区根目录,默认为当前目录 """ + # 敏感文件匹配模式(从 ExclusionManager 统一来源引用) + SENSITIVE_FILE_PATTERNS: ClassVar[list[re.Pattern]] = [ + re.compile(p) for p in ExclusionManager.SENSITIVE_FILE_PATTERNS + ] + def __init__(self, workspace_root: str | Path = "."): """初始化路径验证器. Args: workspace_root: 工作区根目录路径,可以是字符串或 Path 对象 - 所有后续的路径验证都将以此目录为边界 + 所有后续的路径验证都将以此目录为基准 Raises: FileNotFoundError: 当 workspace_root 不存在时抛出 @@ -71,8 +86,19 @@ def resolve_path(self, target: str | Path) -> Path: if not str(resolved).startswith(str(self.root) + os.sep) and resolved != self.root: raise WorkspaceBoundaryError(f"路径越界: {target}") + # 敏感文件检查 + self._raise_if_sensitive(resolved, target) + return resolved + @classmethod + def _raise_if_sensitive(cls, resolved: Path, original_target: str | Path) -> None: + """检查路径是否匹配敏感文件模式.""" + resolved_str = str(resolved).replace(os.sep, "/") + for pattern in cls.SENSITIVE_FILE_PATTERNS: + if pattern.search(resolved_str): + raise SensitiveFileError(f"禁止访问敏感文件: {original_target}") + def create_file_with_parents(self, target: str | Path, content: str = "") -> Path: """在工作区内创建文件,自动创建所有不存在的父目录. diff --git a/src/workspace/tools/base_tool.py b/src/workspace/tools/base_tool.py index 70e6ab4..e958559 100644 --- a/src/workspace/tools/base_tool.py +++ b/src/workspace/tools/base_tool.py @@ -244,7 +244,7 @@ def handle_tool_exceptions(func) -> Callable[..., ToolResult]: """工具方法异常处理装饰器 —— 将异常转换为 ToolResult 失败结果""" from functools import wraps - from src.workspace.path_validator import PathNotFoundError, WorkspaceBoundaryError + from src.workspace.path_validator import PathNotFoundError, SensitiveFileError, WorkspaceBoundaryError @wraps(func) def wrapper(self, *args, **kwargs): @@ -269,13 +269,20 @@ def wrapper(self, *args, **kwargs): func_kwargs=kwargs, error=f"{err2.__class__.__name__}: {err2}", ) - except PermissionError as err3: + except SensitiveFileError as err3: return ToolResult( success=False, func_name=func.__name__, func_kwargs=kwargs, error=f"{err3.__class__.__name__}: {err3}", ) + except PermissionError as err4: + return ToolResult( + success=False, + func_name=func.__name__, + func_kwargs=kwargs, + error=f"{err4.__class__.__name__}: {err4}", + ) except Exception as err: return ToolResult( success=False, func_name=func.__name__, func_kwargs=kwargs, error=f"{err.__class__.__name__}: {err}" diff --git a/src/workspace/tools/exact_search_tool.py b/src/workspace/tools/exact_search_tool.py index 873df3a..8456b04 100644 --- a/src/workspace/tools/exact_search_tool.py +++ b/src/workspace/tools/exact_search_tool.py @@ -1,8 +1,8 @@ -import contextlib import re from pathlib import Path from src.models.tools.tool_result import ToolResult +from src.utils.binary_detector import is_binary_file from src.workspace.tools.base_tool import BaseTool from src.workspace.workspace import Workspace @@ -86,6 +86,7 @@ def __init__(self, workspace: Workspace): "limit": "最大匹配数量限制", "ignore": "忽略匹配正则的文件或文件夹列表", } + self._exclusion_manager = workspace.exclusion_manager @BaseTool.handle_tool_exceptions def exact_search( @@ -107,12 +108,8 @@ def exact_search( # 准备搜索字符串 search_string = pattern if case_sensitive else pattern.lower() - # 收集忽略模式 - ignore_patterns = [] - if ignore: - for ignore_pattern in ignore: - with contextlib.suppress(re.error): - ignore_patterns.append(re.compile(ignore_pattern)) + # 收集忽略模式: 合并默认排除 + 用户传入的 ignore + ignore_patterns = self._exclusion_manager.merge_ignore_regexes(ignore) # 搜索结果 results = [] @@ -121,12 +118,24 @@ def exact_search( warnings = [""] # 确定要搜索的文件列表(支持单文件或目录) - files_to_search = [search_path] if search_path.is_file() else list(search_path.rglob(file_pattern)) + files_to_search = ( + [search_path] + if search_path.is_file() + else [ + p + for p in search_path.rglob(file_pattern) + if p.is_file() and not self._exclusion_manager.should_exclude_path(p) + ] + ) # 遍历所有文件 for file_path in files_to_search: if not file_path.is_file(): continue + + if is_binary_file(file_path): + continue + # 检查是否达到限制 if total_matches >= limit: break diff --git a/src/workspace/tools/glob_tool.py b/src/workspace/tools/glob_tool.py index df158c0..f47274a 100644 --- a/src/workspace/tools/glob_tool.py +++ b/src/workspace/tools/glob_tool.py @@ -15,6 +15,7 @@ def __init__(self, workspace: Workspace): "path": "目录路径", "max_ret": "最多返回多少条检索结果", } + self._exclusion_manager = workspace.exclusion_manager @BaseTool.handle_tool_exceptions def glob(self, pattern: str, path: str = ".", max_ret: int = 1000) -> ToolResult: @@ -30,5 +31,6 @@ def glob(self, pattern: str, path: str = ".", max_ret: int = 1000) -> ToolResult data=[ f"{'[Folder]' if item.is_dir() else '[File]'} {item.relative_to(self.workspace.root_path)}" for item in islice(root_path.glob(pattern), max_ret) + if not self._exclusion_manager.should_exclude_path(item) ], ) diff --git a/src/workspace/tools/ls_tool.py b/src/workspace/tools/ls_tool.py index 33f035d..cbb176a 100644 --- a/src/workspace/tools/ls_tool.py +++ b/src/workspace/tools/ls_tool.py @@ -13,6 +13,7 @@ def __init__(self, workspace: Workspace): self.param_descriptions = { "path": "目录路径", } + self._exclusion_manager = workspace.exclusion_manager @BaseTool.handle_tool_exceptions def ls(self, path: str = ".") -> ToolResult: @@ -27,5 +28,6 @@ def ls(self, path: str = ".") -> ToolResult: data=[ f"{'[Folder]' if item.is_dir() else '[File]'} {item.relative_to(self.workspace.root_path)}" for item in folder_path.iterdir() + if not self._exclusion_manager.should_exclude_path(item) ], ) diff --git a/src/workspace/tools/regex_search_tool.py b/src/workspace/tools/regex_search_tool.py index a8b6549..7ec6fad 100644 --- a/src/workspace/tools/regex_search_tool.py +++ b/src/workspace/tools/regex_search_tool.py @@ -1,8 +1,8 @@ -import contextlib import re from pathlib import Path from src.models.tools.tool_result import ToolResult +from src.utils.binary_detector import is_binary_file from src.workspace.tools.base_tool import BaseTool from src.workspace.workspace import Workspace @@ -111,6 +111,7 @@ def __init__(self, workspace: Workspace): "limit": "最大匹配数量限制", "ignore": "忽略匹配正则的文件或文件夹列表", } + self._exclusion_manager = workspace.exclusion_manager @BaseTool.handle_tool_exceptions def regex_search( @@ -134,12 +135,8 @@ def regex_search( except re.error as e: return self.make_failed_response(kwargs=locals().copy(), error=f"无效的正则表达式: {e}") - # 收集忽略模式 - ignore_patterns = [] - if ignore: - for ignore_pattern in ignore: - with contextlib.suppress(re.error): - ignore_patterns.append(re.compile(ignore_pattern)) + # 收集忽略模式: 合并默认排除 + 用户传入的 ignore + ignore_patterns = self._exclusion_manager.merge_ignore_regexes(ignore) # 搜索结果 results = [] @@ -148,12 +145,24 @@ def regex_search( warnings = [""] # 确定要搜索的文件列表(支持单文件或目录) - files_to_search = [search_path] if search_path.is_file() else list(search_path.rglob(file_pattern)) + files_to_search = ( + [search_path] + if search_path.is_file() + else [ + p + for p in search_path.rglob(file_pattern) + if p.is_file() and not self._exclusion_manager.should_exclude_path(p) + ] + ) # 遍历文件 for file_path in files_to_search: if not file_path.is_file(): continue + + if is_binary_file(file_path): + continue + # 检查是否达到限制 if total_matches >= limit: break diff --git a/src/workspace/workspace.py b/src/workspace/workspace.py index 56699f6..295b2d6 100644 --- a/src/workspace/workspace.py +++ b/src/workspace/workspace.py @@ -5,11 +5,9 @@ from pathlib import Path from src.models.tool_error_response import ToolErrorResponse +from src.workspace.exclusion_manager import ExclusionManager from src.workspace.path_validator import PathNotFoundError, PathValidator, WorkspaceBoundaryError -# 默认排除的目录 后续改为从项目配置加载 -DEFAULT_EXCLUDED_DIRS = {".git", "__pycache__", "node_modules", ".venv", "venv", "dist", "build", ".idea", ".vscode"} - def _highlight_matches(line: str, regex: re.Pattern) -> str: """ @@ -47,6 +45,7 @@ def __init__(self, path: str): return self.root_path = Path(path).resolve() self.path_validator: PathValidator = PathValidator(self.root_path) + self.exclusion_manager: ExclusionManager = ExclusionManager(self.root_path) self.is_git_repo: bool = (self.root_path / ".git").is_dir() self.platform: str = sys.platform self.date: str = date.today().strftime("%y-%m-%d") @@ -84,8 +83,11 @@ def search_content( try: path = self.path_validator.validate(folder_path) - # 初始化排除目录集合 - exclude_set = set(exclude_dirs or DEFAULT_EXCLUDED_DIRS) + # 初始化排除目录集合: 合并默认排除 + 用户传入排除 + if exclude_dirs is not None: + exclude_set = set(exclude_dirs) | self.exclusion_manager.excluded_dir_names + else: + exclude_set = self.exclusion_manager.excluded_dir_names # 编译正则表达式 flags = 0 if case_sensitive else re.IGNORECASE @@ -218,14 +220,8 @@ def search_content_multi_pattern( try: path = self.path_validator.validate(folder_path) - # 预编译 ignore 正则 - ignore_res: list[re.Pattern] = [] - if ignore: - for ign in ignore: - try: - ignore_res.append(re.compile(ign)) - except re.error: - continue + # 预编译 ignore 正则: 合并默认排除 + 用户传入的 ignore + ignore_res: list[re.Pattern] = self.exclusion_manager.merge_ignore_regexes(ignore) # 收集文件(一次遍历) files_to_search: list[Path] = [] @@ -234,8 +230,6 @@ def search_content_multi_pattern( else: for file_path in path.rglob(file_pattern): if file_path.is_file(): - if any(p.name in DEFAULT_EXCLUDED_DIRS for p in file_path.parents): - continue rel = str(file_path.relative_to(self.root_path)) if any(ir.search(rel) for ir in ignore_res): continue