From 8e2dcca9dc43b9d24dfb8874a629279b50b6645d Mon Sep 17 00:00:00 2001
From: ycjcl868
Date: Sat, 21 Mar 2026 22:25:38 +0800
Subject: [PATCH] fix: dense cjk utf-8 content misdetected as binary

---
 openhands_aci/editor/editor.py      | 13 ++++++++++++-
 tests/integration/test_oh_editor.py |  9 +++++++++
 tests/unit/test_file_validation.py  |  8 ++++++++
 3 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/openhands_aci/editor/editor.py b/openhands_aci/editor/editor.py
index a968726..c291b91 100644
--- a/openhands_aci/editor/editor.py
+++ b/openhands_aci/editor/editor.py
@@ -608,7 +608,18 @@ def validate_file(self, path: Path) -> None:
         if self.is_supported_binary_file(path):
             return

-        # Check file type
+        # Try encoding-based text detection first (handles CJK/multi-byte UTF-8)
+        try:
+            encoding = self._encoding_manager.get_encoding(path)
+            with open(path, 'r', encoding=encoding) as f:
+                chunk = f.read(8192)
+            # Null characters indicate binary content even if decoding succeeded
+            if '\x00' not in chunk:
+                return  # Successfully decoded as text — not binary
+        except (UnicodeDecodeError, ValueError, OSError, TypeError):
+            pass  # Fall through to binaryornot check
+
+        # Fallback to binaryornot
         if is_binary(str(path)):
             raise FileValidationError(
                 path=str(path),
diff --git a/tests/integration/test_oh_editor.py b/tests/integration/test_oh_editor.py
index 9a95a82..3ac55e1 100644
--- a/tests/integration/test_oh_editor.py
+++ b/tests/integration/test_oh_editor.py
@@ -699,3 +699,12 @@ def test_str_replace_and_insert_snippet_output_on_a_large_file(editor):
         new_str='Inserted line at 500',
     )
     assert ' 500\tInserted line at 500' in result.output
+
+
+def test_view_dense_chinese_markdown(tmp_path):
+    """view should handle UTF-8 markdown files with dense CJK content."""
+    md_file = tmp_path / 'test.md'
+    md_file.write_text('中文测试内容。' * 50, encoding='utf-8')
+    editor = OHEditor()
+    result = editor(command='view', path=str(md_file))
+    assert '中文测试内容' in result.output
diff --git a/tests/unit/test_file_validation.py b/tests/unit/test_file_validation.py
index 422f371..aba5249 100644
--- a/tests/unit/test_file_validation.py
+++ b/tests/unit/test_file_validation.py
@@ -95,3 +95,11 @@ def test_validate_image_file():
         editor.validate_file(image_file)

     assert 'file appears to be binary' in str(exc_info.value).lower()
+
+
+def test_validate_dense_cjk_utf8_not_binary(tmp_path):
+    """Dense CJK UTF-8 text should not be rejected as binary."""
+    f = tmp_path / 'chinese.md'
+    f.write_text('中文测试内容。' * 50, encoding='utf-8')
+    editor = OHEditor()
+    editor.validate_file(f)  # Should not raise