From 6ecab99a89154c3a1179952f8f7976a54a0fc34c Mon Sep 17 00:00:00 2001 From: r266-tech Date: Wed, 24 Jun 2026 01:09:48 +0800 Subject: [PATCH] fix(parse): recognize .jsonl as a text file so upload encoding normalization applies Completes #2745, which added .jsonl to the vectorization text-extension set in embedding_utils.py but left the parallel upload-time encoding path treating .jsonl as non-text. is_text_file() decides text-vs-binary by exact suffix membership across CODE_EXTENSIONS + DOCUMENTATION_EXTENSIONS + ADDITIONAL_TEXT_EXTENSIONS, which had .json but not .jsonl (the suffix of data.jsonl is .jsonl, not .json). So detect_and_convert_encoding skipped UTF-8 normalization for a legacy-encoded .jsonl -- unlike .json -- which was then vectorized as text -- the exact class of mojibake that #2770 fixed. Add .jsonl to ADDITIONAL_TEXT_EXTENSIONS (next to .json); is_text_file() unions all three sets so one entry suffices. Behavior for every other extension is unchanged. Adds a test assertion. Refs #2745, #2744, #2770. 
--- openviking/parse/parsers/constants.py | 1 + tests/test_upload_utils.py | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/openviking/parse/parsers/constants.py b/openviking/parse/parsers/constants.py index 74609115e..e6f49f091 100644 --- a/openviking/parse/parsers/constants.py +++ b/openviking/parse/parsers/constants.py @@ -213,6 +213,7 @@ ".properties", ".toml", ".json", + ".jsonl", ".yaml", ".yml", ".xml", diff --git a/tests/test_upload_utils.py b/tests/test_upload_utils.py index d665714da..35c0041c2 100644 --- a/tests/test_upload_utils.py +++ b/tests/test_upload_utils.py @@ -111,6 +111,11 @@ def test_documentation_extensions(self) -> None: def test_additional_text_extensions(self) -> None: assert is_text_file("settings.ini") is True assert is_text_file("data.csv") is True + # .jsonl is treated as text (matching .json) so upload-time encoding + # normalization applies, mirroring its inclusion in the vectorization + # text-extension set (#2745); otherwise a legacy-encoded .jsonl skips + # UTF-8 normalization while .json does not (#2744/#2770). + assert is_text_file("data.jsonl") is True def test_non_text_extensions(self) -> None: assert is_text_file("photo.png") is False