From eff29b610ca07a6701ca6b1baeb2574c8e569572 Mon Sep 17 00:00:00 2001 From: kgruiz Date: Tue, 20 May 2025 14:04:14 -0400 Subject: [PATCH 1/6] Improve encoding handling --- PyTokenCounter/encoding_utils.py | 37 +++++++++++++++++++------------- Tests/Input/TestFile1252.txt | 1 + Tests/Runner.py | 16 +++++++++++++- 3 files changed, 38 insertions(+), 16 deletions(-) create mode 100644 Tests/Input/TestFile1252.txt diff --git a/PyTokenCounter/encoding_utils.py b/PyTokenCounter/encoding_utils.py index 76d22bd..f810f56 100644 --- a/PyTokenCounter/encoding_utils.py +++ b/PyTokenCounter/encoding_utils.py @@ -105,24 +105,31 @@ def ReadTextFile(filePath: Path | str) -> str: return "" - with file.open("rb") as binaryFile: + rawBytes = file.read_bytes() + detection = chardet.detect(rawBytes) + detectedEncoding = detection.get("encoding") + confidence = detection.get("confidence", 0) - detection = chardet.detect(binaryFile.read()) - encoding = detection["encoding"] + encodingsToTry: list[str] = [] + if detectedEncoding: + encodingsToTry.append(detectedEncoding) - if encoding: - - actualEncoding = encoding - encoding = "utf-8" + if confidence < 0.8: + for fallback in ["windows-1252", "utf-8", "latin-1"]: + if fallback not in encodingsToTry: + encodingsToTry.append(fallback) + for enc in encodingsToTry: try: - - return file.read_text(encoding=encoding) - + text = rawBytes.decode(enc) + if enc != "utf-8": + text = text.encode("utf-8").decode("utf-8") + return text except UnicodeDecodeError: + continue - raise UnsupportedEncodingError(encoding=actualEncoding, filePath=filePath) - - else: - - raise UnsupportedEncodingError(encoding=encoding, filePath=filePath) + raise UnsupportedEncodingError( + encoding=detectedEncoding, + filePath=filePath, + message=f"Failed to decode using encodings: {', '.join(encodingsToTry)}", + ) diff --git a/Tests/Input/TestFile1252.txt b/Tests/Input/TestFile1252.txt new file mode 100644 index 0000000..9f90628 --- /dev/null +++ b/Tests/Input/TestFile1252.txt 
@@ -0,0 +1 @@ +Café – résumé naïve fiancé diff --git a/Tests/Runner.py b/Tests/Runner.py index 4665c5c..8084dd1 100644 --- a/Tests/Runner.py +++ b/Tests/Runner.py @@ -4,8 +4,8 @@ import sys from pathlib import Path -import numpy as np import PyTokenCounter as tc +from PyTokenCounter.encoding_utils import ReadTextFile import tiktoken from PIL import Image from PyTokenCounter.cli import ParseFiles @@ -748,6 +748,19 @@ def TestParseFilesGlobRecursive(): ) +def TestReadTextFileWindows1252(): + """Ensure Windows-1252 encoded files are read correctly.""" + + filePath = Path(testInputDir, "TestFile1252.txt") + expected = "Café – résumé naïve fiancé" + result = ReadTextFile(filePath) + + if result != expected: + RaiseTestAssertion( + f"Windows-1252 file was not read correctly.\nExpected: '{expected}'\nGot: '{result}'" + ) + + def TestStr(): """ Test string tokenization. """ @@ -909,5 +922,6 @@ def TestFileError(imgPath): TestTokenizeFileErrorType() TestParseFilesGlob() TestParseFilesGlobRecursive() + TestReadTextFileWindows1252() print("All tests passed successfully!") From 2ab567b16aa011543c88e97d0f74f6dc918a8b43 Mon Sep 17 00:00:00 2001 From: kgruiz Date: Tue, 20 May 2025 14:08:11 -0400 Subject: [PATCH 2/6] Improve encoding error handling --- PyTokenCounter/file_tokens.py | 48 ++++++++++++++++++++++++++--------- Tests/Runner.py | 18 ++++++++----- 2 files changed, 48 insertions(+), 18 deletions(-) diff --git a/PyTokenCounter/file_tokens.py b/PyTokenCounter/file_tokens.py index 82e58e2..17d8be5 100644 --- a/PyTokenCounter/file_tokens.py +++ b/PyTokenCounter/file_tokens.py @@ -582,7 +582,9 @@ def TokenizeDir( mapTokens=mapTokens, ) - except UnicodeDecodeError: + except UnicodeDecodeError as e: + + encoding = e.encoding or "unknown" if excludeBinary: @@ -591,7 +593,9 @@ _UpdateTask( taskName=taskName, advance=1, - description=f"Skipping binary file {entry.relative_to(dirPath)}", + description=( + f"Skipping binary file {entry.relative_to(dirPath)} (encoding: 
{encoding})" + ), quiet=quiet, ) @@ -599,7 +603,9 @@ def TokenizeDir( else: - raise + raise UnsupportedEncodingError( + encoding=encoding, filePath=entry + ) from e if mapTokens: @@ -811,7 +817,9 @@ def GetNumTokenDir( mapTokens=False, ) - except UnicodeDecodeError: + except UnicodeDecodeError as e: + + encoding = e.encoding or "unknown" if excludeBinary: @@ -820,7 +828,9 @@ def GetNumTokenDir( _UpdateTask( taskName=taskName, advance=1, - description=f"Skipping binary file {entry.relative_to(dirPath)}", + description=( + f"Skipping binary file {entry.relative_to(dirPath)} (encoding: {encoding})" + ), quiet=quiet, ) @@ -828,7 +838,9 @@ def GetNumTokenDir( else: - raise + raise UnsupportedEncodingError( + encoding=encoding, filePath=entry + ) from e if mapTokens: @@ -1051,7 +1063,9 @@ def TokenizeFiles( mapTokens=mapTokens, ) - except UnicodeDecodeError: + except UnicodeDecodeError as e: + + encoding = e.encoding or "unknown" if excludeBinary: @@ -1060,7 +1074,9 @@ def TokenizeFiles( _UpdateTask( taskName="Tokenizing File/Directory List", advance=1, - description=f"Skipping binary file {entry.name}", + description=( + f"Skipping binary file {entry.name} (encoding: {encoding})" + ), quiet=quiet, ) @@ -1068,7 +1084,9 @@ def TokenizeFiles( else: - raise + raise UnsupportedEncodingError( + encoding=encoding, filePath=entry + ) from e if mapTokens: @@ -1329,7 +1347,9 @@ def GetNumTokenFiles( mapTokens=False, ) - except UnicodeDecodeError: + except UnicodeDecodeError as e: + + encoding = e.encoding or "unknown" if excludeBinary: @@ -1338,7 +1358,9 @@ def GetNumTokenFiles( _UpdateTask( taskName="Counting Tokens in File/Directory List", advance=1, - description=f"Skipping binary file {entry.name}", + description=( + f"Skipping binary file {entry.name} (encoding: {encoding})" + ), quiet=quiet, ) @@ -1346,7 +1368,9 @@ def GetNumTokenFiles( else: - raise + raise UnsupportedEncodingError( + encoding=encoding, filePath=entry + ) from e if mapTokens: diff --git a/Tests/Runner.py 
b/Tests/Runner.py index 4665c5c..2c8b2b8 100644 --- a/Tests/Runner.py +++ b/Tests/Runner.py @@ -171,6 +171,7 @@ def TestTokenizeFilesMultiple(): """ inputFiles = [ Path(testInputDir, "TestFile1.txt"), + Path(testInputDir, "TestImg.jpg"), Path(testInputDir, "TestFile2.txt"), ] answerFiles = [ @@ -179,7 +180,7 @@ def TestTokenizeFilesMultiple(): ] expectedTokenLists = {} - for inputFile, answerFile in zip(inputFiles, answerFiles): + for inputFile, answerFile in zip([inputFiles[0], inputFiles[2]], answerFiles): with answerFile.open("r") as file: answer = json.load(file) expectedTokenLists[inputFile.name] = answer["tokens"] @@ -302,6 +303,7 @@ def TestTokenizeFilesListQuietFalse(): """ inputFiles = [ Path(testInputDir, "TestFile1.txt"), + Path(testInputDir, "TestImg.jpg"), Path(testInputDir, "TestFile2.txt"), ] answerFiles = [ @@ -310,7 +312,7 @@ def TestTokenizeFilesListQuietFalse(): ] expectedTokenLists = {} - for inputFile, answerFile in zip(inputFiles, answerFiles): + for inputFile, answerFile in zip([inputFiles[0], inputFiles[2]], answerFiles): with answerFile.open("r") as file: answer = json.load(file) expectedTokenLists[inputFile.name] = answer["tokens"] @@ -327,9 +329,9 @@ def TestTokenizeFilesListQuietFalse(): # Check if any progress messages were printed output = capturedOutput.getvalue() - if not output.strip(): + if "Skipping binary file TestImg.jpg" not in output: RaiseTestAssertion( - "Expected progress messages to be printed when quiet=False, but no output was captured." + "Expected skip message for binary file was not printed when quiet=False." 
) # Verify tokenization results @@ -674,8 +676,12 @@ def TestTokenizeFileWithUnsupportedEncoding(): try: tc.TokenizeFile(filePath=unsupportedFilePath, model="gpt-4o", quiet=True) - except tc.UnsupportedEncodingError: - pass # Expected exception + except tc.UnsupportedEncodingError as e: + message = str(e) + if str(unsupportedFilePath) not in message or "encoding" not in message: + RaiseTestAssertion( + "Error message did not include file path and encoding information" + ) except Exception as e: RaiseTestAssertion( f"Test Failed: Unexpected error type raised for file '{unsupportedFilePath}' - {type(e).__name__}" From f5eba4b9cab86f22b0c51d570d38c0fac678a1bc Mon Sep 17 00:00:00 2001 From: kgruiz Date: Tue, 20 May 2025 18:32:48 -0400 Subject: [PATCH 3/6] Convert tests to pytest and add CI --- .github/workflows/ci.yml | 47 +++++++++++ README.md | 10 +++ pyproject.toml | 4 +- pytest.ini | 4 + tests/test_models.py | 152 ++++++++++++++++++++++++++++++++++ tests/test_tokenization.py | 161 +++++++++++++++++++++++++++++++++++++ tests/test_utils.py | 42 ++++++++++ 7 files changed, 419 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/ci.yml create mode 100644 pytest.ini create mode 100644 tests/test_models.py create mode 100644 tests/test_tokenization.py create mode 100644 tests/test_utils.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..23c49e1 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,47 @@ +name: CI + +on: + push: + branches: [ main ] + paths-ignore: + - '**/*.md' + pull_request: + paths-ignore: + - '**/*.md' + +jobs: + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.x' + - name: Detect comment-only changes + id: changes + run: | + git fetch --depth=2 origin ${{ github.base_ref }} + CHANGED=$(git diff --name-only HEAD origin/${{ github.base_ref }} -- '*.py') + if [ -z "$CHANGED" ]; 
then + echo "run=false" >> $GITHUB_OUTPUT + else + DIFF=$(git diff origin/${{ github.base_ref }} HEAD -- '*.py' | grep '^+' | grep -v '+++' | grep -v '^+\s*#') + if [ -z "$DIFF" ]; then + echo "run=false" >> $GITHUB_OUTPUT + else + echo "run=true" >> $GITHUB_OUTPUT + fi + fi + - name: Install dependencies + if: steps.changes.outputs.run == 'true' + run: | + python -m pip install --upgrade pip + pip install -e . + pip install flake8 + - name: Run flake8 + if: steps.changes.outputs.run == 'true' + run: flake8 . + - name: Run pytest + if: steps.changes.outputs.run == 'true' + run: pytest diff --git a/README.md b/README.md index ff0fcc4..ac434c7 100644 --- a/README.md +++ b/README.md @@ -1070,6 +1070,16 @@ Along with ignoring the extensions in the exclude list to quickly bypass known f --- +## Running Tests + +After installing the package's dependencies, run the test suite with `pytest`: + +```bash +pytest +``` + +The tests rely on the sample data located in the `Tests/` directory. + ## Maintainers - [Kaden Gruizenga](https://github.com/kgruiz) diff --git a/pyproject.toml b/pyproject.toml index 782b15c..db3d81a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,6 +13,8 @@ dependencies = [ "rich>=13.9.4", "chardet>=5.2.0", "colorlog>=6.9.0", + "pytest>=7.4.0", + "flake8>=6.1.0", ] classifiers = [ @@ -62,4 +64,4 @@ requires = ["setuptools>=43.0.0", "wheel"] build-backend = "setuptools.build_meta" [tool.setuptools.packages.find] -exclude = ["Tests"] \ No newline at end of file +exclude = ["Tests"] diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..52387a2 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,4 @@ +[pytest] +addopts = -v +pythonpaths = PyTokenCounter + diff --git a/tests/test_models.py b/tests/test_models.py new file mode 100644 index 0000000..f7ce786 --- /dev/null +++ b/tests/test_models.py @@ -0,0 +1,152 @@ +import json +from pathlib import Path +import pytest +import tiktoken +import PyTokenCounter as tc + +TEST_DIR = 
Path(__file__).resolve().parent.parent / "Tests" +ANSWERS_DIR = TEST_DIR / "Answers" + + +def test_get_model_mappings(): + expected = { + "gpt-4o": "o200k_base", + "gpt-4o-mini": "o200k_base", + "gpt-4-turbo": "cl100k_base", + "gpt-4": "cl100k_base", + "gpt-3.5-turbo": "cl100k_base", + "text-embedding-ada-002": "cl100k_base", + "text-embedding-3-small": "cl100k_base", + "text-embedding-3-large": "cl100k_base", + "Codex models": "p50k_base", + "text-davinci-002": "p50k_base", + "text-davinci-003": "p50k_base", + "GPT-3 models like davinci": "r50k_base", + } + assert tc.GetModelMappings() == expected + + +def test_get_valid_models(): + expected = [ + "gpt-4o", + "gpt-4o-mini", + "gpt-4-turbo", + "gpt-4", + "gpt-3.5-turbo", + "text-embedding-ada-002", + "text-embedding-3-small", + "text-embedding-3-large", + "Codex models", + "text-davinci-002", + "text-davinci-003", + "GPT-3 models like davinci", + ] + assert set(tc.GetValidModels()) == set(expected) + + +def test_get_valid_encodings(): + expected = ["o200k_base", "cl100k_base", "p50k_base", "r50k_base"] + assert set(tc.GetValidEncodings()) == set(expected) + + +def test_get_model_for_encoding(): + mapping = { + "o200k_base": ["gpt-4o", "gpt-4o-mini"], + "cl100k_base": [ + "gpt-3.5-turbo", + "gpt-4", + "gpt-4-turbo", + "text-embedding-3-large", + "text-embedding-3-small", + "text-embedding-ada-002", + ], + "p50k_base": ["Codex models", "text-davinci-002", "text-davinci-003"], + "r50k_base": "GPT-3 models like davinci", + } + for name, expected in mapping.items(): + encoding = tiktoken.get_encoding(encoding_name=name) + result = tc.GetModelForEncoding(encoding=encoding) + if isinstance(expected, list): + assert sorted(result) == sorted(expected) + else: + assert result == expected + + +def test_get_model_for_encoding_name(): + mapping = { + "o200k_base": ["gpt-4o", "gpt-4o-mini"], + "cl100k_base": [ + "gpt-3.5-turbo", + "gpt-4", + "gpt-4-turbo", + "text-embedding-3-large", + "text-embedding-3-small", + 
"text-embedding-ada-002", + ], + "p50k_base": ["Codex models", "text-davinci-002", "text-davinci-003"], + "r50k_base": "GPT-3 models like davinci", + } + for name, expected in mapping.items(): + result = tc.GetModelForEncodingName(encodingName=name) + if isinstance(expected, list): + assert sorted(result) == sorted(expected) + else: + assert result == expected + + +def test_get_encoding_for_model(): + mapping = { + "gpt-4o": "o200k_base", + "gpt-4o-mini": "o200k_base", + "gpt-4-turbo": "cl100k_base", + "gpt-4": "cl100k_base", + "gpt-3.5-turbo": "cl100k_base", + "text-embedding-ada-002": "cl100k_base", + "text-embedding-3-small": "cl100k_base", + "text-embedding-3-large": "cl100k_base", + "text-davinci-002": "p50k_base", + "text-davinci-003": "p50k_base", + } + for model, encoding_name in mapping.items(): + result = tc.GetEncodingForModel(modelName=model) + assert result.name == encoding_name + + +def test_get_encoding_name_for_model(): + mapping = { + "gpt-4o": "o200k_base", + "gpt-4o-mini": "o200k_base", + "gpt-4-turbo": "cl100k_base", + "gpt-4": "cl100k_base", + "gpt-3.5-turbo": "cl100k_base", + "text-embedding-ada-002": "cl100k_base", + "text-embedding-3-small": "cl100k_base", + "text-embedding-3-large": "cl100k_base", + "Codex models": "p50k_base", + "text-davinci-002": "p50k_base", + "text-davinci-003": "p50k_base", + "GPT-3 models like davinci": "r50k_base", + } + for model, encoding_name in mapping.items(): + result = tc.GetEncodingNameForModel(modelName=model) + assert result == encoding_name + + +def test_get_encoding(): + encoding = tc.GetEncoding(model="gpt-3.5-turbo") + assert encoding.name == "cl100k_base" + + encoding = tc.GetEncoding(encodingName="p50k_base") + assert encoding.name == "p50k_base" + + encoding = tc.GetEncoding(model="gpt-4-turbo", encodingName="cl100k_base") + assert encoding.name == "cl100k_base" + + with pytest.raises(ValueError): + tc.GetEncoding(model="gpt-3.5-turbo", encodingName="p50k_base") + + +def test_get_encoding_error(): + 
with pytest.raises(ValueError): + tc.GetEncoding() + diff --git a/tests/test_tokenization.py b/tests/test_tokenization.py new file mode 100644 index 0000000..5035681 --- /dev/null +++ b/tests/test_tokenization.py @@ -0,0 +1,161 @@ +import json +from pathlib import Path +import io +import sys +import PyTokenCounter as tc +from PyTokenCounter.cli import ParseFiles + +# Paths to test resources +TEST_DIR = Path(__file__).resolve().parent.parent / "Tests" +INPUT_DIR = TEST_DIR / "Input" +ANSWERS_DIR = TEST_DIR / "Answers" + + +def load_answer(name: str): + with (ANSWERS_DIR / name).open("r") as f: + return json.load(f) + +def test_tokenize_directory(): + expected = load_answer("TestDirectory.json") + result = tc.TokenizeDir( + dirPath=INPUT_DIR / "TestDirectory", + model="gpt-4o", + recursive=True, + quiet=True, + ) + assert isinstance(result, dict) + assert result == expected + + +def test_tokenize_files_with_directory(): + expected = load_answer("TestDirectory.json") + result = tc.TokenizeFiles(INPUT_DIR / "TestDirectory", model="gpt-4o", quiet=True) + assert isinstance(result, dict) + assert result == expected + + +def test_tokenize_files_multiple(): + files = [ + INPUT_DIR / "TestFile1.txt", + INPUT_DIR / "TestImg.jpg", + INPUT_DIR / "TestFile2.txt", + ] + answers = [load_answer("TestFile1.json"), load_answer("TestFile2.json")] + result = tc.TokenizeFiles(files, model="gpt-4o", quiet=True) + expected = { + "TestFile1.txt": {"tokens": answers[0]["tokens"]}, + "TestFile2.txt": {"tokens": answers[1]["tokens"]}, + } + assert isinstance(result, dict) + assert result == expected + + +def test_tokenize_files_exit_on_list_error_false(): + files = [ + INPUT_DIR / "TestFile1.txt", + INPUT_DIR / "TestImg.jpg", + INPUT_DIR / "TestFile2.txt", + ] + answers = [load_answer("TestFile1.json"), load_answer("TestFile2.json")] + result = tc.TokenizeFiles(files, model="gpt-4o", quiet=True, exitOnListError=False) + expected = { + "TestFile1.txt": {"tokens": answers[0]["tokens"]}, + 
"TestFile2.txt": {"tokens": answers[1]["tokens"]}, + } + assert isinstance(result, dict) + assert result == expected + + +def test_tokenize_directory_no_recursion(): + expected = load_answer("TestDirectoryNoRecursion.json") + result = tc.TokenizeDir( + dirPath=INPUT_DIR / "TestDirectory", + model="gpt-4o", + recursive=False, + quiet=True, + ) + assert isinstance(result, dict) + assert result == expected + # ensure subdirectories not included + for entry in (INPUT_DIR / "TestDirectory").iterdir(): + if entry.is_dir(): + assert entry.name not in result + + +def test_tokenize_files_with_invalid_input(): + with pytest.raises(TypeError): + tc.TokenizeFiles(67890) + + +def test_tokenize_files_list_quiet_false(capsys): + files = [ + INPUT_DIR / "TestFile1.txt", + INPUT_DIR / "TestImg.jpg", + INPUT_DIR / "TestFile2.txt", + ] + answers = [load_answer("TestFile1.json"), load_answer("TestFile2.json")] + result = None + captured = io.StringIO() + sys_stdout = sys.stdout + sys.stdout = captured + try: + result = tc.TokenizeFiles(files, model="gpt-4o", quiet=False) + finally: + sys.stdout = sys_stdout + output = captured.getvalue() + assert "Skipping binary file TestImg.jpg" in output + expected = { + "TestFile1.txt": {"tokens": answers[0]["tokens"]}, + "TestFile2.txt": {"tokens": answers[1]["tokens"]}, + } + assert isinstance(result, dict) + assert result == expected + + +import pytest + +@pytest.mark.parametrize( + "input_name,answer_name", + [ + ("TestFile1.txt", "TestFile1.json"), + ("TestFile2.txt", "TestFile2.json"), + ], +) +def test_tokenize_file(input_name, answer_name): + answer = load_answer(answer_name) + result = tc.TokenizeFile(INPUT_DIR / input_name, model="gpt-4o", quiet=True) + assert result == answer["tokens"] + count = tc.GetNumTokenFile(INPUT_DIR / input_name, model="gpt-4o", quiet=True) + assert count == answer["numTokens"] + + +def test_tokenize_file_error(): + with pytest.raises(tc.UnsupportedEncodingError): + tc.TokenizeFile(INPUT_DIR / "TestImg.jpg", 
model="gpt-4o", quiet=True) + + +def test_tokenize_file_error_type(): + with pytest.raises(TypeError): + tc.TokenizeFile(54321, model="gpt-4o", quiet=True) + + +def test_tokenize_file_with_unsupported_encoding(): + path = INPUT_DIR / "TestImg.jpg" + with pytest.raises(tc.UnsupportedEncodingError) as exc: + tc.TokenizeFile(filePath=path, model="gpt-4o", quiet=True) + assert str(path) in str(exc.value) + assert "encoding" in str(exc.value) + + +def test_tokenize_str(): + expected_strings = { + "Hail to the Victors!": [39, 663, 316, 290, 16566, 914, 0], + "2024 National Champions": [1323, 19, 6743, 40544], + "Corum 4 Heisman": [11534, 394, 220, 19, 1679, 107107], + } + for text, tokens in expected_strings.items(): + result = tc.TokenizeStr(string=text, model="gpt-4o", quiet=True) + assert result == tokens + count = tc.GetNumTokenStr(string=text, model="gpt-4o", quiet=True) + assert count == len(tokens) + diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 0000000..5f2266b --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,42 @@ +import json +from pathlib import Path +import PyTokenCounter as tc +from PyTokenCounter.encoding_utils import ReadTextFile +from PyTokenCounter.cli import ParseFiles + +TEST_DIR = Path(__file__).resolve().parent.parent / "Tests" +INPUT_DIR = TEST_DIR / "Input" +ANSWERS_DIR = TEST_DIR / "Answers" + + +def test_parse_files_glob(): + pattern = "Input/TestFile*.txt" + expected = { + "Input/TestFile1.txt", + "Input/TestFile2.txt", + } + result = set(ParseFiles([pattern])) + assert result == expected + + +def test_parse_files_glob_recursive(): + pattern = "Input/**/*.txt" + expected = { + "Input/TestFile1.txt", + "Input/TestFile2.txt", + "Input/TestDirectory/TestDir1.txt", + "Input/TestDirectory/TestDir2.txt", + "Input/TestDirectory/TestDir3.txt", + "Input/TestDirectory/TestSubDir/TestDir4.txt", + "Input/TestDirectory/TestSubDir/TestDir5.txt", + } + result = set(ParseFiles([pattern])) + assert result == expected + + 
+def test_read_text_file_windows1252(): + file_path = INPUT_DIR / "TestFile1252.txt" + expected = "Café – résumé naïve fiancé" + result = ReadTextFile(file_path) + assert result == expected + From d6284a76bf672ad1e9cae67414551d118f332ccb Mon Sep 17 00:00:00 2001 From: Kaden Gruizenga Date: Thu, 21 Aug 2025 22:29:12 -0400 Subject: [PATCH 4/6] fix(progress): prevent unfinished bars by matching totals to processed items Update _CountDirFiles to respect includeHidden/excludeBinary and pass flags in TokenizeDir/GetNumTokenDir. --- PyTokenCounter/file_tokens.py | 53 +++++++++++++++++++++++++++++------ 1 file changed, 44 insertions(+), 9 deletions(-) diff --git a/PyTokenCounter/file_tokens.py b/PyTokenCounter/file_tokens.py index 17d8be5..f211d2a 100644 --- a/PyTokenCounter/file_tokens.py +++ b/PyTokenCounter/file_tokens.py @@ -7,7 +7,13 @@ from .progress import _InitializeTask, _UpdateTask, _tasks from .core import BINARY_EXTENSIONS, TokenizeStr -def _CountDirFiles(dirPath: Path, recursive: bool = True) -> int: +def _CountDirFiles( + dirPath: Path, + recursive: bool = True, + *, + includeHidden: bool = False, + excludeBinary: bool = True, +) -> int: """ Count the number of files in a directory. @@ -20,6 +26,10 @@ def _CountDirFiles(dirPath: Path, recursive: bool = True) -> int: The path to the directory in which to count files. recursive : bool, optional Whether to count files in subdirectories recursively (default is True). + includeHidden : bool, optional + Whether to include hidden files and directories (default is False). + excludeBinary : bool, optional + Whether to exclude binary files based on extension (default is True). 
Returns ------- @@ -42,17 +52,33 @@ def _CountDirFiles(dirPath: Path, recursive: bool = True) -> int: for entry in dirPath.iterdir(): - if entry.is_dir(): + # Skip hidden files and directories entirely if not including hidden + if not includeHidden and entry.name.startswith("."): + continue - numFiles += _CountDirFiles(entry, recursive=recursive) + if entry.is_dir(): + # Recurse into subdirectories (respect hidden handling) + numFiles += _CountDirFiles( + entry, + recursive=recursive, + includeHidden=includeHidden, + excludeBinary=excludeBinary, + ) else: - + # Optionally skip binary files + if excludeBinary and entry.suffix.lower() in BINARY_EXTENSIONS: + continue numFiles += 1 else: - - numFiles = sum(1 for entry in dirPath.iterdir() if entry.is_file()) + for entry in dirPath.iterdir(): + if entry.is_file(): + if not includeHidden and entry.name.startswith("."): + continue + if excludeBinary and entry.suffix.lower() in BINARY_EXTENSIONS: + continue + numFiles += 1 return numFiles @@ -519,7 +545,12 @@ def TokenizeDir( raise ValueError(f'Given directory path "{dirPath}" is not a directory.') - numFiles = _CountDirFiles(dirPath=dirPath, recursive=recursive) + numFiles = _CountDirFiles( + dirPath=dirPath, + recursive=recursive, + includeHidden=includeHidden, + excludeBinary=excludeBinary, + ) if not quiet: @@ -748,7 +779,12 @@ def GetNumTokenDir( raise ValueError(f'Given path "{dirPath}" is not a directory.') - numFiles = _CountDirFiles(dirPath=dirPath, recursive=recursive) + numFiles = _CountDirFiles( + dirPath=dirPath, + recursive=recursive, + includeHidden=includeHidden, + excludeBinary=excludeBinary, + ) if not quiet: @@ -1478,4 +1514,3 @@ def GetNumTokenFiles( raise RuntimeError( f'Unexpected error. Given inputPath "{inputPath}" is neither a file, a directory, nor a list.' 
) - From c62db548a41751731236820f94bb346c1f2b5f02 Mon Sep 17 00:00:00 2001 From: Kaden Gruizenga Date: Thu, 21 Aug 2025 22:37:52 -0400 Subject: [PATCH 5/6] fix(cli): prevent error filepath wrapping by printing path outside panel and remove fixed console width --- PyTokenCounter/encoding_utils.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/PyTokenCounter/encoding_utils.py b/PyTokenCounter/encoding_utils.py index f810f56..7f9bd4f 100644 --- a/PyTokenCounter/encoding_utils.py +++ b/PyTokenCounter/encoding_utils.py @@ -44,19 +44,26 @@ def __init__( errorText.append("Detected encoding: ", style="green") errorText.append(f"{encoding}", style="bold") errorText.append("\n") - errorText.append("File path: ", style="green") - errorText.append(f"{filePath}", style="bold blue") + # Intentionally do not include file path inside the panel to avoid line wrapping issues panel = Panel( errorText, title="Encoding Error", title_align="left", border_style="red" ) - console = Console(width=80, color_system="truecolor", record=True) + console = Console(color_system="truecolor", record=True) with console.capture() as capture: console.print("") # Add a new line before the panel console.print(panel) + console.print("") + + # Print the file path outside the panel and prevent wrapping so it remains clickable + pathText = Text() + pathText.append("File path: ", style="green") + pathText.append(f"{filePath}", style="bold blue") + pathText.no_wrap = True + console.print(pathText) captured = capture.get() # Store the formatted panel; pass a plain message to the base Exception From 2b76fd6987c9330620a7f077476efaf7cdbb2419 Mon Sep 17 00:00:00 2001 From: Kaden Gruizenga Date: Thu, 21 Aug 2025 22:43:55 -0400 Subject: [PATCH 6/6] chore(release): bump version to v1.8.0 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index db3d81a..ddd2178 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 
+1,6 @@ [project] name = "PyTokenCounter" -version = "1.7.0" +version = "1.8.0" description = "A Python library for tokenizing text and counting tokens using various encoding schemes." readme = {file = "README.md", content-type = "text/markdown"} requires-python = ">=3.11"