From eff29b610ca07a6701ca6b1baeb2574c8e569572 Mon Sep 17 00:00:00 2001 From: kgruiz Date: Tue, 20 May 2025 14:04:14 -0400 Subject: [PATCH 1/6] Improve encoding handling --- PyTokenCounter/encoding_utils.py | 37 +++++++++++++++++++------------- Tests/Input/TestFile1252.txt | 1 + Tests/Runner.py | 16 +++++++++++++- 3 files changed, 38 insertions(+), 16 deletions(-) create mode 100644 Tests/Input/TestFile1252.txt diff --git a/PyTokenCounter/encoding_utils.py b/PyTokenCounter/encoding_utils.py index 76d22bd..f810f56 100644 --- a/PyTokenCounter/encoding_utils.py +++ b/PyTokenCounter/encoding_utils.py @@ -105,24 +105,31 @@ def ReadTextFile(filePath: Path | str) -> str: return "" - with file.open("rb") as binaryFile: + rawBytes = file.read_bytes() + detection = chardet.detect(rawBytes) + detectedEncoding = detection.get("encoding") + confidence = detection.get("confidence", 0) - detection = chardet.detect(binaryFile.read()) - encoding = detection["encoding"] + encodingsToTry: list[str] = [] + if detectedEncoding: + encodingsToTry.append(detectedEncoding) - if encoding: - - actualEncoding = encoding - encoding = "utf-8" + if confidence < 0.8: + for fallback in ["windows-1252", "utf-8", "latin-1"]: + if fallback not in encodingsToTry: + encodingsToTry.append(fallback) + for enc in encodingsToTry: try: - - return file.read_text(encoding=encoding) - + text = rawBytes.decode(enc) + if enc != "utf-8": + text = text.encode("utf-8").decode("utf-8") + return text except UnicodeDecodeError: + continue - raise UnsupportedEncodingError(encoding=actualEncoding, filePath=filePath) - - else: - - raise UnsupportedEncodingError(encoding=encoding, filePath=filePath) + raise UnsupportedEncodingError( + encoding=detectedEncoding, + filePath=filePath, + message=f"Failed to decode using encodings: {', '.join(encodingsToTry)}", + ) diff --git a/Tests/Input/TestFile1252.txt b/Tests/Input/TestFile1252.txt new file mode 100644 index 0000000..9f90628 --- /dev/null +++ b/Tests/Input/TestFile1252.txt 
@@ -0,0 +1 @@ +Café – résumé naïve fiancé diff --git a/Tests/Runner.py b/Tests/Runner.py index 4665c5c..8084dd1 100644 --- a/Tests/Runner.py +++ b/Tests/Runner.py @@ -4,8 +4,8 @@ import sys from pathlib import Path -import numpy as np import PyTokenCounter as tc +from PyTokenCounter.encoding_utils import ReadTextFile import tiktoken from PIL import Image from PyTokenCounter.cli import ParseFiles @@ -748,6 +748,19 @@ def TestParseFilesGlobRecursive(): ) +def TestReadTextFileWindows1252(): + """Ensure Windows-1252 encoded files are read correctly.""" + + filePath = Path(testInputDir, "TestFile1252.txt") + expected = "Café – résumé naïve fiancé" + result = ReadTextFile(filePath) + + if result != expected: + RaiseTestAssertion( + f"Windows-1252 file was not read correctly.\nExpected: '{expected}'\nGot: '{result}'" + ) + + def TestStr(): """ Test string tokenization. """ @@ -909,5 +922,6 @@ def TestFileError(imgPath): TestTokenizeFileErrorType() TestParseFilesGlob() TestParseFilesGlobRecursive() + TestReadTextFileWindows1252() print("All tests passed successfully!") From 2ab567b16aa011543c88e97d0f74f6dc918a8b43 Mon Sep 17 00:00:00 2001 From: kgruiz Date: Tue, 20 May 2025 14:08:11 -0400 Subject: [PATCH 2/6] Improve encoding error handling --- PyTokenCounter/file_tokens.py | 48 ++++++++++++++++++++++++++--------- Tests/Runner.py | 18 ++++++++----- 2 files changed, 48 insertions(+), 18 deletions(-) diff --git a/PyTokenCounter/file_tokens.py b/PyTokenCounter/file_tokens.py index 82e58e2..17d8be5 100644 --- a/PyTokenCounter/file_tokens.py +++ b/PyTokenCounter/file_tokens.py @@ -582,7 +582,9 @@ def TokenizeDir( mapTokens=mapTokens, ) - except UnicodeDecodeError: + except UnicodeDecodeError as e: + + encoding = e.encoding or "unknown" if excludeBinary: @@ -591,7 +593,9 @@ _UpdateTask( taskName=taskName, advance=1, - description=f"Skipping binary file {entry.relative_to(dirPath)}", + description=( + f"Skipping binary file {entry.relative_to(dirPath)} (encoding: 
{encoding})" + ), quiet=quiet, ) @@ -599,7 +603,9 @@ def TokenizeDir( else: - raise + raise UnsupportedEncodingError( + encoding=encoding, filePath=entry + ) from e if mapTokens: @@ -811,7 +817,9 @@ def GetNumTokenDir( mapTokens=False, ) - except UnicodeDecodeError: + except UnicodeDecodeError as e: + + encoding = e.encoding or "unknown" if excludeBinary: @@ -820,7 +828,9 @@ def GetNumTokenDir( _UpdateTask( taskName=taskName, advance=1, - description=f"Skipping binary file {entry.relative_to(dirPath)}", + description=( + f"Skipping binary file {entry.relative_to(dirPath)} (encoding: {encoding})" + ), quiet=quiet, ) @@ -828,7 +838,9 @@ def GetNumTokenDir( else: - raise + raise UnsupportedEncodingError( + encoding=encoding, filePath=entry + ) from e if mapTokens: @@ -1051,7 +1063,9 @@ def TokenizeFiles( mapTokens=mapTokens, ) - except UnicodeDecodeError: + except UnicodeDecodeError as e: + + encoding = e.encoding or "unknown" if excludeBinary: @@ -1060,7 +1074,9 @@ def TokenizeFiles( _UpdateTask( taskName="Tokenizing File/Directory List", advance=1, - description=f"Skipping binary file {entry.name}", + description=( + f"Skipping binary file {entry.name} (encoding: {encoding})" + ), quiet=quiet, ) @@ -1068,7 +1084,9 @@ def TokenizeFiles( else: - raise + raise UnsupportedEncodingError( + encoding=encoding, filePath=entry + ) from e if mapTokens: @@ -1329,7 +1347,9 @@ def GetNumTokenFiles( mapTokens=False, ) - except UnicodeDecodeError: + except UnicodeDecodeError as e: + + encoding = e.encoding or "unknown" if excludeBinary: @@ -1338,7 +1358,9 @@ def GetNumTokenFiles( _UpdateTask( taskName="Counting Tokens in File/Directory List", advance=1, - description=f"Skipping binary file {entry.name}", + description=( + f"Skipping binary file {entry.name} (encoding: {encoding})" + ), quiet=quiet, ) @@ -1346,7 +1368,9 @@ def GetNumTokenFiles( else: - raise + raise UnsupportedEncodingError( + encoding=encoding, filePath=entry + ) from e if mapTokens: diff --git a/Tests/Runner.py 
b/Tests/Runner.py index 4665c5c..2c8b2b8 100644 --- a/Tests/Runner.py +++ b/Tests/Runner.py @@ -171,6 +171,7 @@ def TestTokenizeFilesMultiple(): """ inputFiles = [ Path(testInputDir, "TestFile1.txt"), + Path(testInputDir, "TestImg.jpg"), Path(testInputDir, "TestFile2.txt"), ] answerFiles = [ @@ -179,7 +180,7 @@ def TestTokenizeFilesMultiple(): ] expectedTokenLists = {} - for inputFile, answerFile in zip(inputFiles, answerFiles): + for inputFile, answerFile in zip([inputFiles[0], inputFiles[2]], answerFiles): with answerFile.open("r") as file: answer = json.load(file) expectedTokenLists[inputFile.name] = answer["tokens"] @@ -302,6 +303,7 @@ def TestTokenizeFilesListQuietFalse(): """ inputFiles = [ Path(testInputDir, "TestFile1.txt"), + Path(testInputDir, "TestImg.jpg"), Path(testInputDir, "TestFile2.txt"), ] answerFiles = [ @@ -310,7 +312,7 @@ def TestTokenizeFilesListQuietFalse(): ] expectedTokenLists = {} - for inputFile, answerFile in zip(inputFiles, answerFiles): + for inputFile, answerFile in zip([inputFiles[0], inputFiles[2]], answerFiles): with answerFile.open("r") as file: answer = json.load(file) expectedTokenLists[inputFile.name] = answer["tokens"] @@ -327,9 +329,9 @@ def TestTokenizeFilesListQuietFalse(): # Check if any progress messages were printed output = capturedOutput.getvalue() - if not output.strip(): + if "Skipping binary file TestImg.jpg" not in output: RaiseTestAssertion( - "Expected progress messages to be printed when quiet=False, but no output was captured." + "Expected skip message for binary file was not printed when quiet=False." 
) # Verify tokenization results @@ -674,8 +676,12 @@ def TestTokenizeFileWithUnsupportedEncoding(): try: tc.TokenizeFile(filePath=unsupportedFilePath, model="gpt-4o", quiet=True) - except tc.UnsupportedEncodingError: - pass # Expected exception + except tc.UnsupportedEncodingError as e: + message = str(e) + if str(unsupportedFilePath) not in message or "encoding" not in message: + RaiseTestAssertion( + "Error message did not include file path and encoding information" + ) except Exception as e: RaiseTestAssertion( f"Test Failed: Unexpected error type raised for file '{unsupportedFilePath}' - {type(e).__name__}" From f5eba4b9cab86f22b0c51d570d38c0fac678a1bc Mon Sep 17 00:00:00 2001 From: kgruiz Date: Tue, 20 May 2025 18:32:48 -0400 Subject: [PATCH 3/6] Convert tests to pytest and add CI --- .github/workflows/ci.yml | 47 +++++++++++ README.md | 10 +++ pyproject.toml | 4 +- pytest.ini | 4 + tests/test_models.py | 152 ++++++++++++++++++++++++++++++++++ tests/test_tokenization.py | 161 +++++++++++++++++++++++++++++++++++++ tests/test_utils.py | 42 ++++++++++ 7 files changed, 419 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/ci.yml create mode 100644 pytest.ini create mode 100644 tests/test_models.py create mode 100644 tests/test_tokenization.py create mode 100644 tests/test_utils.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..23c49e1 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,47 @@ +name: CI + +on: + push: + branches: [ main ] + paths-ignore: + - '**/*.md' + pull_request: + paths-ignore: + - '**/*.md' + +jobs: + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.x' + - name: Detect comment-only changes + id: changes + run: | + git fetch --depth=2 origin ${{ github.base_ref }} + CHANGED=$(git diff --name-only HEAD origin/${{ github.base_ref }} -- '*.py') + if [ -z "$CHANGED" ]; 
then + echo "run=false" >> $GITHUB_OUTPUT + else + DIFF=$(git diff origin/${{ github.base_ref }} HEAD -- '*.py' | grep '^+' | grep -v '+++' | grep -v '^+\s*#') + if [ -z "$DIFF" ]; then + echo "run=false" >> $GITHUB_OUTPUT + else + echo "run=true" >> $GITHUB_OUTPUT + fi + fi + - name: Install dependencies + if: steps.changes.outputs.run == 'true' + run: | + python -m pip install --upgrade pip + pip install -e . + pip install flake8 + - name: Run flake8 + if: steps.changes.outputs.run == 'true' + run: flake8 . + - name: Run pytest + if: steps.changes.outputs.run == 'true' + run: pytest diff --git a/README.md b/README.md index ff0fcc4..ac434c7 100644 --- a/README.md +++ b/README.md @@ -1070,6 +1070,16 @@ Along with ignoring the extensions in the exclude list to quickly bypass known f --- +## Running Tests + +After installing the package's dependencies, run the test suite with `pytest`: + +```bash +pytest +``` + +The tests rely on the sample data located in the `Tests/` directory. + ## Maintainers - [Kaden Gruizenga](https://github.com/kgruiz) diff --git a/pyproject.toml b/pyproject.toml index 782b15c..db3d81a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,6 +13,8 @@ dependencies = [ "rich>=13.9.4", "chardet>=5.2.0", "colorlog>=6.9.0", + "pytest>=7.4.0", + "flake8>=6.1.0", ] classifiers = [ @@ -62,4 +64,4 @@ requires = ["setuptools>=43.0.0", "wheel"] build-backend = "setuptools.build_meta" [tool.setuptools.packages.find] -exclude = ["Tests"] \ No newline at end of file +exclude = ["Tests"] diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..52387a2 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,4 @@ +[pytest] +addopts = -v +pythonpaths = PyTokenCounter + diff --git a/tests/test_models.py b/tests/test_models.py new file mode 100644 index 0000000..f7ce786 --- /dev/null +++ b/tests/test_models.py @@ -0,0 +1,152 @@ +import json +from pathlib import Path +import pytest +import tiktoken +import PyTokenCounter as tc + +TEST_DIR = 
Path(__file__).resolve().parent.parent / "Tests" +ANSWERS_DIR = TEST_DIR / "Answers" + + +def test_get_model_mappings(): + expected = { + "gpt-4o": "o200k_base", + "gpt-4o-mini": "o200k_base", + "gpt-4-turbo": "cl100k_base", + "gpt-4": "cl100k_base", + "gpt-3.5-turbo": "cl100k_base", + "text-embedding-ada-002": "cl100k_base", + "text-embedding-3-small": "cl100k_base", + "text-embedding-3-large": "cl100k_base", + "Codex models": "p50k_base", + "text-davinci-002": "p50k_base", + "text-davinci-003": "p50k_base", + "GPT-3 models like davinci": "r50k_base", + } + assert tc.GetModelMappings() == expected + + +def test_get_valid_models(): + expected = [ + "gpt-4o", + "gpt-4o-mini", + "gpt-4-turbo", + "gpt-4", + "gpt-3.5-turbo", + "text-embedding-ada-002", + "text-embedding-3-small", + "text-embedding-3-large", + "Codex models", + "text-davinci-002", + "text-davinci-003", + "GPT-3 models like davinci", + ] + assert set(tc.GetValidModels()) == set(expected) + + +def test_get_valid_encodings(): + expected = ["o200k_base", "cl100k_base", "p50k_base", "r50k_base"] + assert set(tc.GetValidEncodings()) == set(expected) + + +def test_get_model_for_encoding(): + mapping = { + "o200k_base": ["gpt-4o", "gpt-4o-mini"], + "cl100k_base": [ + "gpt-3.5-turbo", + "gpt-4", + "gpt-4-turbo", + "text-embedding-3-large", + "text-embedding-3-small", + "text-embedding-ada-002", + ], + "p50k_base": ["Codex models", "text-davinci-002", "text-davinci-003"], + "r50k_base": "GPT-3 models like davinci", + } + for name, expected in mapping.items(): + encoding = tiktoken.get_encoding(encoding_name=name) + result = tc.GetModelForEncoding(encoding=encoding) + if isinstance(expected, list): + assert sorted(result) == sorted(expected) + else: + assert result == expected + + +def test_get_model_for_encoding_name(): + mapping = { + "o200k_base": ["gpt-4o", "gpt-4o-mini"], + "cl100k_base": [ + "gpt-3.5-turbo", + "gpt-4", + "gpt-4-turbo", + "text-embedding-3-large", + "text-embedding-3-small", + 
"text-embedding-ada-002", + ], + "p50k_base": ["Codex models", "text-davinci-002", "text-davinci-003"], + "r50k_base": "GPT-3 models like davinci", + } + for name, expected in mapping.items(): + result = tc.GetModelForEncodingName(encodingName=name) + if isinstance(expected, list): + assert sorted(result) == sorted(expected) + else: + assert result == expected + + +def test_get_encoding_for_model(): + mapping = { + "gpt-4o": "o200k_base", + "gpt-4o-mini": "o200k_base", + "gpt-4-turbo": "cl100k_base", + "gpt-4": "cl100k_base", + "gpt-3.5-turbo": "cl100k_base", + "text-embedding-ada-002": "cl100k_base", + "text-embedding-3-small": "cl100k_base", + "text-embedding-3-large": "cl100k_base", + "text-davinci-002": "p50k_base", + "text-davinci-003": "p50k_base", + } + for model, encoding_name in mapping.items(): + result = tc.GetEncodingForModel(modelName=model) + assert result.name == encoding_name + + +def test_get_encoding_name_for_model(): + mapping = { + "gpt-4o": "o200k_base", + "gpt-4o-mini": "o200k_base", + "gpt-4-turbo": "cl100k_base", + "gpt-4": "cl100k_base", + "gpt-3.5-turbo": "cl100k_base", + "text-embedding-ada-002": "cl100k_base", + "text-embedding-3-small": "cl100k_base", + "text-embedding-3-large": "cl100k_base", + "Codex models": "p50k_base", + "text-davinci-002": "p50k_base", + "text-davinci-003": "p50k_base", + "GPT-3 models like davinci": "r50k_base", + } + for model, encoding_name in mapping.items(): + result = tc.GetEncodingNameForModel(modelName=model) + assert result == encoding_name + + +def test_get_encoding(): + encoding = tc.GetEncoding(model="gpt-3.5-turbo") + assert encoding.name == "cl100k_base" + + encoding = tc.GetEncoding(encodingName="p50k_base") + assert encoding.name == "p50k_base" + + encoding = tc.GetEncoding(model="gpt-4-turbo", encodingName="cl100k_base") + assert encoding.name == "cl100k_base" + + with pytest.raises(ValueError): + tc.GetEncoding(model="gpt-3.5-turbo", encodingName="p50k_base") + + +def test_get_encoding_error(): + 
with pytest.raises(ValueError): + tc.GetEncoding() + diff --git a/tests/test_tokenization.py b/tests/test_tokenization.py new file mode 100644 index 0000000..5035681 --- /dev/null +++ b/tests/test_tokenization.py @@ -0,0 +1,161 @@ +import json +from pathlib import Path +import io +import sys +import PyTokenCounter as tc +from PyTokenCounter.cli import ParseFiles + +# Paths to test resources +TEST_DIR = Path(__file__).resolve().parent.parent / "Tests" +INPUT_DIR = TEST_DIR / "Input" +ANSWERS_DIR = TEST_DIR / "Answers" + + +def load_answer(name: str): + with (ANSWERS_DIR / name).open("r") as f: + return json.load(f) + +def test_tokenize_directory(): + expected = load_answer("TestDirectory.json") + result = tc.TokenizeDir( + dirPath=INPUT_DIR / "TestDirectory", + model="gpt-4o", + recursive=True, + quiet=True, + ) + assert isinstance(result, dict) + assert result == expected + + +def test_tokenize_files_with_directory(): + expected = load_answer("TestDirectory.json") + result = tc.TokenizeFiles(INPUT_DIR / "TestDirectory", model="gpt-4o", quiet=True) + assert isinstance(result, dict) + assert result == expected + + +def test_tokenize_files_multiple(): + files = [ + INPUT_DIR / "TestFile1.txt", + INPUT_DIR / "TestImg.jpg", + INPUT_DIR / "TestFile2.txt", + ] + answers = [load_answer("TestFile1.json"), load_answer("TestFile2.json")] + result = tc.TokenizeFiles(files, model="gpt-4o", quiet=True) + expected = { + "TestFile1.txt": {"tokens": answers[0]["tokens"]}, + "TestFile2.txt": {"tokens": answers[1]["tokens"]}, + } + assert isinstance(result, dict) + assert result == expected + + +def test_tokenize_files_exit_on_list_error_false(): + files = [ + INPUT_DIR / "TestFile1.txt", + INPUT_DIR / "TestImg.jpg", + INPUT_DIR / "TestFile2.txt", + ] + answers = [load_answer("TestFile1.json"), load_answer("TestFile2.json")] + result = tc.TokenizeFiles(files, model="gpt-4o", quiet=True, exitOnListError=False) + expected = { + "TestFile1.txt": {"tokens": answers[0]["tokens"]}, + 
"TestFile2.txt": {"tokens": answers[1]["tokens"]}, + } + assert isinstance(result, dict) + assert result == expected + + +def test_tokenize_directory_no_recursion(): + expected = load_answer("TestDirectoryNoRecursion.json") + result = tc.TokenizeDir( + dirPath=INPUT_DIR / "TestDirectory", + model="gpt-4o", + recursive=False, + quiet=True, + ) + assert isinstance(result, dict) + assert result == expected + # ensure subdirectories not included + for entry in (INPUT_DIR / "TestDirectory").iterdir(): + if entry.is_dir(): + assert entry.name not in result + + +def test_tokenize_files_with_invalid_input(): + with pytest.raises(TypeError): + tc.TokenizeFiles(67890) + + +def test_tokenize_files_list_quiet_false(capsys): + files = [ + INPUT_DIR / "TestFile1.txt", + INPUT_DIR / "TestImg.jpg", + INPUT_DIR / "TestFile2.txt", + ] + answers = [load_answer("TestFile1.json"), load_answer("TestFile2.json")] + result = None + captured = io.StringIO() + sys_stdout = sys.stdout + sys.stdout = captured + try: + result = tc.TokenizeFiles(files, model="gpt-4o", quiet=False) + finally: + sys.stdout = sys_stdout + output = captured.getvalue() + assert "Skipping binary file TestImg.jpg" in output + expected = { + "TestFile1.txt": {"tokens": answers[0]["tokens"]}, + "TestFile2.txt": {"tokens": answers[1]["tokens"]}, + } + assert isinstance(result, dict) + assert result == expected + + +import pytest + +@pytest.mark.parametrize( + "input_name,answer_name", + [ + ("TestFile1.txt", "TestFile1.json"), + ("TestFile2.txt", "TestFile2.json"), + ], +) +def test_tokenize_file(input_name, answer_name): + answer = load_answer(answer_name) + result = tc.TokenizeFile(INPUT_DIR / input_name, model="gpt-4o", quiet=True) + assert result == answer["tokens"] + count = tc.GetNumTokenFile(INPUT_DIR / input_name, model="gpt-4o", quiet=True) + assert count == answer["numTokens"] + + +def test_tokenize_file_error(): + with pytest.raises(tc.UnsupportedEncodingError): + tc.TokenizeFile(INPUT_DIR / "TestImg.jpg", 
model="gpt-4o", quiet=True) + + +def test_tokenize_file_error_type(): + with pytest.raises(TypeError): + tc.TokenizeFile(54321, model="gpt-4o", quiet=True) + + +def test_tokenize_file_with_unsupported_encoding(): + path = INPUT_DIR / "TestImg.jpg" + with pytest.raises(tc.UnsupportedEncodingError) as exc: + tc.TokenizeFile(filePath=path, model="gpt-4o", quiet=True) + assert str(path) in str(exc.value) + assert "encoding" in str(exc.value) + + +def test_tokenize_str(): + expected_strings = { + "Hail to the Victors!": [39, 663, 316, 290, 16566, 914, 0], + "2024 National Champions": [1323, 19, 6743, 40544], + "Corum 4 Heisman": [11534, 394, 220, 19, 1679, 107107], + } + for text, tokens in expected_strings.items(): + result = tc.TokenizeStr(string=text, model="gpt-4o", quiet=True) + assert result == tokens + count = tc.GetNumTokenStr(string=text, model="gpt-4o", quiet=True) + assert count == len(tokens) + diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 0000000..5f2266b --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,42 @@ +import json +from pathlib import Path +import PyTokenCounter as tc +from PyTokenCounter.encoding_utils import ReadTextFile +from PyTokenCounter.cli import ParseFiles + +TEST_DIR = Path(__file__).resolve().parent.parent / "Tests" +INPUT_DIR = TEST_DIR / "Input" +ANSWERS_DIR = TEST_DIR / "Answers" + + +def test_parse_files_glob(): + pattern = "Input/TestFile*.txt" + expected = { + "Input/TestFile1.txt", + "Input/TestFile2.txt", + } + result = set(ParseFiles([pattern])) + assert result == expected + + +def test_parse_files_glob_recursive(): + pattern = "Input/**/*.txt" + expected = { + "Input/TestFile1.txt", + "Input/TestFile2.txt", + "Input/TestDirectory/TestDir1.txt", + "Input/TestDirectory/TestDir2.txt", + "Input/TestDirectory/TestDir3.txt", + "Input/TestDirectory/TestSubDir/TestDir4.txt", + "Input/TestDirectory/TestSubDir/TestDir5.txt", + } + result = set(ParseFiles([pattern])) + assert result == expected + + 
+def test_read_text_file_windows1252(): + file_path = INPUT_DIR / "TestFile1252.txt" + expected = "Café – résumé naïve fiancé" + result = ReadTextFile(file_path) + assert result == expected + From d6284a76bf672ad1e9cae67414551d118f332ccb Mon Sep 17 00:00:00 2001 From: Kaden Gruizenga Date: Thu, 21 Aug 2025 22:29:12 -0400 Subject: [PATCH 4/6] fix(progress): prevent unfinished bars by matching totals to processed items Update _CountDirFiles to respect includeHidden/excludeBinary and pass flags in TokenizeDir/GetNumTokenDir. --- PyTokenCounter/file_tokens.py | 53 +++++++++++++++++++++++++++++------ 1 file changed, 44 insertions(+), 9 deletions(-) diff --git a/PyTokenCounter/file_tokens.py b/PyTokenCounter/file_tokens.py index 17d8be5..f211d2a 100644 --- a/PyTokenCounter/file_tokens.py +++ b/PyTokenCounter/file_tokens.py @@ -7,7 +7,13 @@ from .progress import _InitializeTask, _UpdateTask, _tasks from .core import BINARY_EXTENSIONS, TokenizeStr -def _CountDirFiles(dirPath: Path, recursive: bool = True) -> int: +def _CountDirFiles( + dirPath: Path, + recursive: bool = True, + *, + includeHidden: bool = False, + excludeBinary: bool = True, +) -> int: """ Count the number of files in a directory. @@ -20,6 +26,10 @@ def _CountDirFiles(dirPath: Path, recursive: bool = True) -> int: The path to the directory in which to count files. recursive : bool, optional Whether to count files in subdirectories recursively (default is True). + includeHidden : bool, optional + Whether to include hidden files and directories (default is False). + excludeBinary : bool, optional + Whether to exclude binary files based on extension (default is True). 
Returns ------- @@ -42,17 +52,33 @@ def _CountDirFiles(dirPath: Path, recursive: bool = True) -> int: for entry in dirPath.iterdir(): - if entry.is_dir(): + # Skip hidden files and directories entirely if not including hidden + if not includeHidden and entry.name.startswith("."): + continue - numFiles += _CountDirFiles(entry, recursive=recursive) + if entry.is_dir(): + # Recurse into subdirectories (respect hidden handling) + numFiles += _CountDirFiles( + entry, + recursive=recursive, + includeHidden=includeHidden, + excludeBinary=excludeBinary, + ) else: - + # Optionally skip binary files + if excludeBinary and entry.suffix.lower() in BINARY_EXTENSIONS: + continue numFiles += 1 else: - - numFiles = sum(1 for entry in dirPath.iterdir() if entry.is_file()) + for entry in dirPath.iterdir(): + if entry.is_file(): + if not includeHidden and entry.name.startswith("."): + continue + if excludeBinary and entry.suffix.lower() in BINARY_EXTENSIONS: + continue + numFiles += 1 return numFiles @@ -519,7 +545,12 @@ def TokenizeDir( raise ValueError(f'Given directory path "{dirPath}" is not a directory.') - numFiles = _CountDirFiles(dirPath=dirPath, recursive=recursive) + numFiles = _CountDirFiles( + dirPath=dirPath, + recursive=recursive, + includeHidden=includeHidden, + excludeBinary=excludeBinary, + ) if not quiet: @@ -748,7 +779,12 @@ def GetNumTokenDir( raise ValueError(f'Given path "{dirPath}" is not a directory.') - numFiles = _CountDirFiles(dirPath=dirPath, recursive=recursive) + numFiles = _CountDirFiles( + dirPath=dirPath, + recursive=recursive, + includeHidden=includeHidden, + excludeBinary=excludeBinary, + ) if not quiet: @@ -1478,4 +1514,3 @@ def GetNumTokenFiles( raise RuntimeError( f'Unexpected error. Given inputPath "{inputPath}" is neither a file, a directory, nor a list.' 
) - From c62db548a41751731236820f94bb346c1f2b5f02 Mon Sep 17 00:00:00 2001 From: Kaden Gruizenga Date: Thu, 21 Aug 2025 22:37:52 -0400 Subject: [PATCH 5/6] fix(cli): prevent error filepath wrapping by printing path outside panel and remove fixed console width --- PyTokenCounter/encoding_utils.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/PyTokenCounter/encoding_utils.py b/PyTokenCounter/encoding_utils.py index f810f56..7f9bd4f 100644 --- a/PyTokenCounter/encoding_utils.py +++ b/PyTokenCounter/encoding_utils.py @@ -44,19 +44,26 @@ def __init__( errorText.append("Detected encoding: ", style="green") errorText.append(f"{encoding}", style="bold") errorText.append("\n") - errorText.append("File path: ", style="green") - errorText.append(f"{filePath}", style="bold blue") + # Intentionally do not include file path inside the panel to avoid line wrapping issues panel = Panel( errorText, title="Encoding Error", title_align="left", border_style="red" ) - console = Console(width=80, color_system="truecolor", record=True) + console = Console(color_system="truecolor", record=True) with console.capture() as capture: console.print("") # Add a new line before the panel console.print(panel) + console.print("") + + # Print the file path outside the panel and prevent wrapping so it remains clickable + pathText = Text() + pathText.append("File path: ", style="green") + pathText.append(f"{filePath}", style="bold blue") + pathText.no_wrap = True + console.print(pathText) captured = capture.get() # Store the formatted panel; pass a plain message to the base Exception From 2b76fd6987c9330620a7f077476efaf7cdbb2419 Mon Sep 17 00:00:00 2001 From: Kaden Gruizenga Date: Thu, 21 Aug 2025 22:43:55 -0400 Subject: [PATCH 6/6] chore(release): bump version to v1.8.0 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index db3d81a..ddd2178 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 
+1,6 @@ [project] name = "PyTokenCounter" -version = "1.7.0" +version = "1.8.0" description = "A Python library for tokenizing text and counting tokens using various encoding schemes." readme = {file = "README.md", content-type = "text/markdown"} requires-python = ">=3.11"