From f5eba4b9cab86f22b0c51d570d38c0fac678a1bc Mon Sep 17 00:00:00 2001
From: kgruiz
Date: Tue, 20 May 2025 18:32:48 -0400
Subject: [PATCH] Convert tests to pytest and add CI

---
 .github/workflows/ci.yml   |  55 ++++++++++++
 README.md                  |  10 +++
 pyproject.toml             |   4 +-
 pytest.ini                 |   4 +
 tests/test_models.py       | 150 +++++++++++++++++++++++++++++++++
 tests/test_tokenization.py | 152 ++++++++++++++++++++++++++++++++++++
 tests/test_utils.py        |  40 ++++++++++
 7 files changed, 415 insertions(+), 1 deletion(-)
 create mode 100644 .github/workflows/ci.yml
 create mode 100644 pytest.ini
 create mode 100644 tests/test_models.py
 create mode 100644 tests/test_tokenization.py
 create mode 100644 tests/test_utils.py

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..23c49e1
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,55 @@
+name: CI
+
+on:
+  push:
+    branches: [ main ]
+    paths-ignore:
+      - '**/*.md'
+  pull_request:
+    paths-ignore:
+      - '**/*.md'
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.x'
+      - name: Detect comment-only changes
+        id: changes
+        run: |
+          # Only pull requests have a base ref to diff against; on push
+          # events (e.g. merges to main) github.base_ref is empty, so
+          # always run the full checks there.
+          if [ "${{ github.event_name }}" != "pull_request" ]; then
+            echo "run=true" >> $GITHUB_OUTPUT
+            exit 0
+          fi
+          git fetch --depth=2 origin ${{ github.base_ref }}
+          CHANGED=$(git diff --name-only HEAD origin/${{ github.base_ref }} -- '*.py')
+          if [ -z "$CHANGED" ]; then
+            echo "run=false" >> $GITHUB_OUTPUT
+          else
+            # grep exits non-zero when nothing matches; '|| true' keeps a
+            # comment-only diff from killing the step under 'bash -e'.
+            DIFF=$(git diff origin/${{ github.base_ref }} HEAD -- '*.py' | grep '^+' | grep -v '+++' | grep -v '^+\s*#' || true)
+            if [ -z "$DIFF" ]; then
+              echo "run=false" >> $GITHUB_OUTPUT
+            else
+              echo "run=true" >> $GITHUB_OUTPUT
+            fi
+          fi
+      - name: Install dependencies
+        if: steps.changes.outputs.run == 'true'
+        run: |
+          python -m pip install --upgrade pip
+          pip install -e .
+          pip install flake8
+      - name: Run flake8
+        if: steps.changes.outputs.run == 'true'
+        run: flake8 . --max-line-length=100
+      - name: Run pytest
+        if: steps.changes.outputs.run == 'true'
+        run: pytest
diff --git a/README.md b/README.md
index ff0fcc4..ac434c7 100644
--- a/README.md
+++ b/README.md
@@ -1070,6 +1070,16 @@ Along with ignoring the extensions in the exclude list to quickly bypass known f
 
 ---
 
+## Running Tests
+
+After installing the package's dependencies, run the test suite with `pytest`:
+
+```bash
+pytest
+```
+
+The tests rely on the sample data located in the `Tests/` directory.
+
 ## Maintainers
 
 - [Kaden Gruizenga](https://github.com/kgruiz)
diff --git a/pyproject.toml b/pyproject.toml
index 782b15c..db3d81a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -13,6 +13,8 @@ dependencies = [
     "rich>=13.9.4",
     "chardet>=5.2.0",
     "colorlog>=6.9.0",
+    "pytest>=7.4.0",
+    "flake8>=6.1.0",
 ]
 
 classifiers = [
@@ -62,4 +64,4 @@ requires = ["setuptools>=43.0.0", "wheel"]
 build-backend = "setuptools.build_meta"
 
 [tool.setuptools.packages.find]
-exclude = ["Tests"]
\ No newline at end of file
+exclude = ["Tests"]
diff --git a/pytest.ini b/pytest.ini
new file mode 100644
index 0000000..52387a2
--- /dev/null
+++ b/pytest.ini
@@ -0,0 +1,4 @@
+[pytest]
+addopts = -v
+# pytest>=7 ini key; makes the repo root importable without installing.
+pythonpath = .
diff --git a/tests/test_models.py b/tests/test_models.py
new file mode 100644
index 0000000..f7ce786
--- /dev/null
+++ b/tests/test_models.py
@@ -0,0 +1,150 @@
+from pathlib import Path
+import pytest
+import tiktoken
+import PyTokenCounter as tc
+
+TEST_DIR = Path(__file__).resolve().parent.parent / "Tests"
+ANSWERS_DIR = TEST_DIR / "Answers"
+
+
+def test_get_model_mappings():
+    expected = {
+        "gpt-4o": "o200k_base",
+        "gpt-4o-mini": "o200k_base",
+        "gpt-4-turbo": "cl100k_base",
+        "gpt-4": "cl100k_base",
+        "gpt-3.5-turbo": "cl100k_base",
+        "text-embedding-ada-002": "cl100k_base",
+        "text-embedding-3-small": "cl100k_base",
+        "text-embedding-3-large": "cl100k_base",
+        "Codex models": "p50k_base",
+        "text-davinci-002": "p50k_base",
+        "text-davinci-003": "p50k_base",
+        "GPT-3 models like davinci": "r50k_base",
+    }
+    assert tc.GetModelMappings() == expected
+
+
+def test_get_valid_models():
+    expected = [
+        "gpt-4o",
+        "gpt-4o-mini",
+        "gpt-4-turbo",
+        "gpt-4",
+        "gpt-3.5-turbo",
+        "text-embedding-ada-002",
+        "text-embedding-3-small",
+        "text-embedding-3-large",
+        "Codex models",
+        "text-davinci-002",
+        "text-davinci-003",
+        "GPT-3 models like davinci",
+    ]
+    assert set(tc.GetValidModels()) == set(expected)
+
+
+def test_get_valid_encodings():
+    expected = ["o200k_base", "cl100k_base", "p50k_base", "r50k_base"]
+    assert set(tc.GetValidEncodings()) == set(expected)
+
+
+def test_get_model_for_encoding():
+    mapping = {
+        "o200k_base": ["gpt-4o", "gpt-4o-mini"],
+        "cl100k_base": [
+            "gpt-3.5-turbo",
+            "gpt-4",
+            "gpt-4-turbo",
+            "text-embedding-3-large",
+            "text-embedding-3-small",
+            "text-embedding-ada-002",
+        ],
+        "p50k_base": ["Codex models", "text-davinci-002", "text-davinci-003"],
+        "r50k_base": "GPT-3 models like davinci",
+    }
+    for name, expected in mapping.items():
+        encoding = tiktoken.get_encoding(encoding_name=name)
+        result = tc.GetModelForEncoding(encoding=encoding)
+        if isinstance(expected, list):
+            assert sorted(result) == sorted(expected)
+        else:
+            assert result == expected
+
+
+def test_get_model_for_encoding_name():
+    mapping = {
+        "o200k_base": ["gpt-4o", "gpt-4o-mini"],
+        "cl100k_base": [
+            "gpt-3.5-turbo",
+            "gpt-4",
+            "gpt-4-turbo",
+            "text-embedding-3-large",
+            "text-embedding-3-small",
+            "text-embedding-ada-002",
+        ],
+        "p50k_base": ["Codex models", "text-davinci-002", "text-davinci-003"],
+        "r50k_base": "GPT-3 models like davinci",
+    }
+    for name, expected in mapping.items():
+        result = tc.GetModelForEncodingName(encodingName=name)
+        if isinstance(expected, list):
+            assert sorted(result) == sorted(expected)
+        else:
+            assert result == expected
+
+
+def test_get_encoding_for_model():
+    mapping = {
+        "gpt-4o": "o200k_base",
+        "gpt-4o-mini": "o200k_base",
+        "gpt-4-turbo": "cl100k_base",
+        "gpt-4": "cl100k_base",
+        "gpt-3.5-turbo": "cl100k_base",
+        "text-embedding-ada-002": "cl100k_base",
+        "text-embedding-3-small": "cl100k_base",
+        "text-embedding-3-large": "cl100k_base",
+        "text-davinci-002": "p50k_base",
+        "text-davinci-003": "p50k_base",
+    }
+    for model, encoding_name in mapping.items():
+        result = tc.GetEncodingForModel(modelName=model)
+        assert result.name == encoding_name
+
+
+def test_get_encoding_name_for_model():
+    mapping = {
+        "gpt-4o": "o200k_base",
+        "gpt-4o-mini": "o200k_base",
+        "gpt-4-turbo": "cl100k_base",
+        "gpt-4": "cl100k_base",
+        "gpt-3.5-turbo": "cl100k_base",
+        "text-embedding-ada-002": "cl100k_base",
+        "text-embedding-3-small": "cl100k_base",
+        "text-embedding-3-large": "cl100k_base",
+        "Codex models": "p50k_base",
+        "text-davinci-002": "p50k_base",
+        "text-davinci-003": "p50k_base",
+        "GPT-3 models like davinci": "r50k_base",
+    }
+    for model, encoding_name in mapping.items():
+        result = tc.GetEncodingNameForModel(modelName=model)
+        assert result == encoding_name
+
+
+def test_get_encoding():
+    encoding = tc.GetEncoding(model="gpt-3.5-turbo")
+    assert encoding.name == "cl100k_base"
+
+    encoding = tc.GetEncoding(encodingName="p50k_base")
+    assert encoding.name == "p50k_base"
+
+    encoding = tc.GetEncoding(model="gpt-4-turbo", encodingName="cl100k_base")
+    assert encoding.name == "cl100k_base"
+
+    with pytest.raises(ValueError):
+        tc.GetEncoding(model="gpt-3.5-turbo", encodingName="p50k_base")
+
+
+def test_get_encoding_error():
+    with pytest.raises(ValueError):
+        tc.GetEncoding()
diff --git a/tests/test_tokenization.py b/tests/test_tokenization.py
new file mode 100644
index 0000000..5035681
--- /dev/null
+++ b/tests/test_tokenization.py
@@ -0,0 +1,152 @@
+import json
+from pathlib import Path
+
+import pytest
+
+import PyTokenCounter as tc
+
+# Paths to test resources
+TEST_DIR = Path(__file__).resolve().parent.parent / "Tests"
+INPUT_DIR = TEST_DIR / "Input"
+ANSWERS_DIR = TEST_DIR / "Answers"
+
+
+def load_answer(name: str):
+    with (ANSWERS_DIR / name).open("r") as f:
+        return json.load(f)
+
+
+def test_tokenize_directory():
+    expected = load_answer("TestDirectory.json")
+    result = tc.TokenizeDir(
+        dirPath=INPUT_DIR / "TestDirectory",
+        model="gpt-4o",
+        recursive=True,
+        quiet=True,
+    )
+    assert isinstance(result, dict)
+    assert result == expected
+
+
+def test_tokenize_files_with_directory():
+    expected = load_answer("TestDirectory.json")
+    result = tc.TokenizeFiles(INPUT_DIR / "TestDirectory", model="gpt-4o", quiet=True)
+    assert isinstance(result, dict)
+    assert result == expected
+
+
+def test_tokenize_files_multiple():
+    files = [
+        INPUT_DIR / "TestFile1.txt",
+        INPUT_DIR / "TestImg.jpg",
+        INPUT_DIR / "TestFile2.txt",
+    ]
+    answers = [load_answer("TestFile1.json"), load_answer("TestFile2.json")]
+    result = tc.TokenizeFiles(files, model="gpt-4o", quiet=True)
+    expected = {
+        "TestFile1.txt": {"tokens": answers[0]["tokens"]},
+        "TestFile2.txt": {"tokens": answers[1]["tokens"]},
+    }
+    assert isinstance(result, dict)
+    assert result == expected
+
+
+def test_tokenize_files_exit_on_list_error_false():
+    files = [
+        INPUT_DIR / "TestFile1.txt",
+        INPUT_DIR / "TestImg.jpg",
+        INPUT_DIR / "TestFile2.txt",
+    ]
+    answers = [load_answer("TestFile1.json"), load_answer("TestFile2.json")]
+    result = tc.TokenizeFiles(files, model="gpt-4o", quiet=True, exitOnListError=False)
+    expected = {
+        "TestFile1.txt": {"tokens": answers[0]["tokens"]},
+        "TestFile2.txt": {"tokens": answers[1]["tokens"]},
+    }
+    assert isinstance(result, dict)
+    assert result == expected
+
+
+def test_tokenize_directory_no_recursion():
+    expected = load_answer("TestDirectoryNoRecursion.json")
+    result = tc.TokenizeDir(
+        dirPath=INPUT_DIR / "TestDirectory",
+        model="gpt-4o",
+        recursive=False,
+        quiet=True,
+    )
+    assert isinstance(result, dict)
+    assert result == expected
+    # ensure subdirectories not included
+    for entry in (INPUT_DIR / "TestDirectory").iterdir():
+        if entry.is_dir():
+            assert entry.name not in result
+
+
+def test_tokenize_files_with_invalid_input():
+    with pytest.raises(TypeError):
+        tc.TokenizeFiles(67890)
+
+
+def test_tokenize_files_list_quiet_false(capsys):
+    files = [
+        INPUT_DIR / "TestFile1.txt",
+        INPUT_DIR / "TestImg.jpg",
+        INPUT_DIR / "TestFile2.txt",
+    ]
+    answers = [load_answer("TestFile1.json"), load_answer("TestFile2.json")]
+    result = tc.TokenizeFiles(files, model="gpt-4o", quiet=False)
+    output = capsys.readouterr().out
+    assert "Skipping binary file TestImg.jpg" in output
+    expected = {
+        "TestFile1.txt": {"tokens": answers[0]["tokens"]},
+        "TestFile2.txt": {"tokens": answers[1]["tokens"]},
+    }
+    assert isinstance(result, dict)
+    assert result == expected
+
+
+@pytest.mark.parametrize(
+    "input_name,answer_name",
+    [
+        ("TestFile1.txt", "TestFile1.json"),
+        ("TestFile2.txt", "TestFile2.json"),
+    ],
+)
+def test_tokenize_file(input_name, answer_name):
+    answer = load_answer(answer_name)
+    result = tc.TokenizeFile(INPUT_DIR / input_name, model="gpt-4o", quiet=True)
+    assert result == answer["tokens"]
+    count = tc.GetNumTokenFile(INPUT_DIR / input_name, model="gpt-4o", quiet=True)
+    assert count == answer["numTokens"]
+
+
+def test_tokenize_file_error():
+    with pytest.raises(tc.UnsupportedEncodingError):
+        tc.TokenizeFile(INPUT_DIR / "TestImg.jpg", model="gpt-4o", quiet=True)
+
+
+def test_tokenize_file_error_type():
+    with pytest.raises(TypeError):
+        tc.TokenizeFile(54321, model="gpt-4o", quiet=True)
+
+
+def test_tokenize_file_with_unsupported_encoding():
+    path = INPUT_DIR / "TestImg.jpg"
+    with pytest.raises(tc.UnsupportedEncodingError) as exc:
+        tc.TokenizeFile(filePath=path, model="gpt-4o", quiet=True)
+    assert str(path) in str(exc.value)
+    assert "encoding" in str(exc.value)
+
+
+def test_tokenize_str():
+    expected_strings = {
+        "Hail to the Victors!": [39, 663, 316, 290, 16566, 914, 0],
+        "2024 National Champions": [1323, 19, 6743, 40544],
+        "Corum 4 Heisman": [11534, 394, 220, 19, 1679, 107107],
+    }
+    for text, tokens in expected_strings.items():
+        result = tc.TokenizeStr(string=text, model="gpt-4o", quiet=True)
+        assert result == tokens
+        count = tc.GetNumTokenStr(string=text, model="gpt-4o", quiet=True)
+        assert count == len(tokens)
diff --git a/tests/test_utils.py b/tests/test_utils.py
new file mode 100644
index 0000000..5f2266b
--- /dev/null
+++ b/tests/test_utils.py
@@ -0,0 +1,40 @@
+from pathlib import Path
+
+from PyTokenCounter.cli import ParseFiles
+from PyTokenCounter.encoding_utils import ReadTextFile
+
+TEST_DIR = Path(__file__).resolve().parent.parent / "Tests"
+INPUT_DIR = TEST_DIR / "Input"
+ANSWERS_DIR = TEST_DIR / "Answers"
+
+
+def test_parse_files_glob():
+    pattern = "Input/TestFile*.txt"
+    expected = {
+        "Input/TestFile1.txt",
+        "Input/TestFile2.txt",
+    }
+    result = set(ParseFiles([pattern]))
+    assert result == expected
+
+
+def test_parse_files_glob_recursive():
+    pattern = "Input/**/*.txt"
+    expected = {
+        "Input/TestFile1.txt",
+        "Input/TestFile2.txt",
+        "Input/TestDirectory/TestDir1.txt",
+        "Input/TestDirectory/TestDir2.txt",
+        "Input/TestDirectory/TestDir3.txt",
+        "Input/TestDirectory/TestSubDir/TestDir4.txt",
+        "Input/TestDirectory/TestSubDir/TestDir5.txt",
+    }
+    result = set(ParseFiles([pattern]))
+    assert result == expected
+
+
+def test_read_text_file_windows1252():
+    file_path = INPUT_DIR / "TestFile1252.txt"
+    expected = "Café – résumé naïve fiancé"
+    result = ReadTextFile(file_path)
+    assert result == expected