From f5eba4b9cab86f22b0c51d570d38c0fac678a1bc Mon Sep 17 00:00:00 2001
From: kgruiz
Date: Tue, 20 May 2025 18:32:48 -0400
Subject: [PATCH] Convert tests to pytest and add CI

---
 .github/workflows/ci.yml   |  55 ++++++++++++
 README.md                  |  10 +++
 pyproject.toml             |   4 +-
 pytest.ini                 |   4 +
 tests/test_models.py       | 150 +++++++++++++++++++++++++++++++++
 tests/test_tokenization.py | 152 ++++++++++++++++++++++++++++++++++++
 tests/test_utils.py        |  40 ++++++++++
 7 files changed, 415 insertions(+), 1 deletion(-)
 create mode 100644 .github/workflows/ci.yml
 create mode 100644 pytest.ini
 create mode 100644 tests/test_models.py
 create mode 100644 tests/test_tokenization.py
 create mode 100644 tests/test_utils.py

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..23c49e1
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,55 @@
+name: CI
+
+on:
+  push:
+    branches: [ main ]
+    paths-ignore:
+      - '**/*.md'
+  pull_request:
+    paths-ignore:
+      - '**/*.md'
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.x'
+      - name: Detect comment-only changes
+        id: changes
+        run: |
+          # Only pull requests have a base ref to diff against; on push
+          # events (e.g. merges to main) github.base_ref is empty, so
+          # always run the full checks there.
+          if [ "${{ github.event_name }}" != "pull_request" ]; then
+            echo "run=true" >> $GITHUB_OUTPUT
+            exit 0
+          fi
+          git fetch --depth=2 origin ${{ github.base_ref }}
+          CHANGED=$(git diff --name-only HEAD origin/${{ github.base_ref }} -- '*.py')
+          if [ -z "$CHANGED" ]; then
+            echo "run=false" >> $GITHUB_OUTPUT
+          else
+            # grep exits non-zero when nothing matches; '|| true' keeps a
+            # comment-only diff from killing the step under 'bash -e'.
+            DIFF=$(git diff origin/${{ github.base_ref }} HEAD -- '*.py' | grep '^+' | grep -v '+++' | grep -v '^+\s*#' || true)
+            if [ -z "$DIFF" ]; then
+              echo "run=false" >> $GITHUB_OUTPUT
+            else
+              echo "run=true" >> $GITHUB_OUTPUT
+            fi
+          fi
+      - name: Install dependencies
+        if: steps.changes.outputs.run == 'true'
+        run: |
+          python -m pip install --upgrade pip
+          pip install -e .
+          pip install flake8
+      - name: Run flake8
+        if: steps.changes.outputs.run == 'true'
+        run: flake8 . --max-line-length=100
+      - name: Run pytest
+        if: steps.changes.outputs.run == 'true'
+        run: pytest
diff --git a/README.md b/README.md
index ff0fcc4..ac434c7 100644
--- a/README.md
+++ b/README.md
@@ -1070,6 +1070,16 @@ Along with ignoring the extensions in the exclude list to quickly bypass known f
 
 ---
 
+## Running Tests
+
+After installing the package's dependencies, run the test suite with `pytest`:
+
+```bash
+pytest
+```
+
+The tests rely on the sample data located in the `Tests/` directory.
+
 ## Maintainers
 
 - [Kaden Gruizenga](https://github.com/kgruiz)
diff --git a/pyproject.toml b/pyproject.toml
index 782b15c..db3d81a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -13,6 +13,8 @@ dependencies = [
     "rich>=13.9.4",
     "chardet>=5.2.0",
     "colorlog>=6.9.0",
+    "pytest>=7.4.0",
+    "flake8>=6.1.0",
 ]
 
 classifiers = [
@@ -62,4 +64,4 @@ requires = ["setuptools>=43.0.0", "wheel"]
 build-backend = "setuptools.build_meta"
 
 [tool.setuptools.packages.find]
-exclude = ["Tests"]
\ No newline at end of file
+exclude = ["Tests"]
diff --git a/pytest.ini b/pytest.ini
new file mode 100644
index 0000000..52387a2
--- /dev/null
+++ b/pytest.ini
@@ -0,0 +1,4 @@
+[pytest]
+addopts = -v
+# pytest>=7 ini key; makes the repo root importable without installing.
+pythonpath = .
diff --git a/tests/test_models.py b/tests/test_models.py
new file mode 100644
index 0000000..f7ce786
--- /dev/null
+++ b/tests/test_models.py
@@ -0,0 +1,150 @@
+from pathlib import Path
+import pytest
+import tiktoken
+import PyTokenCounter as tc
+
+TEST_DIR = Path(__file__).resolve().parent.parent / "Tests"
+ANSWERS_DIR = TEST_DIR / "Answers"
+
+
+def test_get_model_mappings():
+    expected = {
+        "gpt-4o": "o200k_base",
+        "gpt-4o-mini": "o200k_base",
+        "gpt-4-turbo": "cl100k_base",
+        "gpt-4": "cl100k_base",
+        "gpt-3.5-turbo": "cl100k_base",
+        "text-embedding-ada-002": "cl100k_base",
+        "text-embedding-3-small": "cl100k_base",
+        "text-embedding-3-large": "cl100k_base",
+        "Codex models": "p50k_base",
+        "text-davinci-002": "p50k_base",
+        "text-davinci-003": "p50k_base",
+        "GPT-3 models like davinci": "r50k_base",
+    }
+    assert tc.GetModelMappings() == expected
+
+
+def test_get_valid_models():
+    expected = [
+        "gpt-4o",
+        "gpt-4o-mini",
+        "gpt-4-turbo",
+        "gpt-4",
+        "gpt-3.5-turbo",
+        "text-embedding-ada-002",
+        "text-embedding-3-small",
+        "text-embedding-3-large",
+        "Codex models",
+        "text-davinci-002",
+        "text-davinci-003",
+        "GPT-3 models like davinci",
+    ]
+    assert set(tc.GetValidModels()) == set(expected)
+
+
+def test_get_valid_encodings():
+    expected = ["o200k_base", "cl100k_base", "p50k_base", "r50k_base"]
+    assert set(tc.GetValidEncodings()) == set(expected)
+
+
+def test_get_model_for_encoding():
+    mapping = {
+        "o200k_base": ["gpt-4o", "gpt-4o-mini"],
+        "cl100k_base": [
+            "gpt-3.5-turbo",
+            "gpt-4",
+            "gpt-4-turbo",
+            "text-embedding-3-large",
+            "text-embedding-3-small",
+            "text-embedding-ada-002",
+        ],
+        "p50k_base": ["Codex models", "text-davinci-002", "text-davinci-003"],
+        "r50k_base": "GPT-3 models like davinci",
+    }
+    for name, expected in mapping.items():
+        encoding = tiktoken.get_encoding(encoding_name=name)
+        result = tc.GetModelForEncoding(encoding=encoding)
+        if isinstance(expected, list):
+            assert sorted(result) == sorted(expected)
+        else:
+            assert result == expected
+
+
+def test_get_model_for_encoding_name():
+    mapping = {
+        "o200k_base": ["gpt-4o", "gpt-4o-mini"],
+        "cl100k_base": [
+            "gpt-3.5-turbo",
+            "gpt-4",
+            "gpt-4-turbo",
+            "text-embedding-3-large",
+            "text-embedding-3-small",
+            "text-embedding-ada-002",
+        ],
+        "p50k_base": ["Codex models", "text-davinci-002", "text-davinci-003"],
+        "r50k_base": "GPT-3 models like davinci",
+    }
+    for name, expected in mapping.items():
+        result = tc.GetModelForEncodingName(encodingName=name)
+        if isinstance(expected, list):
+            assert sorted(result) == sorted(expected)
+        else:
+            assert result == expected
+
+
+def test_get_encoding_for_model():
+    mapping = {
+        "gpt-4o": "o200k_base",
+        "gpt-4o-mini": "o200k_base",
+        "gpt-4-turbo": "cl100k_base",
+        "gpt-4": "cl100k_base",
+        "gpt-3.5-turbo": "cl100k_base",
+        "text-embedding-ada-002": "cl100k_base",
+        "text-embedding-3-small": "cl100k_base",
+        "text-embedding-3-large": "cl100k_base",
+        "text-davinci-002": "p50k_base",
+        "text-davinci-003": "p50k_base",
+    }
+    for model, encoding_name in mapping.items():
+        result = tc.GetEncodingForModel(modelName=model)
+        assert result.name == encoding_name
+
+
+def test_get_encoding_name_for_model():
+    mapping = {
+        "gpt-4o": "o200k_base",
+        "gpt-4o-mini": "o200k_base",
+        "gpt-4-turbo": "cl100k_base",
+        "gpt-4": "cl100k_base",
+        "gpt-3.5-turbo": "cl100k_base",
+        "text-embedding-ada-002": "cl100k_base",
+        "text-embedding-3-small": "cl100k_base",
+        "text-embedding-3-large": "cl100k_base",
+        "Codex models": "p50k_base",
+        "text-davinci-002": "p50k_base",
+        "text-davinci-003": "p50k_base",
+        "GPT-3 models like davinci": "r50k_base",
+    }
+    for model, encoding_name in mapping.items():
+        result = tc.GetEncodingNameForModel(modelName=model)
+        assert result == encoding_name
+
+
+def test_get_encoding():
+    encoding = tc.GetEncoding(model="gpt-3.5-turbo")
+    assert encoding.name == "cl100k_base"
+
+    encoding = tc.GetEncoding(encodingName="p50k_base")
+    assert encoding.name == "p50k_base"
+
+    encoding = tc.GetEncoding(model="gpt-4-turbo", encodingName="cl100k_base")
+    assert encoding.name == "cl100k_base"
+
+    with pytest.raises(ValueError):
+        tc.GetEncoding(model="gpt-3.5-turbo", encodingName="p50k_base")
+
+
+def test_get_encoding_error():
+    with pytest.raises(ValueError):
+        tc.GetEncoding()
diff --git a/tests/test_tokenization.py b/tests/test_tokenization.py
new file mode 100644
index 0000000..5035681
--- /dev/null
+++ b/tests/test_tokenization.py
@@ -0,0 +1,152 @@
+import json
+from pathlib import Path
+
+import pytest
+
+import PyTokenCounter as tc
+
+# Paths to test resources
+TEST_DIR = Path(__file__).resolve().parent.parent / "Tests"
+INPUT_DIR = TEST_DIR / "Input"
+ANSWERS_DIR = TEST_DIR / "Answers"
+
+
+def load_answer(name: str):
+    with (ANSWERS_DIR / name).open("r") as f:
+        return json.load(f)
+
+
+def test_tokenize_directory():
+    expected = load_answer("TestDirectory.json")
+    result = tc.TokenizeDir(
+        dirPath=INPUT_DIR / "TestDirectory",
+        model="gpt-4o",
+        recursive=True,
+        quiet=True,
+    )
+    assert isinstance(result, dict)
+    assert result == expected
+
+
+def test_tokenize_files_with_directory():
+    expected = load_answer("TestDirectory.json")
+    result = tc.TokenizeFiles(INPUT_DIR / "TestDirectory", model="gpt-4o", quiet=True)
+    assert isinstance(result, dict)
+    assert result == expected
+
+
+def test_tokenize_files_multiple():
+    files = [
+        INPUT_DIR / "TestFile1.txt",
+        INPUT_DIR / "TestImg.jpg",
+        INPUT_DIR / "TestFile2.txt",
+    ]
+    answers = [load_answer("TestFile1.json"), load_answer("TestFile2.json")]
+    result = tc.TokenizeFiles(files, model="gpt-4o", quiet=True)
+    expected = {
+        "TestFile1.txt": {"tokens": answers[0]["tokens"]},
+        "TestFile2.txt": {"tokens": answers[1]["tokens"]},
+    }
+    assert isinstance(result, dict)
+    assert result == expected
+
+
+def test_tokenize_files_exit_on_list_error_false():
+    files = [
+        INPUT_DIR / "TestFile1.txt",
+        INPUT_DIR / "TestImg.jpg",
+        INPUT_DIR / "TestFile2.txt",
+    ]
+    answers = [load_answer("TestFile1.json"), load_answer("TestFile2.json")]
+    result = tc.TokenizeFiles(files, model="gpt-4o", quiet=True, exitOnListError=False)
+    expected = {
+        "TestFile1.txt": {"tokens": answers[0]["tokens"]},
+        "TestFile2.txt": {"tokens": answers[1]["tokens"]},
+    }
+    assert isinstance(result, dict)
+    assert result == expected
+
+
+def test_tokenize_directory_no_recursion():
+    expected = load_answer("TestDirectoryNoRecursion.json")
+    result = tc.TokenizeDir(
+        dirPath=INPUT_DIR / "TestDirectory",
+        model="gpt-4o",
+        recursive=False,
+        quiet=True,
+    )
+    assert isinstance(result, dict)
+    assert result == expected
+    # ensure subdirectories not included
+    for entry in (INPUT_DIR / "TestDirectory").iterdir():
+        if entry.is_dir():
+            assert entry.name not in result
+
+
+def test_tokenize_files_with_invalid_input():
+    with pytest.raises(TypeError):
+        tc.TokenizeFiles(67890)
+
+
+def test_tokenize_files_list_quiet_false(capsys):
+    files = [
+        INPUT_DIR / "TestFile1.txt",
+        INPUT_DIR / "TestImg.jpg",
+        INPUT_DIR / "TestFile2.txt",
+    ]
+    answers = [load_answer("TestFile1.json"), load_answer("TestFile2.json")]
+    result = tc.TokenizeFiles(files, model="gpt-4o", quiet=False)
+    output = capsys.readouterr().out
+    assert "Skipping binary file TestImg.jpg" in output
+    expected = {
+        "TestFile1.txt": {"tokens": answers[0]["tokens"]},
+        "TestFile2.txt": {"tokens": answers[1]["tokens"]},
+    }
+    assert isinstance(result, dict)
+    assert result == expected
+
+
+@pytest.mark.parametrize(
+    "input_name,answer_name",
+    [
+        ("TestFile1.txt", "TestFile1.json"),
+        ("TestFile2.txt", "TestFile2.json"),
+    ],
+)
+def test_tokenize_file(input_name, answer_name):
+    answer = load_answer(answer_name)
+    result = tc.TokenizeFile(INPUT_DIR / input_name, model="gpt-4o", quiet=True)
+    assert result == answer["tokens"]
+    count = tc.GetNumTokenFile(INPUT_DIR / input_name, model="gpt-4o", quiet=True)
+    assert count == answer["numTokens"]
+
+
+def test_tokenize_file_error():
+    with pytest.raises(tc.UnsupportedEncodingError):
+        tc.TokenizeFile(INPUT_DIR / "TestImg.jpg", model="gpt-4o", quiet=True)
+
+
+def test_tokenize_file_error_type():
+    with pytest.raises(TypeError):
+        tc.TokenizeFile(54321, model="gpt-4o", quiet=True)
+
+
+def test_tokenize_file_with_unsupported_encoding():
+    path = INPUT_DIR / "TestImg.jpg"
+    with pytest.raises(tc.UnsupportedEncodingError) as exc:
+        tc.TokenizeFile(filePath=path, model="gpt-4o", quiet=True)
+    assert str(path) in str(exc.value)
+    assert "encoding" in str(exc.value)
+
+
+def test_tokenize_str():
+    expected_strings = {
+        "Hail to the Victors!": [39, 663, 316, 290, 16566, 914, 0],
+        "2024 National Champions": [1323, 19, 6743, 40544],
+        "Corum 4 Heisman": [11534, 394, 220, 19, 1679, 107107],
+    }
+    for text, tokens in expected_strings.items():
+        result = tc.TokenizeStr(string=text, model="gpt-4o", quiet=True)
+        assert result == tokens
+        count = tc.GetNumTokenStr(string=text, model="gpt-4o", quiet=True)
+        assert count == len(tokens)
diff --git a/tests/test_utils.py b/tests/test_utils.py
new file mode 100644
index 0000000..5f2266b
--- /dev/null
+++ b/tests/test_utils.py
@@ -0,0 +1,40 @@
+from pathlib import Path
+
+from PyTokenCounter.cli import ParseFiles
+from PyTokenCounter.encoding_utils import ReadTextFile
+
+TEST_DIR = Path(__file__).resolve().parent.parent / "Tests"
+INPUT_DIR = TEST_DIR / "Input"
+ANSWERS_DIR = TEST_DIR / "Answers"
+
+
+def test_parse_files_glob():
+    pattern = "Input/TestFile*.txt"
+    expected = {
+        "Input/TestFile1.txt",
+        "Input/TestFile2.txt",
+    }
+    result = set(ParseFiles([pattern]))
+    assert result == expected
+
+
+def test_parse_files_glob_recursive():
+    pattern = "Input/**/*.txt"
+    expected = {
+        "Input/TestFile1.txt",
+        "Input/TestFile2.txt",
+        "Input/TestDirectory/TestDir1.txt",
+        "Input/TestDirectory/TestDir2.txt",
+        "Input/TestDirectory/TestDir3.txt",
+        "Input/TestDirectory/TestSubDir/TestDir4.txt",
+        "Input/TestDirectory/TestSubDir/TestDir5.txt",
+    }
+    result = set(ParseFiles([pattern]))
+    assert result == expected
+
+
+def test_read_text_file_windows1252():
+    file_path = INPUT_DIR / "TestFile1252.txt"
+    expected = "Café – résumé naïve fiancé"
+    result = ReadTextFile(file_path)
+    assert result == expected