Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 47 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Continuous integration: lint with flake8 and run the pytest suite.
# Markdown-only changes never trigger the workflow (paths-ignore), and pull
# requests whose Python changes touch only comment lines skip lint and tests.
name: CI

on:
  push:
    branches: [ main ]
    paths-ignore:
      - '**/*.md'
  pull_request:
    paths-ignore:
      - '**/*.md'

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.x'
      - name: Detect comment-only changes
        id: changes
        env:
          # Pass the context value through the environment instead of
          # splicing it into the script text, so it is always one quoted word.
          BASE_REF: ${{ github.base_ref }}
        run: |
          # The base ref is only populated for pull_request events. On a push
          # there is no base branch to diff against, so always run the checks.
          if [ -z "$BASE_REF" ]; then
            echo "run=true" >> "$GITHUB_OUTPUT"
            exit 0
          fi

          git fetch --depth=2 origin "$BASE_REF"
          CHANGED=$(git diff --name-only "origin/$BASE_REF" HEAD -- '*.py')
          if [ -z "$CHANGED" ]; then
            echo "run=false" >> "$GITHUB_OUTPUT"
            exit 0
          fi

          # Keep added AND removed lines, drop the "+++ "/"--- " file headers,
          # then drop lines whose content is only a comment.
          # grep exits 1 when it selects nothing; "|| true" stops the default
          # "bash -e" shell from aborting the step in the comment-only case.
          DIFF=$(git diff "origin/$BASE_REF" HEAD -- '*.py' \
            | grep -E '^[+-]' \
            | grep -Ev '^(\+\+\+|---) ' \
            | grep -Ev '^[+-][[:space:]]*#' || true)
          if [ -z "$DIFF" ]; then
            echo "run=false" >> "$GITHUB_OUTPUT"
          else
            echo "run=true" >> "$GITHUB_OUTPUT"
          fi
      - name: Install dependencies
        if: steps.changes.outputs.run == 'true'
        run: |
          python -m pip install --upgrade pip
          pip install -e .
          pip install flake8
      - name: Run flake8
        if: steps.changes.outputs.run == 'true'
        run: flake8 .
      - name: Run pytest
        if: steps.changes.outputs.run == 'true'
        run: pytest
10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -1070,6 +1070,16 @@ Along with ignoring the extensions in the exclude list to quickly bypass known f

---

## Running Tests

After installing the package's dependencies, run the test suite with `pytest`:

```bash
pytest
```

The tests rely on the sample data located in the `Tests/` directory.

## Maintainers

- [Kaden Gruizenga](https://github.com/kgruiz)
Expand Down
4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ dependencies = [
"rich>=13.9.4",
"chardet>=5.2.0",
"colorlog>=6.9.0",
"pytest>=7.4.0",
"flake8>=6.1.0",
]

classifiers = [
Expand Down Expand Up @@ -62,4 +64,4 @@ requires = ["setuptools>=43.0.0", "wheel"]
build-backend = "setuptools.build_meta"

[tool.setuptools.packages.find]
exclude = ["Tests"]
exclude = ["Tests"]
4 changes: 4 additions & 0 deletions pytest.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
[pytest]
# Verbose output: list every test with its outcome.
addopts = -v
# The option is spelled `pythonpath` (singular); `pythonpaths` is not a pytest
# setting and is ignored with an "unknown config option" warning.
# NOTE(review): this puts the PyTokenCounter/ directory itself on sys.path,
# while the tests do `import PyTokenCounter`, which needs the directory that
# contains the package (or an installed package). Confirm the value is intended.
pythonpath = PyTokenCounter

152 changes: 152 additions & 0 deletions tests/test_models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
import json
from pathlib import Path
import pytest
import tiktoken
import PyTokenCounter as tc

# Directory two levels above this file (the parent of the folder holding it).
_REPO_ROOT = Path(__file__).resolve().parent.parent

# Sample-data directory used by the tests, and its "Answers" subdirectory.
TEST_DIR = _REPO_ROOT.joinpath("Tests")
ANSWERS_DIR = TEST_DIR.joinpath("Answers")


def test_get_model_mappings():
    """Check that GetModelMappings() equals the expected model -> encoding table."""
    models_by_encoding = {
        "o200k_base": ("gpt-4o", "gpt-4o-mini"),
        "cl100k_base": (
            "gpt-4-turbo",
            "gpt-4",
            "gpt-3.5-turbo",
            "text-embedding-ada-002",
            "text-embedding-3-small",
            "text-embedding-3-large",
        ),
        "p50k_base": ("Codex models", "text-davinci-002", "text-davinci-003"),
        "r50k_base": ("GPT-3 models like davinci",),
    }
    # Invert the grouping into the flat {model: encoding} dict being compared.
    expected_mappings = {
        model_name: encoding_name
        for encoding_name, model_names in models_by_encoding.items()
        for model_name in model_names
    }

    actual_mappings = tc.GetModelMappings()

    assert actual_mappings == expected_mappings


def test_get_valid_models():
    """Check that GetValidModels() yields the expected model names, ignoring order."""
    expected_models = {
        "gpt-4o",
        "gpt-4o-mini",
        "gpt-4-turbo",
        "gpt-4",
        "gpt-3.5-turbo",
        "text-embedding-ada-002",
        "text-embedding-3-small",
        "text-embedding-3-large",
        "Codex models",
        "text-davinci-002",
        "text-davinci-003",
        "GPT-3 models like davinci",
    }

    reported_models = set(tc.GetValidModels())

    assert reported_models == expected_models


def test_get_valid_encodings():
    """Check that GetValidEncodings() yields the four expected names, ignoring order."""
    expected_encodings = {"o200k_base", "cl100k_base", "p50k_base", "r50k_base"}

    reported_encodings = set(tc.GetValidEncodings())

    assert reported_encodings == expected_encodings


def test_get_model_for_encoding():
    """Check GetModelForEncoding() for each encoding object.

    Entries whose expected value is a list are compared after sorting both
    sides; the single-string entry is compared for plain equality.
    """
    models_by_encoding = {
        "o200k_base": ["gpt-4o", "gpt-4o-mini"],
        "cl100k_base": [
            "gpt-3.5-turbo",
            "gpt-4",
            "gpt-4-turbo",
            "text-embedding-3-large",
            "text-embedding-3-small",
            "text-embedding-ada-002",
        ],
        "p50k_base": ["Codex models", "text-davinci-002", "text-davinci-003"],
        "r50k_base": "GPT-3 models like davinci",
    }

    for encoding_name, expected_models in models_by_encoding.items():
        encoding_obj = tiktoken.get_encoding(encoding_name=encoding_name)
        actual_models = tc.GetModelForEncoding(encoding=encoding_obj)

        if not isinstance(expected_models, list):
            # Single expected model: compare the value directly.
            assert actual_models == expected_models
            continue

        assert sorted(actual_models) == sorted(expected_models)


def test_get_model_for_encoding_name():
    """Check GetModelForEncodingName() for each encoding name.

    List expectations are matched order-insensitively (both sides sorted);
    a string expectation must be returned exactly.
    """
    expected_by_encoding_name = {
        "o200k_base": ["gpt-4o", "gpt-4o-mini"],
        "cl100k_base": [
            "gpt-3.5-turbo",
            "gpt-4",
            "gpt-4-turbo",
            "text-embedding-3-large",
            "text-embedding-3-small",
            "text-embedding-ada-002",
        ],
        "p50k_base": ["Codex models", "text-davinci-002", "text-davinci-003"],
        "r50k_base": "GPT-3 models like davinci",
    }

    for encoding_name in expected_by_encoding_name:
        wanted = expected_by_encoding_name[encoding_name]
        got = tc.GetModelForEncodingName(encodingName=encoding_name)

        if isinstance(wanted, list):
            assert sorted(got) == sorted(wanted)
        else:
            assert got == wanted


def test_get_encoding_for_model():
    """Check that GetEncodingForModel() returns an object whose .name matches per model."""
    expected_pairs = [
        ("gpt-4o", "o200k_base"),
        ("gpt-4o-mini", "o200k_base"),
        ("gpt-4-turbo", "cl100k_base"),
        ("gpt-4", "cl100k_base"),
        ("gpt-3.5-turbo", "cl100k_base"),
        ("text-embedding-ada-002", "cl100k_base"),
        ("text-embedding-3-small", "cl100k_base"),
        ("text-embedding-3-large", "cl100k_base"),
        ("text-davinci-002", "p50k_base"),
        ("text-davinci-003", "p50k_base"),
    ]

    for model_name, expected_encoding_name in expected_pairs:
        encoding_obj = tc.GetEncodingForModel(modelName=model_name)
        assert encoding_obj.name == expected_encoding_name


def test_get_encoding_name_for_model():
    """Check that GetEncodingNameForModel() returns the expected encoding name per model."""
    models_by_encoding = {
        "o200k_base": ("gpt-4o", "gpt-4o-mini"),
        "cl100k_base": (
            "gpt-4-turbo",
            "gpt-4",
            "gpt-3.5-turbo",
            "text-embedding-ada-002",
            "text-embedding-3-small",
            "text-embedding-3-large",
        ),
        "p50k_base": ("Codex models", "text-davinci-002", "text-davinci-003"),
        "r50k_base": ("GPT-3 models like davinci",),
    }

    for expected_encoding_name, model_names in models_by_encoding.items():
        for model_name in model_names:
            actual_encoding_name = tc.GetEncodingNameForModel(modelName=model_name)
            assert actual_encoding_name == expected_encoding_name


def test_get_encoding():
    """Check GetEncoding() by model, by encoding name, and with both arguments.

    The three consistent calls must return an encoding with the expected
    .name; a model paired with a different encoding name must raise ValueError.
    """
    consistent_calls = [
        ({"model": "gpt-3.5-turbo"}, "cl100k_base"),
        ({"encodingName": "p50k_base"}, "p50k_base"),
        ({"model": "gpt-4-turbo", "encodingName": "cl100k_base"}, "cl100k_base"),
    ]
    for call_kwargs, expected_name in consistent_calls:
        resolved = tc.GetEncoding(**call_kwargs)
        assert resolved.name == expected_name

    # The model and the encoding name disagree here.
    conflicting_kwargs = {"model": "gpt-3.5-turbo", "encodingName": "p50k_base"}
    with pytest.raises(ValueError):
        tc.GetEncoding(**conflicting_kwargs)


def test_get_encoding_error():
    """Check that GetEncoding() called with no arguments raises ValueError."""
    expect_value_error = pytest.raises(ValueError)
    with expect_value_error:
        tc.GetEncoding()

Loading
Loading