Skip to content

Commit 27eef74

Browse files
[duplicate] Support for o200k_base and gpt-4o (omni) model (#43)
* Adding o200k_base.tiktoken * Support for o200k_base and gpt-4o (omni) * Fixing typo Accidentally removed a using statement in my last update * Fixed issue with some tests Still failing in a handful of tests of the new o200k * All tests now passing (fixed typo) * architecture: x64 # Add this line * attempt * Update dotnet-build-test.yml --------- Co-authored-by: Tom Winzig <winzig@users.noreply.github.com> Co-authored-by: Tom Winzig <thomas@winzig.com>
1 parent c7de8c0 commit 27eef74

10 files changed

Lines changed: 200310 additions & 6 deletions

File tree

.github/workflows/dotnet-build-test.yml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ jobs:
88
strategy:
99
fail-fast: false
1010
matrix:
11-
os: [windows-latest, ubuntu-latest, macos-latest]
11+
os: [windows-latest, ubuntu-latest]
1212
runs-on: ${{ matrix.os }}
1313
steps:
1414
- name: Checkout repository
@@ -21,6 +21,10 @@ jobs:
2121
3.1.x
2222
6.0.x
2323
8.0.x
24+
architecture: x64
25+
26+
- name: Log .NET SDK versions
27+
run: dotnet --info
2428

2529
- name: Restore dependencies
2630
run: dotnet restore

README.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ SharpToken currently supports the following models:
7878
* `p50k_base`
7979
* `p50k_edit`
8080
* `cl100k_base`
81+
* `o200k_base`
8182

8283
You can use any of these models when creating an instance of GptEncoding:
8384

@@ -86,6 +87,7 @@ var r50kBaseEncoding = GptEncoding.GetEncoding("r50k_base");
8687
var p50kBaseEncoding = GptEncoding.GetEncoding("p50k_base");
8788
var p50kEditEncoding = GptEncoding.GetEncoding("p50k_edit");
8889
var cl100kBaseEncoding = GptEncoding.GetEncoding("cl100k_base");
90+
var o200kBaseEncoding = GptEncoding.GetEncoding("o200k_base");
8991
```
9092

9193
### Model Prefix Matching
@@ -96,11 +98,13 @@ Here are the current supported prefixes and their corresponding encodings:
9698

9799
| Model Prefix | Encoding |
98100
|---------------------|------------|
101+
| `gpt-4o` | `o200k_base` |
99102
| `gpt-4-` | `cl100k_base` |
100103
| `gpt-3.5-turbo-` | `cl100k_base` |
101104
| `gpt-35-turbo` | `cl100k_base` |
102105

103106
Examples of model names that fall under these prefixes include:
107+
- For the prefix `gpt-4o`: `gpt-4o`, `gpt-4o-2024-05-13`, etc.
104108
- For the prefix `gpt-4-`: `gpt-4-0314`, `gpt-4-32k`, etc.
105109
- For the prefix `gpt-3.5-turbo-`: `gpt-3.5-turbo-0301`, `gpt-3.5-turbo-0401`, etc.
106110
- For the prefix `gpt-35-turbo`: Azure deployment names such as `gpt-35-turbo` and `gpt-35-turbo-16k`.
@@ -256,7 +260,7 @@ public class CompareBenchmark
256260

257261
return sum;
258262
}
259-
263+
260264
[Benchmark]
261265
public int MLTokenizers()
262266
{

SharpToken.Benchmark/CompareBenchmark.cs

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,8 @@ public class CompareBenchmark
2020
private Tokenizer _mlTokenizer;
2121
private string _kLongText;
2222

23-
[GlobalSetup]
24-
public async Task Setup()
23+
[GlobalSetup] // TODO: move this to SetupO200k?
24+
public async Task SetupCL100k()
2525
{
2626
_sharpToken = GptEncoding.GetEncoding("cl100k_base");
2727
_tikToken = await TikToken.GetEncodingAsync("cl100k_base").ConfigureAwait(false);
@@ -30,6 +30,15 @@ public async Task Setup()
3030
_kLongText = "King Lear, one of Shakespeare's darkest and most savage plays, tells the story of the foolish and Job-like Lear, who divides his kingdom, as he does his affections, according to vanity and whim. Lear’s failure as a father engulfs himself and his world in turmoil and tragedy.";
3131
}
3232

33+
public async Task SetupO200k()
34+
{
35+
_sharpToken = GptEncoding.GetEncoding("o200k_base");
36+
_tikToken = await TikToken.GetEncodingAsync("o200k_base").ConfigureAwait(false);
37+
_tokenizer = await TokenizerBuilder.CreateByModelNameAsync("gpt-4o").ConfigureAwait(false);
38+
_mlTokenizer = Tokenizer.CreateTiktokenForModel("gpt-4o");
39+
_kLongText = "King Lear, one of Shakespeare's darkest and most savage plays, tells the story of the foolish and Job-like Lear, who divides his kingdom, as he does his affections, according to vanity and whim. Lear’s failure as a father engulfs himself and his world in turmoil and tragedy.";
40+
}
41+
3342
[Benchmark]
3443
public int SharpToken()
3544
{

SharpToken.Tests/SharpToken.Tests.cs

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ namespace SharpToken.Tests;
66

77
public class Tests
88
{
9-
private static readonly List<string> ModelsList = new() { "p50k_base", "r50k_base", "cl100k_base" };
9+
private static readonly List<string> ModelsList = new() { "p50k_base", "r50k_base", "cl100k_base", "o200k_base" };
1010

1111
private static readonly List<Tuple<string, string, List<int>>> TestData =
1212
TestHelpers.ReadTestPlans("SharpToken.Tests.data.TestPlans.txt");
@@ -70,7 +70,6 @@ public async Task TestEncodingAndDecodingInParallel()
7070
}
7171
}
7272

73-
7473
[Test]
7574
public void TestEncodingWithCustomAllowedSet()
7675
{

SharpToken.Tests/data/TestPlanGenerator.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ def save_test_plans(test_plans, filename):
3535
tiktoken.get_encoding("p50k_base"),
3636
tiktoken.get_encoding("p50k_edit"),
3737
tiktoken.get_encoding("cl100k_base"),
38+
tiktoken.get_encoding("o200k_base"),
3839
]
3940

4041
test_samples = read_test_samples(samples_filename)

SharpToken.Tests/data/TestPlans.txt

Lines changed: 259 additions & 0 deletions
Large diffs are not rendered by default.

SharpToken/Lib/Internals/ModelParamsGenerator.cs

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,9 @@ public static ModelParams GetModelParams(string encodingName)
5353
case "cl100k_base":
5454
return Cl100KBase();
5555

56+
case "o200k_base":
57+
return O200KBase();
58+
5659
default:
5760
throw new ArgumentException($"Unknown encoding name: {encodingName}");
5861
}
@@ -119,6 +122,24 @@ private static ModelParams Cl100KBase()
119122
specialTokens: specialTokens
120123
);
121124
}
125+
126+
private static ModelParams O200KBase()
127+
{
128+
var mergeableRanks = EmbeddedResourceReader.LoadTokenBytePairEncoding("SharpToken.data.o200k_base.tiktoken");
129+
130+
var specialTokens = new Dictionary<string, int>
131+
{
132+
{ EndOfText, 199999 },
133+
{ EndOfPrompt, 200018 }
134+
};
135+
136+
return new ModelParams
137+
(
138+
tokenizerRegex: ModelParamsGeneratorRegex.RegexO200KBase(),
139+
mergeableRanks: mergeableRanks,
140+
specialTokens: specialTokens
141+
);
142+
}
122143
}
123144

124145
internal sealed partial class ModelParamsGeneratorRegex
@@ -129,10 +150,15 @@ internal sealed partial class ModelParamsGeneratorRegex
129150

130151
[GeneratedRegex(@"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+")]
131152
public static partial Regex RegexCl100KBase();
153+
154+
[GeneratedRegex(@"[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+")]
155+
public static partial Regex RegexO200KBase();
132156
#else
133157
public static Regex Regex50KBase() => new Regex(@"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+", RegexOptions.Compiled);
134158

135159
public static Regex RegexCl100KBase() => new Regex(@"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+", RegexOptions.Compiled);
160+
161+
public static Regex RegexO200KBase() => new Regex(@"[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+", RegexOptions.Compiled);
136162
#endif
137163
}
138164
}

SharpToken/Lib/Model.cs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ public static class Model
88
private static readonly Dictionary<string, string> ModelToEncodingMapping = new Dictionary<string, string>
99
{
1010
// chat
11+
{ "gpt-4o", "o200k_base" },
1112
{ "gpt-4", "cl100k_base" },
1213
{ "gpt-3.5-turbo-16k", "cl100k_base" },
1314
{ "gpt-35-turbo-16k", "cl100k_base" }, // Azure deployment name
@@ -53,6 +54,7 @@ public static class Model
5354

5455
private static readonly Dictionary<string, string> ModelPrefixToEncodingMapping = new Dictionary<string, string>
5556
{
57+
{ "gpt-4o", "o200k_base" }, // NOTE: no trailing dash, on purpose, so the prefix matches gpt-4o itself as well as gpt-4o-2024-05-13, etc.
5658
{ "gpt-4-", "cl100k_base" }, // e.g., gpt-4-0314, etc., plus gpt-4-32k
5759
{ "gpt-3.5-turbo-", "cl100k_base" }, // e.g., gpt-3.5-turbo-0301, -0401, etc.
5860
{ "gpt-35-turbo", "cl100k_base" }, // Azure deployment name

SharpToken/SharpToken.csproj

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,11 +33,13 @@
3333
<None Remove="Lib\" />
3434
<None Remove="data\" />
3535
<None Remove="data\cl100k_base.tiktoken" />
36+
<None Remove="data\o200k_base.tiktoken" />
3637
<None Remove="data\p50k_base.tiktoken" />
3738
<None Remove="data\r50k_base.tiktoken" />
3839
</ItemGroup>
3940
<ItemGroup>
4041
<EmbeddedResource Include="data\cl100k_base.tiktoken" />
42+
<EmbeddedResource Include="data\o200k_base.tiktoken" />
4143
<EmbeddedResource Include="data\p50k_base.tiktoken" />
4244
<EmbeddedResource Include="data\r50k_base.tiktoken" />
4345
</ItemGroup>

0 commit comments

Comments
 (0)