Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 60 additions & 2 deletions SharpToken.Tests/SharpToken.Tests.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
using System.Net.Http;
using System.Net.Http;
using System.Text;
using System.Linq;
using NUnit.Framework;
Expand All @@ -7,7 +7,7 @@ namespace SharpToken.Tests;

public class Tests
{
private static readonly List<string> ModelsList = new() { "p50k_base", "r50k_base", "cl100k_base", "o200k_base", "o200k_harmony" };
private static readonly List<string> ModelsList = new() { "p50k_base", "r50k_base", "cl100k_base", "o200k_base", "o200k_harmony", "claude" };

private static readonly List<Tuple<string, string, List<int>>> TestData =
TestHelpers.ReadTestPlans("SharpToken.Tests.data.TestPlans.txt");
Expand Down Expand Up @@ -206,6 +206,13 @@ public async Task TestLocalResourceMatchesRemoteResource(string modelName)
return;
}

// Skip claude as it doesn't have a remote resource on openaipublic.blob.core.windows.net
if (modelName == "claude")
{
Assert.Pass("claude is an Anthropic encoding without a remote resource on openaipublic.blob.core.windows.net");
return;
}

var embeddedResourceName = $"SharpToken.data.{modelName}.tiktoken";
var remoteResourceUrl = $"https://openaipublic.blob.core.windows.net/encodings/{modelName}.tiktoken";

Expand Down Expand Up @@ -286,6 +293,56 @@ public void TestGPT5ModelMappings()
Assert.That(Model.GetEncodingNameForModel("gpt-5-chat-latest"), Is.EqualTo("o200k_base"));
}

/// <summary>
/// Round-trips a short ASCII string through the "claude" encoding and checks that
/// decoding the produced tokens yields the original text.
/// </summary>
[Test]
public void TestClaudeBasicEncodingDecoding()
{
    const string original = "Hello, world!";
    var claude = GptEncoding.GetEncoding("claude");

    var tokens = claude.Encode(original);
    var roundTripped = claude.Decode(tokens);

    Assert.That(roundTripped, Is.EqualTo(original));
}

/// <summary>
/// Checks that a selection of Claude model names, including a dated variant,
/// all resolve to the "claude" encoding name.
/// </summary>
[Test]
public void TestClaudeModelMappings()
{
    // One call per model keeps the failing line visible in the stack trace.
    void AssertResolvesToClaude(string modelName)
    {
        Assert.That(Model.GetEncodingNameForModel(modelName), Is.EqualTo("claude"));
    }

    AssertResolvesToClaude("claude-3-opus");
    AssertResolvesToClaude("claude-3.5-sonnet");
    AssertResolvesToClaude("claude-3-opus-20240229");
    AssertResolvesToClaude("claude-2");
    AssertResolvesToClaude("claude-instant-1");
}

/// <summary>
/// Verifies that the "claude" encoding normalizes its input with NFKC before tokenizing,
/// so fullwidth Latin letters produce the same tokens as their ASCII counterparts.
/// </summary>
[Test]
public void TestClaudeNfkcNormalization()
{
    var encoding = GptEncoding.GetEncoding("claude");

    // Fullwidth "Hello" (U+FF28 U+FF45 U+FF4C U+FF4C U+FF4F), written as escapes so the
    // code points survive any editor or tool that rewrites or normalizes the source text.
    const string fullwidthHello = "\uFF28\uFF45\uFF4C\uFF4C\uFF4F";
    const string asciiHello = "Hello";

    // Guard against a vacuous test: the two inputs must really differ, and NFKC must be
    // the transformation that maps one onto the other.
    Assert.That(fullwidthHello, Is.Not.EqualTo(asciiHello));
    Assert.That(fullwidthHello.Normalize(NormalizationForm.FormKC), Is.EqualTo(asciiHello));

    var fullwidthEncoded = encoding.Encode(fullwidthHello);
    var asciiEncoded = encoding.Encode(asciiHello);

    // NFKC normalization should make them produce the same tokens
    Assert.That(fullwidthEncoded, Is.EqualTo(asciiEncoded));
}

/// <summary>
/// Exercises special-token handling of the "claude" encoding: "&lt;EOT&gt;" encodes to the
/// single id 0 when explicitly allowed, and is rejected when no allow-list is supplied.
/// </summary>
[Test]
public void TestClaudeSpecialTokens()
{
    var claude = GptEncoding.GetEncoding("claude");

    // With the token on the allow-list it must come back as exactly one id.
    var permitted = new HashSet<string> { "<EOT>" };
    var tokenIds = claude.Encode("<EOT>", permitted);
    var expectedIds = new List<int> { 0 };
    Assert.That(tokenIds, Is.EqualTo(expectedIds));

    // Without an allow-list the same text must raise ArgumentException.
    Assert.Throws<ArgumentException>(() => claude.Encode("<EOT>"));
}

private static HashSet<string> GetSpecialTokensForEncoding(string encodingName)
{
return encodingName switch
Expand All @@ -299,6 +356,7 @@ private static HashSet<string> GetSpecialTokensForEncoding(string encodingName)
"<|endoftext|>", "<|endofprompt|>", "<|startoftext|>", "<|return|>", "<|constrain|>",
"<|channel|>", "<|start|>", "<|end|>", "<|message|>", "<|call|>"
}.Union(Enumerable.Range(200000, 1088).Select(i => $"<|reserved_{i}|>"))),
"claude" => new HashSet<string> { "<EOT>", "<META>", "<META_START>", "<META_END>", "<SOS>" },
_ => new HashSet<string>()
};
}
Expand Down
18 changes: 16 additions & 2 deletions SharpToken/Lib/Encoding.cs
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,14 @@ public class GptEncoding
{
private readonly BytePairEncodingCore _bytePairEncodingCoreProcessor;
private readonly Dictionary<string, int> _specialTokenMappings;
private readonly NormalizationForm? _textNormalization;

private GptEncoding(
Regex tokenizerRegex,
BytePairIndex bytePairRanks,
Dictionary<string, int> specialTokenMappings,
int? explicitNVocab = null
int? explicitNVocab = null,
NormalizationForm? textNormalization = null
)
{
var maxTokenValue = Math.Max(
Expand All @@ -40,6 +42,7 @@ private GptEncoding(
}
}

_textNormalization = textNormalization;
_bytePairEncodingCoreProcessor = new BytePairEncodingCore(bytePairRanks, specialTokenMappings, tokenizerRegex);

int GetMaxValueFromBytePairRanks(BytePairIndex dictionary)
Expand All @@ -61,7 +64,8 @@ public static GptEncoding GetEncoding(string encodingName)
modelParams.TokenizerRegex,
modelParams.MergeableRanks,
modelParams.SpecialTokens,
modelParams.ExplicitNVocab
modelParams.ExplicitNVocab,
modelParams.TextNormalization
);

return encoding;
Expand Down Expand Up @@ -150,6 +154,16 @@ public string Decode(IEnumerable<int> inputTokensToDecode)
bool countOnly = false
)
{
if (_textNormalization.HasValue)
{
#if NET8_0_OR_GREATER
var normalized = new string(lineToEncode).Normalize(_textNormalization.Value);
lineToEncode = normalized.AsSpan();
#else
lineToEncode = lineToEncode.Normalize(_textNormalization.Value);
#endif
}

var allowedSpecialTokens = allowedSpecial is null
// When null allow nothing
? Array.Empty<string>()
Expand Down
32 changes: 31 additions & 1 deletion SharpToken/Lib/Internals/ModelParamsGenerator.cs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;

namespace SharpToken
Expand All @@ -11,17 +12,20 @@ internal readonly struct ModelParams
public Regex TokenizerRegex { get; }
public BytePairIndex MergeableRanks { get; }
public Dictionary<string, int> SpecialTokens { get; }
public NormalizationForm? TextNormalization { get; }

public ModelParams(
int? explicitNVocab = null,
Regex tokenizerRegex = null,
BytePairIndex mergeableRanks = null,
Dictionary<string, int> specialTokens = null)
Dictionary<string, int> specialTokens = null,
NormalizationForm? textNormalization = null)
{
ExplicitNVocab = explicitNVocab;
TokenizerRegex = tokenizerRegex;
MergeableRanks = mergeableRanks;
SpecialTokens = specialTokens ?? new Dictionary<string, int>();
TextNormalization = textNormalization;
}
}

Expand Down Expand Up @@ -59,6 +63,9 @@ public static ModelParams GetModelParams(string encodingName)
case "o200k_harmony":
return O200KHarmony();

case "claude":
return Claude();

default:
throw new ArgumentException($"Unknown encoding name: {encodingName}");
}
Expand Down Expand Up @@ -193,6 +200,29 @@ private static ModelParams O200KHarmony()
specialTokens: specialTokens
);
}


// Builds the parameters for the "claude" encoding: ranks loaded from the embedded
// claude.tiktoken resource, five special tokens with ids 0-4, the 50k-base tokenizer
// regex, and NFKC text normalization.
private static ModelParams Claude()
{
    // Load the ranks first so a missing resource surfaces before anything else is built.
    var ranks = EmbeddedResourceReader.LoadTokenBytePairEncoding("SharpToken.data.claude.tiktoken");

    var special = new Dictionary<string, int>
    {
        ["<EOT>"] = 0,
        ["<META>"] = 1,
        ["<META_START>"] = 2,
        ["<META_END>"] = 3,
        ["<SOS>"] = 4
    };

    var pattern = ModelParamsGeneratorRegex.Regex50KBase();

    return new ModelParams(
        tokenizerRegex: pattern,
        mergeableRanks: ranks,
        specialTokens: special,
        textNormalization: NormalizationForm.FormKC);
}
}

internal sealed partial class ModelParamsGeneratorRegex
Expand Down
16 changes: 15 additions & 1 deletion SharpToken/Lib/Model.cs
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,20 @@ public static class Model
{ "text-search-babbage-doc-001", "r50k_base" },
{ "text-search-ada-doc-001", "r50k_base" },
{ "code-search-babbage-code-001", "r50k_base" },
{ "code-search-ada-code-001", "r50k_base" }
{ "code-search-ada-code-001", "r50k_base" },
// Anthropic Claude
{ "claude-instant-1", "claude" },
{ "claude-2", "claude" },
{ "claude-2.0", "claude" },
{ "claude-2.1", "claude" },
{ "claude-3-opus", "claude" },
{ "claude-3-sonnet", "claude" },
{ "claude-3-haiku", "claude" },
{ "claude-3.5-sonnet", "claude" },
{ "claude-3.5-haiku", "claude" },
{ "claude-3.7-sonnet", "claude" },
{ "claude-4-opus", "claude" },
{ "claude-4-sonnet", "claude" }
};

private static readonly Dictionary<string, string> ModelPrefixToEncodingMapping = new Dictionary<string, string>
Expand All @@ -64,6 +77,7 @@ public static class Model
{ "gpt-4-", "cl100k_base" }, // e.g., gpt-4-0314, etc., plus gpt-4-32k
{ "gpt-3.5-turbo-", "cl100k_base" }, // e.g, gpt-3.5-turbo-0301, -0401, etc.
{ "gpt-35-turbo", "cl100k_base" }, // Azure deployment name
{ "claude-", "claude" }, // e.g., claude-3-opus-20240229, claude-3.5-sonnet-20241022, etc.
};

public static string GetEncodingNameForModel(string modelName)
Expand Down
2 changes: 2 additions & 0 deletions SharpToken/SharpToken.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,11 @@
<None Remove="data\o200k_base.tiktoken" />
<None Remove="data\p50k_base.tiktoken" />
<None Remove="data\r50k_base.tiktoken" />
<None Remove="data\claude.tiktoken" />
</ItemGroup>
<ItemGroup>
<EmbeddedResource Include="data\cl100k_base.tiktoken" />
<EmbeddedResource Include="data\claude.tiktoken" />
<EmbeddedResource Include="data\o200k_base.tiktoken" />
<EmbeddedResource Include="data\p50k_base.tiktoken" />
<EmbeddedResource Include="data\r50k_base.tiktoken" />
Expand Down
Loading
Loading