Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ target_include_directories(pytokenizer_compile_config
add_subdirectory(csrc/src/core)
add_subdirectory(csrc/bindings)
add_subdirectory(test)
add_subdirectory(benchmarks)

add_library(pytokenizer INTERFACE)
target_link_libraries(pytokenizer INTERFACE pytokenizer_core)
Expand Down
8 changes: 6 additions & 2 deletions csrc/include/libtokenizer/core/tokenizer.hpp
Original file line number Diff line number Diff line change
@@ -1,13 +1,17 @@
#pragma once

#include <string>
#include <vector>
#include <map>
#include <utility>

namespace pytokenizer {
namespace core {

/// Stateless byte-pair-encoding (BPE) utilities.
///
/// Both operations are pure functions of their inputs, so they are exposed
/// as static members; no Tokenizer instance state is required.
class Tokenizer {
public:
    /// Count how often each adjacent pair (ids[i], ids[i+1]) occurs in `ids`.
    /// @param ids token-id sequence to scan (may be empty)
    /// @return map from (left, right) pair to its occurrence count
    static std::map<std::pair<int, int>, int> get_stats(const std::vector<int>& ids);

    /// Return a copy of `ids` in which every consecutive occurrence of
    /// `pair` is replaced by the single new token id `idx`.
    /// @param ids  input token-id sequence (not modified)
    /// @param pair adjacent (left, right) pair to collapse
    /// @param idx  replacement token id
    /// @return new sequence with the pair merged
    static std::vector<int> merge(const std::vector<int>& ids, const std::pair<int, int>& pair, int idx);

    // NOTE(review): no definition for this overload is visible in
    // tokenizer.cpp (the std::string-returning foo() was removed there);
    // calling it will fail at link time — confirm it is still needed.
    void foo();
};

} // namespace core
Expand Down
30 changes: 28 additions & 2 deletions csrc/src/core/tokenizer.cpp
Original file line number Diff line number Diff line change
@@ -1,10 +1,36 @@
#include "libtokenizer/core/tokenizer.hpp"
#include <algorithm>
#include <iostream>
#include <string>
#include <vector>
#include <map>
#include "libtokenizer/libtokenizer.hpp"

namespace pytokenizer {
namespace core {

std::string Tokenizer::foo() { return "foo"; }
std::map<std::pair<int, int>, int> Tokenizer::get_stats(const std::vector<int>& ids) {
std::map<std::pair<int, int>, int> counts;
for (size_t i = 0; i < ids.size() - 1; ++i) {
std::pair<int, int> pair = {ids[i], ids[i + 1]};
counts[pair]++;
}
return counts;
}

std::vector<int> Tokenizer::merge(const std::vector<int>& ids, const std::pair<int, int>& pair, int idx) {
std::vector<int> newids;
size_t i = 0;
while (i < ids.size()) {
if (i < ids.size() - 1 && ids[i] == pair.first && ids[i + 1] == pair.second) {
newids.push_back(idx);
i += 2;
} else {
newids.push_back(ids[i]);
i += 1;
}
}
return newids;
}

} // namespace core
} // namespace pytokenizer
78 changes: 78 additions & 0 deletions src/pytokenizer/python_implementation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# Sample text exercising several byte classes: ASCII letters, punctuation,
# digits, symbols, and an apostrophe.
text = "Hello, world! This is a test sentence for a tokenizer. It includes numbers like 123.45, special characters (@#$), and a contraction (don't)."

# The raw UTF-8 bytes of the text as a list of ints (0..255) — the initial
# token stream before any BPE merges are applied.
tokens = text.encode("utf-8")
tokens = list(map(int, tokens))

def get_stats(ids):
    """Count occurrences of each adjacent (ids[i], ids[i+1]) pair.

    Returns a dict mapping (left, right) -> count; empty for inputs
    with fewer than two elements.
    """
    counts = {}
    for left, right in zip(ids, ids[1:]):
        key = (left, right)
        if key in counts:
            counts[key] += 1
        else:
            counts[key] = 1
    return counts

# Pair-frequency table for the raw byte stream.
stats = get_stats(tokens) # {(72, 101): 1, ...}

# Most frequent adjacent pair — the first candidate for a BPE merge.
top_pair = max(stats, key = stats.get)

def merge(ids, pair, idx):
    """Return a copy of ids where every consecutive occurrence of `pair`
    is replaced by the single new token `idx`.

    The input list is not modified.
    """
    newids = []
    i = 0
    n = len(ids)
    while i < n:
        matches = i + 1 < n and ids[i] == pair[0] and ids[i + 1] == pair[1]
        if matches:
            # Consume both halves of the pair, emit the merged token.
            newids.append(idx)
            i += 2
        else:
            newids.append(ids[i])
            i += 1
    return newids

# Apply a single merge of the most frequent pair, using 256 as the first
# token id beyond the raw byte range.
tokens2 = merge(tokens, top_pair, 256)

#print(stats)
#print(top_pair)
# The merged stream should be shorter than the original byte stream.
print(len(tokens))
print(len(tokens2))
#print(merge([5, 6, 6, 7, 9, 1], (6, 7), 99))



# ---- Tokenization ----
# Train the BPE merge table: repeatedly replace the most frequent adjacent
# pair with a fresh token id, starting at 256 (just past the byte range).
ids = list(tokens) # copy so don't destroy original list

# Target vocabulary of 276 tokens: 256 raw bytes + 20 learned merges.
vocab_size = 276
num_merges = vocab_size - 256

# merges maps (left, right) token pair -> the new token id it merges into,
# recorded in the order the merges were learned.
merges = {}
for i in range(num_merges):
    stats = get_stats(ids)
    pair = max(stats, key=stats.get)
    idx = 256 + i
    print(f"merging {pair} into a new token {idx}")
    ids = merge(ids, pair, idx)
    merges[pair] = idx

# Compression ratio achieved by the learned merges (original length /
# merged length).
print(len(tokens) / len(ids))

# vocab maps token id -> the byte sequence it decodes to: the 256 single
# bytes, then each merged token as the concatenation of its two parts.
# Iterating `merges` in insertion order guarantees both parts already
# exist in vocab when a merged entry is built.
vocab = {idx: bytes([idx]) for idx in range(256)}
for (p0, p1), idx in merges.items():
    vocab[idx] = vocab[p0] + vocab[p1]
def decode(ids):
    """Map a list of token ids back to a Python string via the vocab table.

    Invalid UTF-8 byte sequences are substituted (errors='replace')
    rather than raising.
    """
    raw = b"".join(vocab[token] for token in ids)
    return raw.decode("utf-8", errors='replace')

def encode(text):
    """Map a string to a list of token ids by applying the learned merges.

    Greedily merges the pair with the lowest merge rank (earliest learned)
    first, mirroring the order used during training, until no pair in the
    stream appears in `merges`.
    """
    out = list(text.encode("utf-8"))
    while len(out) >= 2:
        pair_counts = get_stats(out)
        best = min(pair_counts, key=lambda p: merges.get(p, float("inf")))
        if best not in merges:
            # No remaining adjacent pair is mergeable.
            break
        out = merge(out, best, merges[best])
    return out

# Round-trip sanity check: text not seen during training should survive
# encode -> decode unchanged.
print(decode(encode("HELLO hello world!")))
20 changes: 0 additions & 20 deletions test/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,27 +1,7 @@
project(tests LANGUAGES CXX)

# EXECUTABLES ---------------------------------
# The merged diff view retained the deleted googletest FetchContent block;
# this PR removed it, leaving only the plain playground executable.
enable_testing()

add_executable(playground playground.cpp)
target_link_libraries(playground
PRIVATE pytokenizer
)

include(GoogleTest)
# gtest_discover_tests(_tests PROPERTIES LABELS "library_tests")
26 changes: 24 additions & 2 deletions test/playground.cpp
Original file line number Diff line number Diff line change
@@ -1,11 +1,33 @@
// main entry point for the program. Ideally in the future we'll setup a testing
// framework like Gtest, but this will be okay for now just for quick testing.

#include <algorithm>
#include <iostream>
#include <string>
#include <vector>
#include "libtokenizer/libtokenizer.hpp"

int main() {
    // NOTE: the old body (`Tokenizer tokenizer; tokenizer.foo();`) called
    // the removed foo() member and has been dropped.
    using namespace pytokenizer::core;

    // Seed the token ids with the raw character values of a sample string.
    std::string text = "Wow, C++ is so hard!";
    std::vector<int> ids(text.begin(), text.end());

    // Perform three BPE merge rounds, assigning new ids from 256 upward.
    int next_id = 256;
    for (int i = 0; i < 3; ++i) {
        auto stats = Tokenizer::get_stats(ids);
        // Defensive: max_element on an empty map would dereference end().
        if (stats.empty()) break;
        auto best = std::max_element(stats.begin(), stats.end(),
            [](const auto& a, const auto& b) { return a.second < b.second; });

        std::cout << "Merging (" << best->first.first << ", " << best->first.second
            << ") -> " << next_id << " [freq=" << best->second << "]\n";

        ids = Tokenizer::merge(ids, best->first, next_id++);
    }

    std::cout << "Final ids:";
    for (int id : ids) std::cout << " " << id;
    std::cout << "\n";

    return 0;
}
Loading