From 5665310ba98a1f42a6dc7a3558f4536fea7b079d Mon Sep 17 00:00:00 2001 From: Neil Lin Date: Sat, 7 Mar 2026 12:02:45 -0800 Subject: [PATCH 1/3] code now compiles, set up interface + compile options --- CMakeLists.txt | 18 +++++++++++++++++- csrc/include/libtokenizer/core/CMakeLists.txt | 0 csrc/include/libtokenizer/core/tokenizer.hpp | 12 ++++++++++++ csrc/include/libtokenizer/libtokenizer.hpp | 4 +++- csrc/src/core/CMakeLists.txt | 11 +++++++++++ csrc/src/core/tokenizer.cpp | 12 ++++++++++++ test/CMakeLists.txt | 7 +++++++ test/playground.cpp | 11 +++++++++++ 8 files changed, 73 insertions(+), 2 deletions(-) delete mode 100644 csrc/include/libtokenizer/core/CMakeLists.txt create mode 100644 csrc/src/core/CMakeLists.txt create mode 100644 test/CMakeLists.txt create mode 100644 test/playground.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 9050ec8..9d376a7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,8 +7,24 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON) # CORE TARGETS --------------------------------- -add_subdirectory(csrc/src/) +add_library(pytokenizer_compile_config INTERFACE) +target_compile_options(pytokenizer_compile_config + INTERFACE + $<$<CONFIG:Release>:-march=native;-ffast-math> # also compiles with -O3 -DNDEBUG + $<$<CONFIG:RelWithDebInfo>:-march=native;-ffast-math> # -g +) +target_include_directories(pytokenizer_compile_config + INTERFACE + $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/csrc/include> + $<INSTALL_INTERFACE:include> +) + +add_subdirectory(csrc/src/core) add_subdirectory(csrc/bindings) +add_subdirectory(test) + +add_library(pytokenizer INTERFACE) +target_link_libraries(pytokenizer INTERFACE pytokenizer_core) # FORMATTING --------------------------------- diff --git a/csrc/include/libtokenizer/core/CMakeLists.txt b/csrc/include/libtokenizer/core/CMakeLists.txt deleted file mode 100644 index e69de29..0000000 diff --git a/csrc/include/libtokenizer/core/tokenizer.hpp b/csrc/include/libtokenizer/core/tokenizer.hpp index e69de29..de2b30e 100644 --- a/csrc/include/libtokenizer/core/tokenizer.hpp +++ b/csrc/include/libtokenizer/core/tokenizer.hpp @@ -0,0 
+1,12 @@ +#pragma once + +namespace pytokenizer { +namespace core { + +class Tokenizer { +public: + void foo(); +}; + +} // namespace core +} // namespace pytokenizer \ No newline at end of file diff --git a/csrc/include/libtokenizer/libtokenizer.hpp b/csrc/include/libtokenizer/libtokenizer.hpp index 7b9637e..dbcb5c7 100644 --- a/csrc/include/libtokenizer/libtokenizer.hpp +++ b/csrc/include/libtokenizer/libtokenizer.hpp @@ -1 +1,3 @@ -#pragma once \ No newline at end of file +#pragma once + +#include "core/tokenizer.hpp" \ No newline at end of file diff --git a/csrc/src/core/CMakeLists.txt b/csrc/src/core/CMakeLists.txt new file mode 100644 index 0000000..5fbd5bf --- /dev/null +++ b/csrc/src/core/CMakeLists.txt @@ -0,0 +1,11 @@ +project(pytokenizer_core LANGUAGES CXX) + +set(PYTOKENIZER_SOURCES + tokenizer.cpp +) + +add_library(pytokenizer_core SHARED ${PYTOKENIZER_SOURCES}) + +target_link_libraries(pytokenizer_core + PUBLIC pytokenizer_compile_config +) \ No newline at end of file diff --git a/csrc/src/core/tokenizer.cpp b/csrc/src/core/tokenizer.cpp index e69de29..43581d8 100644 --- a/csrc/src/core/tokenizer.cpp +++ b/csrc/src/core/tokenizer.cpp @@ -0,0 +1,12 @@ +#include "libtokenizer/core/tokenizer.hpp" +#include <iostream> + +namespace pytokenizer { +namespace core { + +void Tokenizer::foo() { + std::cout << "foo" << std::endl; +} + +} // namespace core +} // namespace pytokenizer \ No newline at end of file diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt new file mode 100644 index 0000000..d61a005 --- /dev/null +++ b/test/CMakeLists.txt @@ -0,0 +1,7 @@ +project(tests LANGUAGES CXX) + +# EXECUTABLES --------------------------------- +add_executable(playground playground.cpp) +target_link_libraries(playground + PRIVATE pytokenizer +) \ No newline at end of file diff --git a/test/playground.cpp b/test/playground.cpp new file mode 100644 index 0000000..0c8b276 --- /dev/null +++ b/test/playground.cpp @@ -0,0 +1,11 @@ +// main entry point for the program. 
Ideally in the future we'll setup a testing +// framework like Gtest, but this will be okay for now just for quick testing. + +#include <iostream> +#include "libtokenizer/libtokenizer.hpp" + +int main() { + pytokenizer::core::Tokenizer tokenizer; + tokenizer.foo(); + return 0; +} \ No newline at end of file From 0cf7f0f740cc1f692a59bcb49179a8c11164f66d Mon Sep 17 00:00:00 2001 From: Neil Lin Date: Sat, 7 Mar 2026 12:08:56 -0800 Subject: [PATCH 2/3] linting workflow --- .github/workflows/lint.yml | 33 +++++++++++++++++++++++++++++++++ csrc/src/core/tokenizer.cpp | 4 +--- 2 files changed, 34 insertions(+), 3 deletions(-) create mode 100644 .github/workflows/lint.yml diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 0000000..4ec3968 --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,33 @@ +name: lint + +on: + push: + branches: ["main"] + pull_request: + branches: ["main"] + +jobs: + greet: + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Setup system deps + run: | + sudo apt-get update + sudo apt-get install -y \ + build-essential \ + cmake \ + git + + - name: Build CMake project + run: cmake -S . 
-B build -DCMAKE_BUILD_TYPE=Debug + + - name: Run C++ formatting + run: | + cmake --build build --target format + git diff --exit-code + + - name: Run C++ linting + run: cmake --build build --target lint diff --git a/csrc/src/core/tokenizer.cpp b/csrc/src/core/tokenizer.cpp index 43581d8..72e3b5d 100644 --- a/csrc/src/core/tokenizer.cpp +++ b/csrc/src/core/tokenizer.cpp @@ -4,9 +4,7 @@ namespace pytokenizer { namespace core { -void Tokenizer::foo() { - std::cout << "foo" << std::endl; -} +void Tokenizer::foo() { std::cout << "foo" << std::endl; } } // namespace core } // namespace pytokenizer \ No newline at end of file From d8b6024518f75ee617d0f18e466be19e4d226cc4 Mon Sep 17 00:00:00 2001 From: Markus Chu Date: Sat, 21 Mar 2026 12:28:29 -0700 Subject: [PATCH 3/3] Python implementation of BPE --- csrc/include/libtokenizer/core/tokenizer.hpp | 6 ++ csrc/src/core/tokenizer.cpp | 30 +++++++- src/pytokenizer/python_implementation.py | 78 ++++++++++++++++++++ test/playground.cpp | 26 ++++++- 4 files changed, 136 insertions(+), 4 deletions(-) create mode 100644 src/pytokenizer/python_implementation.py diff --git a/csrc/include/libtokenizer/core/tokenizer.hpp b/csrc/include/libtokenizer/core/tokenizer.hpp index de2b30e..b70737d 100644 --- a/csrc/include/libtokenizer/core/tokenizer.hpp +++ b/csrc/include/libtokenizer/core/tokenizer.hpp @@ -1,10 +1,16 @@ #pragma once +#include <map> +#include <utility> +#include <vector> + namespace pytokenizer { namespace core { class Tokenizer { public: + static std::map<std::pair<int, int>, int> get_stats(const std::vector<int>& ids); + static std::vector<int> merge(const std::vector<int>& ids, const std::pair<int, int>& pair, int idx); void foo(); }; diff --git a/csrc/src/core/tokenizer.cpp b/csrc/src/core/tokenizer.cpp index 72e3b5d..bd3d35d 100644 --- a/csrc/src/core/tokenizer.cpp +++ b/csrc/src/core/tokenizer.cpp @@ -1,10 +1,36 @@ -#include "libtokenizer/core/tokenizer.hpp" +#include <map> #include <iostream> +#include <utility> +#include <vector> +#include "libtokenizer/libtokenizer.hpp" namespace pytokenizer { namespace core 
{ -void Tokenizer::foo() { std::cout << "foo" << std::endl; } +std::map<std::pair<int, int>, int> Tokenizer::get_stats(const std::vector<int>& ids) { + std::map<std::pair<int, int>, int> counts; + for (size_t i = 0; i < ids.size() - 1; ++i) { + std::pair<int, int> pair = {ids[i], ids[i + 1]}; + counts[pair]++; + } + return counts; +} + +std::vector<int> Tokenizer::merge(const std::vector<int>& ids, const std::pair<int, int>& pair, int idx) { + std::vector<int> newids; + size_t i = 0; + while (i < ids.size()) { + if (i < ids.size() - 1 && ids[i] == pair.first && ids[i + 1] == pair.second) { + newids.push_back(idx); + i += 2; + } else { + newids.push_back(ids[i]); + i += 1; + } + } + return newids; +} } // namespace core } // namespace pytokenizer \ No newline at end of file diff --git a/src/pytokenizer/python_implementation.py b/src/pytokenizer/python_implementation.py new file mode 100644 index 0000000..3db1734 --- /dev/null +++ b/src/pytokenizer/python_implementation.py @@ -0,0 +1,78 @@ +# sample text +text = "Hello, world! This is a test sentence for a tokenizer. It includes numbers like 123.45, special characters (@#$), and a contraction (don't)." 
+ +tokens = text.encode("utf-8") +tokens = list(map(int, tokens)) + +def get_stats(ids): + counts = {} + for pair in zip(ids, ids[1:]): + counts[pair] = counts.get(pair, 0) + 1 + return counts + +stats = get_stats(tokens) # {(72, 101): 1, ...} + +top_pair = max(stats, key = stats.get) + +def merge(ids, pair, idx): + # in new list of ints (ids), replace all consecutive occurrences of pair with the new token idx + newids = [] + i = 0 + while i < len(ids): + if i < len(ids) - 1 and ids[i] == pair[0] and ids[i+1] == pair[1]: + newids.append(idx) + i += 2 + else: + newids.append(ids[i]) + i += 1 + return newids + +tokens2 = merge(tokens, top_pair, 256) + +#print(stats) +#print(top_pair) +print(len(tokens)) +print(len(tokens2)) +#print(merge([5, 6, 6, 7, 9, 1], (6, 7), 99)) + + + +# ---- Tokenization ---- +ids = list(tokens) # copy so don't destroy original list + +vocab_size = 276 +num_merges = vocab_size - 256 + +merges = {} +for i in range(num_merges): + stats = get_stats(ids) + pair = max(stats, key=stats.get) + idx = 256 + i + print(f"merging {pair} into a new token {idx}") + ids = merge(ids, pair, idx) + merges[pair] = idx + +print(len(tokens) / len(ids)) + +vocab = {idx: bytes([idx]) for idx in range(256)} +for (p0, p1), idx in merges.items(): + vocab[idx] = vocab[p0] + vocab[p1] +def decode(ids): + # given ids (list of integers), return Python string + tokens = b"".join(vocab[idx] for idx in ids) + text = tokens.decode("utf-8", errors='replace') + return text + +def encode(text): + # given a string, return list of integers (the token) + tokens = list(text.encode("utf-8")) + while len(tokens) >= 2: + stats = get_stats(tokens) + pair = min(stats, key=lambda p: merges.get(p, float("inf"))) + if pair not in merges: + break + idx = merges[pair] + tokens = merge(tokens, pair, idx) + return tokens + +print(decode(encode("HELLO hello world!"))) \ No newline at end of file diff --git a/test/playground.cpp b/test/playground.cpp index 0c8b276..7e57196 100644 --- 
a/test/playground.cpp +++ b/test/playground.cpp @@ -1,11 +1,33 @@ // main entry point for the program. Ideally in the future we'll setup a testing // framework like Gtest, but this will be okay for now just for quick testing. +#include <algorithm> #include <iostream> +#include <string> +#include <vector> #include "libtokenizer/libtokenizer.hpp" int main() { - pytokenizer::core::Tokenizer tokenizer; - tokenizer.foo(); + using namespace pytokenizer::core; + + std::string text = "Wow, C++ is so hard!"; + std::vector<int> ids(text.begin(), text.end()); + + int next_id = 256; + for (int i = 0; i < 3; ++i) { + auto stats = Tokenizer::get_stats(ids); + auto best = std::max_element(stats.begin(), stats.end(), + [](const auto& a, const auto& b) { return a.second < b.second; }); + + std::cout << "Merging (" << best->first.first << ", " << best->first.second + << ") -> " << next_id << " [freq=" << best->second << "]\n"; + + ids = Tokenizer::merge(ids, best->first, next_id++); + } + + std::cout << "Final ids:"; + for (int id : ids) std::cout << " " << id; + std::cout << "\n"; + return 0; } \ No newline at end of file