From 08d2dac022d8bf14ab156809f5d0fb37e7e0dbad Mon Sep 17 00:00:00 2001
From: Markus8888888
Date: Sat, 21 Mar 2026 12:53:47 -0700
Subject: [PATCH] Revert "Markus c++"

---
 CMakeLists.txt                               |  1 +
 csrc/include/libtokenizer/core/tokenizer.hpp |  8 +-
 csrc/src/core/tokenizer.cpp                  | 30 +-------
 src/pytokenizer/python_implementation.py     | 78 --------------------
 test/CMakeLists.txt                          | 20 +++++
 test/playground.cpp                          | 26 +------
 6 files changed, 27 insertions(+), 136 deletions(-)
 delete mode 100644 src/pytokenizer/python_implementation.py

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e87ae89..faf37ee 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -23,6 +23,7 @@ target_include_directories(pytokenizer_compile_config
 add_subdirectory(csrc/src/core)
 add_subdirectory(csrc/bindings)
 add_subdirectory(test)
+add_subdirectory(benchmarks)
 
 add_library(pytokenizer INTERFACE)
 target_link_libraries(pytokenizer INTERFACE pytokenizer_core)
diff --git a/csrc/include/libtokenizer/core/tokenizer.hpp b/csrc/include/libtokenizer/core/tokenizer.hpp
index b70737d..3e93162 100644
--- a/csrc/include/libtokenizer/core/tokenizer.hpp
+++ b/csrc/include/libtokenizer/core/tokenizer.hpp
@@ -1,17 +1,13 @@
 #pragma once
 
-#include <map>
-#include <utility>
-#include <vector>
+#include <string>
 
 namespace pytokenizer {
 namespace core {
 
 class Tokenizer {
    public:
-    static std::map<std::pair<int, int>, int> get_stats(const std::vector<int>& ids);
-    static std::vector<int> merge(const std::vector<int>& ids, const std::pair<int, int>& pair, int idx);
-    void foo();
+    std::string foo();
 };
 
 } // namespace core
diff --git a/csrc/src/core/tokenizer.cpp b/csrc/src/core/tokenizer.cpp
index bd3d35d..3a23c7d 100644
--- a/csrc/src/core/tokenizer.cpp
+++ b/csrc/src/core/tokenizer.cpp
@@ -1,36 +1,10 @@
-#include <map>
+#include "libtokenizer/core/tokenizer.hpp"
 
 #include <string>
-#include <utility>
-#include <vector>
-
-#include "libtokenizer/libtokenizer.hpp"
 
 namespace pytokenizer {
 namespace core {
 
-std::map<std::pair<int, int>, int> Tokenizer::get_stats(const std::vector<int>& ids) {
-    std::map<std::pair<int, int>, int> counts;
-    for (size_t i = 0; i < ids.size() - 1; ++i) {
-        std::pair<int, int> pair = {ids[i], ids[i + 1]};
-        counts[pair]++;
-    }
-    return counts;
-}
-
-std::vector<int> Tokenizer::merge(const std::vector<int>& ids, const std::pair<int, int>& pair, int idx) {
-    std::vector<int> newids;
-    size_t i = 0;
-    while (i < ids.size()) {
-        if (i < ids.size() - 1 && ids[i] == pair.first && ids[i + 1] == pair.second) {
-            newids.push_back(idx);
-            i += 2;
-        } else {
-            newids.push_back(ids[i]);
-            i += 1;
-        }
-    }
-    return newids;
-}
+std::string Tokenizer::foo() { return "foo"; }
 
 } // namespace core
 } // namespace pytokenizer
\ No newline at end of file
diff --git a/src/pytokenizer/python_implementation.py b/src/pytokenizer/python_implementation.py
deleted file mode 100644
index 3db1734..0000000
--- a/src/pytokenizer/python_implementation.py
+++ /dev/null
@@ -1,78 +0,0 @@
-# sample text
-text = "Hello, world! This is a test sentence for a tokenizer. It includes numbers like 123.45, special characters (@#$), and a contraction (don't)."
-
-tokens = text.encode("utf-8")
-tokens = list(map(int, tokens))
-
-def get_stats(ids):
-    counts = {}
-    for pair in zip(ids, ids[1:]):
-        counts[pair] = counts.get(pair, 0) + 1
-    return counts
-
-stats = get_stats(tokens) # {(72, 101): 1, ...}
-
-top_pair = max(stats, key = stats.get)
-
-def merge(ids, pair, idx):
-    # in new list of ints (ids), replace all consecutive occurences of pair with the new token idx
-    newids = []
-    i = 0
-    while i < len(ids):
-        if i < len(ids) - 1 and ids[i] == pair[0] and ids[i+1] == pair[1]:
-            newids.append(idx)
-            i += 2
-        else:
-            newids.append(ids[i])
-            i += 1
-    return newids
-
-tokens2 = merge(tokens, top_pair, 256)
-
-#print(stats)
-#print(top_pair)
-print(len(tokens))
-print(len(tokens2))
-#print(merge([5, 6, 6, 7, 9, 1], (6, 7), 99))
-
-
-
-# ---- Tokenization ----
-ids = list(tokens) # copy so don't destroy original list
-
-vocab_size = 276
-num_merges = vocab_size - 256
-
-merges = {}
-for i in range(num_merges):
-    stats = get_stats(ids)
-    pair = max(stats, key=stats.get)
-    idx = 256 + i
-    print(f"merging {pair} into a new token {idx}")
-    ids = merge(ids, pair, idx)
-    merges[pair] = idx
-
-print(len(tokens) / len(ids))
-
-vocab = {idx: bytes([idx]) for idx in range(256)}
-for (p0, p1), idx in merges.items():
-    vocab[idx] = vocab[p0] + vocab[p1]
-
-def decode(ids):
-    # give ids (list of integers), return Python string
-    tokens = b"".join(vocab[idx] for idx in ids)
-    text = tokens.decode("utf-8", errors='replace')
-    return text
-
-def encode(text):
-    # given a string, return list of integers (the token)
-    tokens = list(text.encode("utf-8"))
-    while len(tokens) >= 2:
-        stats = get_stats(tokens)
-        pair = min(stats, key=lambda p: merges.get(p, float("inf")))
-        if pair not in merges:
-            break
-        idx = merges[pair]
-        tokens = merge(tokens, pair, idx)
-    return tokens
-
-print(decode(encode("HELLO hello world!")))
\ No newline at end of file
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 6c10037..6a788fc 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -1,7 +1,27 @@
 project(tests LANGUAGES CXX)
 
+# FETCH CONTENT ---------------------------------
+include(FetchContent)
+set(FETCHCONTENT_TRY_FIND_PACKAGE TRUE)
+
+FetchContent_Declare(
+    googletest
+    GIT_REPOSITORY https://github.com/google/googletest.git
+    GIT_TAG v1.14.0
+    GIT_SHALLOW TRUE
+    FIND_PACKAGE_ARGS CONFIG
+)
+set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
+set(INSTALL_GTEST OFF CACHE BOOL "" FORCE)
+FetchContent_MakeAvailable(googletest)
+
 # EXECUTABLES ---------------------------------
+enable_testing()
+
 add_executable(playground playground.cpp)
 target_link_libraries(playground PRIVATE
     pytokenizer
 )
+
+include(GoogleTest)
+# gtest_discover_tests(_tests PROPERTIES LABELS "library_tests")
\ No newline at end of file
diff --git a/test/playground.cpp b/test/playground.cpp
index 7e57196..cad0388 100644
--- a/test/playground.cpp
+++ b/test/playground.cpp
@@ -1,33 +1,11 @@
 // main entry point for the program. Ideally in the future we'll setup a testing
 // framework like Gtest, but this will be okay for now just for quick testing.
-#include <algorithm>
 #include <iostream>
-#include <string>
-#include <vector>
 
 #include "libtokenizer/libtokenizer.hpp"
 
 int main() {
-    using namespace pytokenizer::core;
-
-    std::string text = "Wow, C++ is so hard!";
-    std::vector<int> ids(text.begin(), text.end());
-
-    int next_id = 256;
-    for (int i = 0; i < 3; ++i) {
-        auto stats = Tokenizer::get_stats(ids);
-        auto best = std::max_element(stats.begin(), stats.end(),
-            [](const auto& a, const auto& b) { return a.second < b.second; });
-
-        std::cout << "Merging (" << best->first.first << ", " << best->first.second
-                  << ") -> " << next_id << " [freq=" << best->second << "]\n";
-
-        ids = Tokenizer::merge(ids, best->first, next_id++);
-    }
-
-    std::cout << "Final ids:";
-    for (int id : ids) std::cout << " " << id;
-    std::cout << "\n";
-
+    pytokenizer::core::Tokenizer tokenizer;
+    std::cout << tokenizer.foo() << std::endl;
     return 0;
 }
\ No newline at end of file