diff --git a/CMakeLists.txt b/CMakeLists.txt
index faf37ee..e87ae89 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -23,7 +23,6 @@ target_include_directories(pytokenizer_compile_config
 add_subdirectory(csrc/src/core)
 add_subdirectory(csrc/bindings)
 add_subdirectory(test)
-add_subdirectory(benchmarks)
 
 add_library(pytokenizer INTERFACE)
 target_link_libraries(pytokenizer INTERFACE pytokenizer_core)
diff --git a/csrc/include/libtokenizer/core/tokenizer.hpp b/csrc/include/libtokenizer/core/tokenizer.hpp
index 3e93162..b70737d 100644
--- a/csrc/include/libtokenizer/core/tokenizer.hpp
+++ b/csrc/include/libtokenizer/core/tokenizer.hpp
@@ -1,13 +1,17 @@
 #pragma once
 
-#include <string>
+#include <map>
+#include <utility>
+#include <vector>
 
 namespace pytokenizer {
 namespace core {
 
 class Tokenizer {
  public:
-  std::string foo();
+  static std::map<std::pair<int, int>, int> get_stats(const std::vector<int>& ids);
+  static std::vector<int> merge(const std::vector<int>& ids, const std::pair<int, int>& pair, int idx);
+  void foo();
 };
 
 } // namespace core
diff --git a/csrc/src/core/tokenizer.cpp b/csrc/src/core/tokenizer.cpp
index 3a23c7d..bd3d35d 100644
--- a/csrc/src/core/tokenizer.cpp
+++ b/csrc/src/core/tokenizer.cpp
@@ -1,10 +1,36 @@
-#include "libtokenizer/core/tokenizer.hpp"
+#include <map>
 #include <string>
+#include <utility>
+#include <vector>
+#include "libtokenizer/libtokenizer.hpp"
 
 namespace pytokenizer {
 namespace core {
 
-std::string Tokenizer::foo() { return "foo"; }
+std::map<std::pair<int, int>, int> Tokenizer::get_stats(const std::vector<int>& ids) {
+  std::map<std::pair<int, int>, int> counts;
+  for (size_t i = 0; i < ids.size() - 1; ++i) {
+    std::pair<int, int> pair = {ids[i], ids[i + 1]};
+    counts[pair]++;
+  }
+  return counts;
+}
+
+std::vector<int> Tokenizer::merge(const std::vector<int>& ids, const std::pair<int, int>& pair, int idx) {
+  std::vector<int> newids;
+  size_t i = 0;
+  while (i < ids.size()) {
+    if (i < ids.size() - 1 && ids[i] == pair.first && ids[i + 1] == pair.second) {
+      newids.push_back(idx);
+      i += 2;
+    } else {
+      newids.push_back(ids[i]);
+      i += 1;
+    }
+  }
+  return newids;
+}
 
 } // namespace core
 } // namespace pytokenizer
\ No newline at end of file
diff --git a/src/pytokenizer/python_implementation.py b/src/pytokenizer/python_implementation.py
new file mode 100644
index 0000000..3db1734
--- /dev/null
+++ b/src/pytokenizer/python_implementation.py
@@ -0,0 +1,78 @@
+# sample text
+text = "Hello, world! This is a test sentence for a tokenizer. It includes numbers like 123.45, special characters (@#$), and a contraction (don't)."
+
+tokens = text.encode("utf-8")
+tokens = list(map(int, tokens))
+
+def get_stats(ids):
+    counts = {}
+    for pair in zip(ids, ids[1:]):
+        counts[pair] = counts.get(pair, 0) + 1
+    return counts
+
+stats = get_stats(tokens) # {(72, 101): 1, ...}
+
+top_pair = max(stats, key = stats.get)
+
+def merge(ids, pair, idx):
+    # in new list of ints (ids), replace all consecutive occurences of pair with the new token idx
+    newids = []
+    i = 0
+    while i < len(ids):
+        if i < len(ids) - 1 and ids[i] == pair[0] and ids[i+1] == pair[1]:
+            newids.append(idx)
+            i += 2
+        else:
+            newids.append(ids[i])
+            i += 1
+    return newids
+
+tokens2 = merge(tokens, top_pair, 256)
+
+#print(stats)
+#print(top_pair)
+print(len(tokens))
+print(len(tokens2))
+#print(merge([5, 6, 6, 7, 9, 1], (6, 7), 99))
+
+
+
+# ---- Tokenization ----
+ids = list(tokens) # copy so don't destroy original list
+
+vocab_size = 276
+num_merges = vocab_size - 256
+
+merges = {}
+for i in range(num_merges):
+    stats = get_stats(ids)
+    pair = max(stats, key=stats.get)
+    idx = 256 + i
+    print(f"merging {pair} into a new token {idx}")
+    ids = merge(ids, pair, idx)
+    merges[pair] = idx
+
+print(len(tokens) / len(ids))
+
+vocab = {idx: bytes([idx]) for idx in range(256)}
+for (p0, p1), idx in merges.items():
+    vocab[idx] = vocab[p0] + vocab[p1]
+
+def decode(ids):
+    # give ids (list of integers), return Python string
+    tokens = b"".join(vocab[idx] for idx in ids)
+    text = tokens.decode("utf-8", errors='replace')
+    return text
+
+def encode(text):
+    # given a string, return list of integers (the token)
+    tokens = list(text.encode("utf-8"))
+    while len(tokens) >= 2:
+        stats = get_stats(tokens)
+        pair = min(stats, key=lambda p: merges.get(p, float("inf")))
+        if pair not in merges:
+            break
+        idx = merges[pair]
+        tokens = merge(tokens, pair, idx)
+    return tokens
+
+print(decode(encode("HELLO hello world!")))
\ No newline at end of file
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 6a788fc..6c10037 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -1,27 +1,7 @@
 project(tests LANGUAGES CXX)
 
-# FETCH CONTENT ---------------------------------
-include(FetchContent)
-set(FETCHCONTENT_TRY_FIND_PACKAGE TRUE)
-
-FetchContent_Declare(
-  googletest
-  GIT_REPOSITORY https://github.com/google/googletest.git
-  GIT_TAG v1.14.0
-  GIT_SHALLOW TRUE
-  FIND_PACKAGE_ARGS CONFIG
-)
-set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
-set(INSTALL_GTEST OFF CACHE BOOL "" FORCE)
-FetchContent_MakeAvailable(googletest)
-
 # EXECUTABLES ---------------------------------
-enable_testing()
-
 add_executable(playground playground.cpp)
 target_link_libraries(playground PRIVATE
   pytokenizer
 )
-
-include(GoogleTest)
-# gtest_discover_tests(_tests PROPERTIES LABELS "library_tests")
\ No newline at end of file
diff --git a/test/playground.cpp b/test/playground.cpp
index cad0388..7e57196 100644
--- a/test/playground.cpp
+++ b/test/playground.cpp
@@ -1,11 +1,33 @@
 // main entry point for the program. Ideally in the future we'll setup a testing
 // framework like Gtest, but this will be okay for now just for quick testing.
+#include <algorithm>
 #include <iostream>
+#include <string>
+#include <vector>
 
 #include "libtokenizer/libtokenizer.hpp"
 
 int main() {
-  pytokenizer::core::Tokenizer tokenizer;
-  std::cout << tokenizer.foo() << std::endl;
+  using namespace pytokenizer::core;
+
+  std::string text = "Wow, C++ is so hard!";
+  std::vector<int> ids(text.begin(), text.end());
+
+  int next_id = 256;
+  for (int i = 0; i < 3; ++i) {
+    auto stats = Tokenizer::get_stats(ids);
+    auto best = std::max_element(stats.begin(), stats.end(),
+                                 [](const auto& a, const auto& b) { return a.second < b.second; });
+
+    std::cout << "Merging (" << best->first.first << ", " << best->first.second
+              << ") -> " << next_id << " [freq=" << best->second << "]\n";
+
+    ids = Tokenizer::merge(ids, best->first, next_id++);
+  }
+
+  std::cout << "Final ids:";
+  for (int id : ids) std::cout << " " << id;
+  std::cout << "\n";
+
   return 0;
 }
\ No newline at end of file