From 08d2dac022d8bf14ab156809f5d0fb37e7e0dbad Mon Sep 17 00:00:00 2001
From: Markus8888888
Date: Sat, 21 Mar 2026 12:53:47 -0700
Subject: [PATCH] Revert "Markus c++"

---
 CMakeLists.txt                               |  1 +
 csrc/include/libtokenizer/core/tokenizer.hpp |  8 +-
 csrc/src/core/tokenizer.cpp                  | 30 +-------
 src/pytokenizer/python_implementation.py     | 78 --------------------
 test/CMakeLists.txt                          | 20 +++++
 test/playground.cpp                          | 26 +------
 6 files changed, 27 insertions(+), 136 deletions(-)
 delete mode 100644 src/pytokenizer/python_implementation.py

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e87ae89..faf37ee 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -23,6 +23,7 @@ target_include_directories(pytokenizer_compile_config
 add_subdirectory(csrc/src/core)
 add_subdirectory(csrc/bindings)
 add_subdirectory(test)
+add_subdirectory(benchmarks)
 
 add_library(pytokenizer INTERFACE)
 target_link_libraries(pytokenizer INTERFACE pytokenizer_core)
diff --git a/csrc/include/libtokenizer/core/tokenizer.hpp b/csrc/include/libtokenizer/core/tokenizer.hpp
index b70737d..3e93162 100644
--- a/csrc/include/libtokenizer/core/tokenizer.hpp
+++ b/csrc/include/libtokenizer/core/tokenizer.hpp
@@ -1,17 +1,13 @@
 #pragma once
 
-#include <map>
-#include <utility>
-#include <vector>
+#include <string>
 
 namespace pytokenizer {
 namespace core {
 
 class Tokenizer {
    public:
-    static std::map<std::pair<int, int>, int> get_stats(const std::vector<int>& ids);
-    static std::vector<int> merge(const std::vector<int>& ids, const std::pair<int, int>& pair, int idx);
-    void foo();
+    std::string foo();
 };
 
 } // namespace core
diff --git a/csrc/src/core/tokenizer.cpp b/csrc/src/core/tokenizer.cpp
index bd3d35d..3a23c7d 100644
--- a/csrc/src/core/tokenizer.cpp
+++ b/csrc/src/core/tokenizer.cpp
@@ -1,36 +1,10 @@
-#include <map>
+#include "libtokenizer/core/tokenizer.hpp"
 
 #include <string>
-#include <utility>
-#include <vector>
-
-#include "libtokenizer/libtokenizer.hpp"
 
 namespace pytokenizer {
 namespace core {
 
-std::map<std::pair<int, int>, int> Tokenizer::get_stats(const std::vector<int>& ids) {
-    std::map<std::pair<int, int>, int> counts;
-    for (size_t i = 0; i < ids.size() - 1; ++i) {
-        std::pair<int, int> pair = {ids[i], ids[i + 1]};
-        counts[pair]++;
-    }
-    return counts;
-}
-
-std::vector<int> Tokenizer::merge(const std::vector<int>& ids, const std::pair<int, int>& pair, int idx) {
-    std::vector<int> newids;
-    size_t i = 0;
-    while (i < ids.size()) {
-        if (i < ids.size() - 1 && ids[i] == pair.first && ids[i + 1] == pair.second) {
-            newids.push_back(idx);
-            i += 2;
-        } else {
-            newids.push_back(ids[i]);
-            i += 1;
-        }
-    }
-    return newids;
-}
+std::string Tokenizer::foo() { return "foo"; }
 
 } // namespace core
 } // namespace pytokenizer
\ No newline at end of file
diff --git a/src/pytokenizer/python_implementation.py b/src/pytokenizer/python_implementation.py
deleted file mode 100644
index 3db1734..0000000
--- a/src/pytokenizer/python_implementation.py
+++ /dev/null
@@ -1,78 +0,0 @@
-# sample text
-text = "Hello, world! This is a test sentence for a tokenizer. It includes numbers like 123.45, special characters (@#$), and a contraction (don't)."
-
-tokens = text.encode("utf-8")
-tokens = list(map(int, tokens))
-
-def get_stats(ids):
-    counts = {}
-    for pair in zip(ids, ids[1:]):
-        counts[pair] = counts.get(pair, 0) + 1
-    return counts
-
-stats = get_stats(tokens) # {(72, 101): 1, ...}
-
-top_pair = max(stats, key = stats.get)
-
-def merge(ids, pair, idx):
-    # in new list of ints (ids), replace all consecutive occurences of pair with the new token idx
-    newids = []
-    i = 0
-    while i < len(ids):
-        if i < len(ids) - 1 and ids[i] == pair[0] and ids[i+1] == pair[1]:
-            newids.append(idx)
-            i += 2
-        else:
-            newids.append(ids[i])
-            i += 1
-    return newids
-
-tokens2 = merge(tokens, top_pair, 256)
-
-#print(stats)
-#print(top_pair)
-print(len(tokens))
-print(len(tokens2))
-#print(merge([5, 6, 6, 7, 9, 1], (6, 7), 99))
-
-
-
-# ---- Tokenization ----
-ids = list(tokens) # copy so don't destroy original list
-
-vocab_size = 276
-num_merges = vocab_size - 256
-
-merges = {}
-for i in range(num_merges):
-    stats = get_stats(ids)
-    pair = max(stats, key=stats.get)
-    idx = 256 + i
-    print(f"merging {pair} into a new token {idx}")
-    ids = merge(ids, pair, idx)
-    merges[pair] = idx
-
-print(len(tokens) / len(ids))
-
-vocab = {idx: bytes([idx]) for idx in range(256)}
-for (p0, p1), idx in merges.items():
-    vocab[idx] = vocab[p0] + vocab[p1]
-
-def decode(ids):
-    # give ids (list of integers), return Python string
-    tokens = b"".join(vocab[idx] for idx in ids)
-    text = tokens.decode("utf-8", errors='replace')
-    return text
-
-def encode(text):
-    # given a string, return list of integers (the token)
-    tokens = list(text.encode("utf-8"))
-    while len(tokens) >= 2:
-        stats = get_stats(tokens)
-        pair = min(stats, key=lambda p: merges.get(p, float("inf")))
-        if pair not in merges:
-            break
-        idx = merges[pair]
-        tokens = merge(tokens, pair, idx)
-    return tokens
-
-print(decode(encode("HELLO hello world!")))
\ No newline at end of file
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 6c10037..6a788fc 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -1,7 +1,27 @@
 project(tests LANGUAGES CXX)
 
+# FETCH CONTENT ---------------------------------
+include(FetchContent)
+set(FETCHCONTENT_TRY_FIND_PACKAGE TRUE)
+
+FetchContent_Declare(
+    googletest
+    GIT_REPOSITORY https://github.com/google/googletest.git
+    GIT_TAG v1.14.0
+    GIT_SHALLOW TRUE
+    FIND_PACKAGE_ARGS CONFIG
+)
+set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
+set(INSTALL_GTEST OFF CACHE BOOL "" FORCE)
+FetchContent_MakeAvailable(googletest)
+
 # EXECUTABLES ---------------------------------
+enable_testing()
+
 add_executable(playground playground.cpp)
 target_link_libraries(playground PRIVATE
     pytokenizer
 )
+
+include(GoogleTest)
+# gtest_discover_tests(_tests PROPERTIES LABELS "library_tests")
\ No newline at end of file
diff --git a/test/playground.cpp b/test/playground.cpp
index 7e57196..cad0388 100644
--- a/test/playground.cpp
+++ b/test/playground.cpp
@@ -1,33 +1,11 @@
 // main entry point for the program. Ideally in the future we'll setup a testing
 // framework like Gtest, but this will be okay for now just for quick testing.
-#include <algorithm>
 #include <iostream>
-#include <string>
-#include <vector>
 
 #include "libtokenizer/libtokenizer.hpp"
 
 int main() {
-    using namespace pytokenizer::core;
-
-    std::string text = "Wow, C++ is so hard!";
-    std::vector<int> ids(text.begin(), text.end());
-
-    int next_id = 256;
-    for (int i = 0; i < 3; ++i) {
-        auto stats = Tokenizer::get_stats(ids);
-        auto best = std::max_element(stats.begin(), stats.end(),
-            [](const auto& a, const auto& b) { return a.second < b.second; });
-
-        std::cout << "Merging (" << best->first.first << ", " << best->first.second
-                  << ") -> " << next_id << " [freq=" << best->second << "]\n";
-
-        ids = Tokenizer::merge(ids, best->first, next_id++);
-    }
-
-    std::cout << "Final ids:";
-    for (int id : ids) std::cout << " " << id;
-    std::cout << "\n";
-
+    pytokenizer::core::Tokenizer tokenizer;
+    std::cout << tokenizer.foo() << std::endl;
     return 0;
 }
\ No newline at end of file