Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ target_include_directories(pytokenizer_compile_config
add_subdirectory(csrc/src/core)
add_subdirectory(csrc/bindings)
add_subdirectory(test)
add_subdirectory(benchmarks)

add_library(pytokenizer INTERFACE)
target_link_libraries(pytokenizer INTERFACE pytokenizer_core)
Expand Down
8 changes: 6 additions & 2 deletions csrc/include/libtokenizer/core/tokenizer.hpp
Original file line number Diff line number Diff line change
@@ -1,13 +1,17 @@
#pragma once

#include <string>
#include <vector>
#include <map>
#include <utility>

namespace pytokenizer {
namespace core {

/// Stateless byte-pair-encoding (BPE) utilities.
///
/// Both operations are pure functions of their inputs, so they are exposed
/// as static members; no Tokenizer instance state is required.
class Tokenizer {
public:
    /// Count how often each adjacent pair (ids[i], ids[i+1]) occurs in `ids`.
    /// @param ids token-id sequence to scan (may be empty)
    /// @return map from (left, right) pair to its occurrence count
    static std::map<std::pair<int, int>, int> get_stats(const std::vector<int>& ids);

    /// Return a copy of `ids` in which every consecutive occurrence of
    /// `pair` is replaced by the single new token id `idx`.
    /// @param ids  input token-id sequence (not modified)
    /// @param pair adjacent (left, right) pair to collapse
    /// @param idx  replacement token id
    /// @return new sequence with the pair merged
    static std::vector<int> merge(const std::vector<int>& ids, const std::pair<int, int>& pair, int idx);

    // NOTE(review): no definition for this overload is visible in
    // tokenizer.cpp (the std::string-returning foo() was removed there);
    // calling it will fail at link time — confirm it is still needed.
    void foo();
};

} // namespace core
Expand Down
30 changes: 28 additions & 2 deletions csrc/src/core/tokenizer.cpp
Original file line number Diff line number Diff line change
@@ -1,10 +1,36 @@
#include "libtokenizer/core/tokenizer.hpp"
#include <algorithm>
#include <iostream>
#include <string>
#include <vector>
#include <map>
#include "libtokenizer/libtokenizer.hpp"

namespace pytokenizer {
namespace core {

std::string Tokenizer::foo() { return "foo"; }
std::map<std::pair<int, int>, int> Tokenizer::get_stats(const std::vector<int>& ids) {
std::map<std::pair<int, int>, int> counts;
for (size_t i = 0; i < ids.size() - 1; ++i) {
std::pair<int, int> pair = {ids[i], ids[i + 1]};
counts[pair]++;
}
return counts;
}

std::vector<int> Tokenizer::merge(const std::vector<int>& ids, const std::pair<int, int>& pair, int idx) {
std::vector<int> newids;
size_t i = 0;
while (i < ids.size()) {
if (i < ids.size() - 1 && ids[i] == pair.first && ids[i + 1] == pair.second) {
newids.push_back(idx);
i += 2;
} else {
newids.push_back(ids[i]);
i += 1;
}
}
return newids;
}

} // namespace core
} // namespace pytokenizer
78 changes: 78 additions & 0 deletions src/pytokenizer/python_implementation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# Sample text exercising several byte classes: ASCII letters, punctuation,
# digits, symbols, and an apostrophe.
text = "Hello, world! This is a test sentence for a tokenizer. It includes numbers like 123.45, special characters (@#$), and a contraction (don't)."

# The raw UTF-8 bytes of the text as a list of ints (0..255) — the initial
# token stream before any BPE merges are applied.
tokens = text.encode("utf-8")
tokens = list(map(int, tokens))

def get_stats(ids):
    """Count occurrences of each adjacent (ids[i], ids[i+1]) pair.

    Returns a dict mapping (left, right) -> count; empty for inputs
    with fewer than two elements.
    """
    counts = {}
    for left, right in zip(ids, ids[1:]):
        key = (left, right)
        if key in counts:
            counts[key] += 1
        else:
            counts[key] = 1
    return counts

# Pair-frequency table for the raw byte stream.
stats = get_stats(tokens) # {(72, 101): 1, ...}

# Most frequent adjacent pair — the first candidate for a BPE merge.
top_pair = max(stats, key = stats.get)

def merge(ids, pair, idx):
    """Return a copy of ids where every consecutive occurrence of `pair`
    is replaced by the single new token `idx`.

    The input list is not modified.
    """
    newids = []
    i = 0
    n = len(ids)
    while i < n:
        matches = i + 1 < n and ids[i] == pair[0] and ids[i + 1] == pair[1]
        if matches:
            # Consume both halves of the pair, emit the merged token.
            newids.append(idx)
            i += 2
        else:
            newids.append(ids[i])
            i += 1
    return newids

# Apply a single merge of the most frequent pair, using 256 as the first
# token id beyond the raw byte range.
tokens2 = merge(tokens, top_pair, 256)

#print(stats)
#print(top_pair)
# The merged stream should be shorter than the original byte stream.
print(len(tokens))
print(len(tokens2))
#print(merge([5, 6, 6, 7, 9, 1], (6, 7), 99))



# ---- Tokenization ----
# Train the BPE merge table: repeatedly replace the most frequent adjacent
# pair with a fresh token id, starting at 256 (just past the byte range).
ids = list(tokens) # copy so don't destroy original list

# Target vocabulary of 276 tokens: 256 raw bytes + 20 learned merges.
vocab_size = 276
num_merges = vocab_size - 256

# merges maps (left, right) token pair -> the new token id it merges into,
# recorded in the order the merges were learned.
merges = {}
for i in range(num_merges):
    stats = get_stats(ids)
    pair = max(stats, key=stats.get)
    idx = 256 + i
    print(f"merging {pair} into a new token {idx}")
    ids = merge(ids, pair, idx)
    merges[pair] = idx

# Compression ratio achieved by the learned merges (original length /
# merged length).
print(len(tokens) / len(ids))

# vocab maps token id -> the byte sequence it decodes to: the 256 single
# bytes, then each merged token as the concatenation of its two parts.
# Iterating `merges` in insertion order guarantees both parts already
# exist in vocab when a merged entry is built.
vocab = {idx: bytes([idx]) for idx in range(256)}
for (p0, p1), idx in merges.items():
    vocab[idx] = vocab[p0] + vocab[p1]
def decode(ids):
    """Map a list of token ids back to a Python string via the vocab table.

    Invalid UTF-8 byte sequences are substituted (errors='replace')
    rather than raising.
    """
    raw = b"".join(vocab[token] for token in ids)
    return raw.decode("utf-8", errors='replace')

def encode(text):
    """Map a string to a list of token ids by applying the learned merges.

    Greedily merges the pair with the lowest merge rank (earliest learned)
    first, mirroring the order used during training, until no pair in the
    stream appears in `merges`.
    """
    out = list(text.encode("utf-8"))
    while len(out) >= 2:
        pair_counts = get_stats(out)
        best = min(pair_counts, key=lambda p: merges.get(p, float("inf")))
        if best not in merges:
            # No remaining adjacent pair is mergeable.
            break
        out = merge(out, best, merges[best])
    return out

# Round-trip sanity check: text not seen during training should survive
# encode -> decode unchanged.
print(decode(encode("HELLO hello world!")))
20 changes: 0 additions & 20 deletions test/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,27 +1,7 @@
project(tests LANGUAGES CXX)

# EXECUTABLES ---------------------------------
# The merged diff view retained the deleted googletest FetchContent block;
# this PR removed it, leaving only the plain playground executable.
enable_testing()

add_executable(playground playground.cpp)
target_link_libraries(playground
PRIVATE pytokenizer
)

include(GoogleTest)
# gtest_discover_tests(_tests PROPERTIES LABELS "library_tests")
26 changes: 24 additions & 2 deletions test/playground.cpp
Original file line number Diff line number Diff line change
@@ -1,11 +1,33 @@
// main entry point for the program. Ideally in the future we'll setup a testing
// framework like Gtest, but this will be okay for now just for quick testing.

#include <algorithm>
#include <iostream>
#include <string>
#include <vector>
#include "libtokenizer/libtokenizer.hpp"

int main() {
    // NOTE: the old body (`Tokenizer tokenizer; tokenizer.foo();`) called
    // the removed foo() member and has been dropped.
    using namespace pytokenizer::core;

    // Seed the token ids with the raw character values of a sample string.
    std::string text = "Wow, C++ is so hard!";
    std::vector<int> ids(text.begin(), text.end());

    // Perform three BPE merge rounds, assigning new ids from 256 upward.
    int next_id = 256;
    for (int i = 0; i < 3; ++i) {
        auto stats = Tokenizer::get_stats(ids);
        // Defensive: max_element on an empty map would dereference end().
        if (stats.empty()) break;
        auto best = std::max_element(stats.begin(), stats.end(),
            [](const auto& a, const auto& b) { return a.second < b.second; });

        std::cout << "Merging (" << best->first.first << ", " << best->first.second
            << ") -> " << next_id << " [freq=" << best->second << "]\n";

        ids = Tokenizer::merge(ids, best->first, next_id++);
    }

    std::cout << "Final ids:";
    for (int id : ids) std::cout << " " << id;
    std::cout << "\n";

    return 0;
}
Loading