From 5665310ba98a1f42a6dc7a3558f4536fea7b079d Mon Sep 17 00:00:00 2001 From: Neil Lin Date: Sat, 7 Mar 2026 12:02:45 -0800 Subject: [PATCH 1/3] code now compiles, set up interface + compile options --- CMakeLists.txt | 18 +++++++++++++++++- csrc/include/libtokenizer/core/CMakeLists.txt | 0 csrc/include/libtokenizer/core/tokenizer.hpp | 12 ++++++++++++ csrc/include/libtokenizer/libtokenizer.hpp | 4 +++- csrc/src/core/CMakeLists.txt | 11 +++++++++++ csrc/src/core/tokenizer.cpp | 12 ++++++++++++ test/CMakeLists.txt | 7 +++++++ test/playground.cpp | 11 +++++++++++ 8 files changed, 73 insertions(+), 2 deletions(-) delete mode 100644 csrc/include/libtokenizer/core/CMakeLists.txt create mode 100644 csrc/src/core/CMakeLists.txt create mode 100644 test/CMakeLists.txt create mode 100644 test/playground.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 9050ec8..9d376a7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,8 +7,24 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON) # CORE TARGETS --------------------------------- -add_subdirectory(csrc/src/) +add_library(pytokenizer_compile_config INTERFACE) +target_compile_options(pytokenizer_compile_config + INTERFACE + $<$<CONFIG:Release>:-march=native;-ffast-math> # also compiles with -O3 -DNDEBUG + $<$<CONFIG:RelWithDebInfo>:-march=native;-ffast-math> # -g +) +target_include_directories(pytokenizer_compile_config + INTERFACE + $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/csrc/include> + $<INSTALL_INTERFACE:include> +) + +add_subdirectory(csrc/src/core) add_subdirectory(csrc/bindings) +add_subdirectory(test) + +add_library(pytokenizer INTERFACE) +target_link_libraries(pytokenizer INTERFACE pytokenizer_core) # FORMATTING --------------------------------- diff --git a/csrc/include/libtokenizer/core/CMakeLists.txt b/csrc/include/libtokenizer/core/CMakeLists.txt deleted file mode 100644 index e69de29..0000000 diff --git a/csrc/include/libtokenizer/core/tokenizer.hpp b/csrc/include/libtokenizer/core/tokenizer.hpp index e69de29..de2b30e 100644 --- a/csrc/include/libtokenizer/core/tokenizer.hpp +++ b/csrc/include/libtokenizer/core/tokenizer.hpp @@ -0,0 
+1,12 @@ +#pragma once + +namespace pytokenizer { +namespace core { + +class Tokenizer { +public: + void foo(); +}; + +} // namespace core +} // namespace pytokenizer \ No newline at end of file diff --git a/csrc/include/libtokenizer/libtokenizer.hpp b/csrc/include/libtokenizer/libtokenizer.hpp index 7b9637e..dbcb5c7 100644 --- a/csrc/include/libtokenizer/libtokenizer.hpp +++ b/csrc/include/libtokenizer/libtokenizer.hpp @@ -1 +1,3 @@ -#pragma once \ No newline at end of file +#pragma once + +#include "core/tokenizer.hpp" \ No newline at end of file diff --git a/csrc/src/core/CMakeLists.txt b/csrc/src/core/CMakeLists.txt new file mode 100644 index 0000000..5fbd5bf --- /dev/null +++ b/csrc/src/core/CMakeLists.txt @@ -0,0 +1,11 @@ +project(pytokenizer_core LANGUAGES CXX) + +set(PYTOKENIZER_SOURCES + tokenizer.cpp +) + +add_library(pytokenizer_core SHARED ${PYTOKENIZER_SOURCES}) + +target_link_libraries(pytokenizer_core + PUBLIC pytokenizer_compile_config +) \ No newline at end of file diff --git a/csrc/src/core/tokenizer.cpp b/csrc/src/core/tokenizer.cpp index e69de29..43581d8 100644 --- a/csrc/src/core/tokenizer.cpp +++ b/csrc/src/core/tokenizer.cpp @@ -0,0 +1,12 @@ +#include "libtokenizer/core/tokenizer.hpp" +#include <iostream> + +namespace pytokenizer { +namespace core { + +void Tokenizer::foo() { + std::cout << "foo" << std::endl; +} + +} // namespace core +} // namespace pytokenizer \ No newline at end of file diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt new file mode 100644 index 0000000..d61a005 --- /dev/null +++ b/test/CMakeLists.txt @@ -0,0 +1,7 @@ +project(tests LANGUAGES CXX) + +# EXECUTABLES --------------------------------- +add_executable(playground playground.cpp) +target_link_libraries(playground + PRIVATE pytokenizer +) \ No newline at end of file diff --git a/test/playground.cpp b/test/playground.cpp new file mode 100644 index 0000000..0c8b276 --- /dev/null +++ b/test/playground.cpp @@ -0,0 +1,11 @@ +// main entry point for the program. 
Ideally in the future we'll setup a testing +// framework like Gtest, but this will be okay for now just for quick testing. + +#include <iostream> +#include "libtokenizer/libtokenizer.hpp" + +int main() { + pytokenizer::core::Tokenizer tokenizer; + tokenizer.foo(); + return 0; +} \ No newline at end of file From 0cf7f0f740cc1f692a59bcb49179a8c11164f66d Mon Sep 17 00:00:00 2001 From: Neil Lin Date: Sat, 7 Mar 2026 12:08:56 -0800 Subject: [PATCH 2/3] linting workflow --- .github/workflows/lint.yml | 33 +++++++++++++++++++++++++++++++++ csrc/src/core/tokenizer.cpp | 4 +--- 2 files changed, 34 insertions(+), 3 deletions(-) create mode 100644 .github/workflows/lint.yml diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 0000000..4ec3968 --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,33 @@ +name: lint + +on: + push: + branches: ["main"] + pull_request: + branches: ["main"] + +jobs: + greet: + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Setup system deps + run: | + sudo apt-get update + sudo apt-get install -y \ + build-essential \ + cmake \ + git + + - name: Build CMake project + run: cmake -S . 
-B build -DCMAKE_BUILD_TYPE=Debug + + - name: Run C++ formatting + run: | + cmake --build build --target format + git diff --exit-code + + - name: Run C++ linting + run: cmake --build build --target lint diff --git a/csrc/src/core/tokenizer.cpp b/csrc/src/core/tokenizer.cpp index 43581d8..72e3b5d 100644 --- a/csrc/src/core/tokenizer.cpp +++ b/csrc/src/core/tokenizer.cpp @@ -4,9 +4,7 @@ namespace pytokenizer { namespace core { -void Tokenizer::foo() { - std::cout << "foo" << std::endl; -} +void Tokenizer::foo() { std::cout << "foo" << std::endl; } } // namespace core } // namespace pytokenizer \ No newline at end of file From d8b6024518f75ee617d0f18e466be19e4d226cc4 Mon Sep 17 00:00:00 2001 From: Markus Chu Date: Sat, 21 Mar 2026 12:28:29 -0700 Subject: [PATCH 3/3] Python implementation of BPE --- csrc/include/libtokenizer/core/tokenizer.hpp | 6 ++ csrc/src/core/tokenizer.cpp | 30 +++++++- src/pytokenizer/python_implementation.py | 78 ++++++++++++++++++++ test/playground.cpp | 26 ++++++- 4 files changed, 136 insertions(+), 4 deletions(-) create mode 100644 src/pytokenizer/python_implementation.py diff --git a/csrc/include/libtokenizer/core/tokenizer.hpp b/csrc/include/libtokenizer/core/tokenizer.hpp index de2b30e..b70737d 100644 --- a/csrc/include/libtokenizer/core/tokenizer.hpp +++ b/csrc/include/libtokenizer/core/tokenizer.hpp @@ -1,10 +1,16 @@ #pragma once +#include <map> +#include <utility> +#include <vector> + namespace pytokenizer { namespace core { class Tokenizer { public: + static std::map<std::pair<int, int>, int> get_stats(const std::vector<int>& ids); + static std::vector<int> merge(const std::vector<int>& ids, const std::pair<int, int>& pair, int idx); void foo(); }; diff --git a/csrc/src/core/tokenizer.cpp b/csrc/src/core/tokenizer.cpp index 72e3b5d..bd3d35d 100644 --- a/csrc/src/core/tokenizer.cpp +++ b/csrc/src/core/tokenizer.cpp @@ -1,10 +1,36 @@ -#include "libtokenizer/core/tokenizer.hpp" +#include <map> #include <iostream> +#include <utility> +#include <vector> +#include "libtokenizer/libtokenizer.hpp" namespace pytokenizer { namespace core 
{ -void Tokenizer::foo() { std::cout << "foo" << std::endl; } +std::map<std::pair<int, int>, int> Tokenizer::get_stats(const std::vector<int>& ids) { + std::map<std::pair<int, int>, int> counts; + for (size_t i = 0; i < ids.size() - 1; ++i) { + std::pair<int, int> pair = {ids[i], ids[i + 1]}; + counts[pair]++; + } + return counts; +} + +std::vector<int> Tokenizer::merge(const std::vector<int>& ids, const std::pair<int, int>& pair, int idx) { + std::vector<int> newids; + size_t i = 0; + while (i < ids.size()) { + if (i < ids.size() - 1 && ids[i] == pair.first && ids[i + 1] == pair.second) { + newids.push_back(idx); + i += 2; + } else { + newids.push_back(ids[i]); + i += 1; + } + } + return newids; +} } // namespace core } // namespace pytokenizer \ No newline at end of file diff --git a/src/pytokenizer/python_implementation.py b/src/pytokenizer/python_implementation.py new file mode 100644 index 0000000..3db1734 --- /dev/null +++ b/src/pytokenizer/python_implementation.py @@ -0,0 +1,78 @@ +# sample text +text = "Hello, world! This is a test sentence for a tokenizer. It includes numbers like 123.45, special characters (@#$), and a contraction (don't)." 
+ +tokens = text.encode("utf-8") +tokens = list(map(int, tokens)) + +def get_stats(ids): + counts = {} + for pair in zip(ids, ids[1:]): + counts[pair] = counts.get(pair, 0) + 1 + return counts + +stats = get_stats(tokens) # {(72, 101): 1, ...} + +top_pair = max(stats, key = stats.get) + +def merge(ids, pair, idx): + # in new list of ints (ids), replace all consecutive occurrences of pair with the new token idx + newids = [] + i = 0 + while i < len(ids): + if i < len(ids) - 1 and ids[i] == pair[0] and ids[i+1] == pair[1]: + newids.append(idx) + i += 2 + else: + newids.append(ids[i]) + i += 1 + return newids + +tokens2 = merge(tokens, top_pair, 256) + +#print(stats) +#print(top_pair) +print(len(tokens)) +print(len(tokens2)) +#print(merge([5, 6, 6, 7, 9, 1], (6, 7), 99)) + + + +# ---- Tokenization ---- +ids = list(tokens) # copy so don't destroy original list + +vocab_size = 276 +num_merges = vocab_size - 256 + +merges = {} +for i in range(num_merges): + stats = get_stats(ids) + pair = max(stats, key=stats.get) + idx = 256 + i + print(f"merging {pair} into a new token {idx}") + ids = merge(ids, pair, idx) + merges[pair] = idx + +print(len(tokens) / len(ids)) + +vocab = {idx: bytes([idx]) for idx in range(256)} +for (p0, p1), idx in merges.items(): + vocab[idx] = vocab[p0] + vocab[p1] +def decode(ids): + # given ids (list of integers), return Python string + tokens = b"".join(vocab[idx] for idx in ids) + text = tokens.decode("utf-8", errors='replace') + return text + +def encode(text): + # given a string, return list of integers (the token) + tokens = list(text.encode("utf-8")) + while len(tokens) >= 2: + stats = get_stats(tokens) + pair = min(stats, key=lambda p: merges.get(p, float("inf"))) + if pair not in merges: + break + idx = merges[pair] + tokens = merge(tokens, pair, idx) + return tokens + +print(decode(encode("HELLO hello world!"))) \ No newline at end of file diff --git a/test/playground.cpp b/test/playground.cpp index 0c8b276..7e57196 100644 --- 
a/test/playground.cpp +++ b/test/playground.cpp @@ -1,11 +1,33 @@ // main entry point for the program. Ideally in the future we'll setup a testing // framework like Gtest, but this will be okay for now just for quick testing. +#include <algorithm> #include <iostream> +#include <string> +#include <vector> #include "libtokenizer/libtokenizer.hpp" int main() { - pytokenizer::core::Tokenizer tokenizer; - tokenizer.foo(); + using namespace pytokenizer::core; + + std::string text = "Wow, C++ is so hard!"; + std::vector<int> ids(text.begin(), text.end()); + + int next_id = 256; + for (int i = 0; i < 3; ++i) { + auto stats = Tokenizer::get_stats(ids); + auto best = std::max_element(stats.begin(), stats.end(), + [](const auto& a, const auto& b) { return a.second < b.second; }); + + std::cout << "Merging (" << best->first.first << ", " << best->first.second + << ") -> " << next_id << " [freq=" << best->second << "]\n"; + + ids = Tokenizer::merge(ids, best->first, next_id++); + } + + std::cout << "Final ids:"; + for (int id : ids) std::cout << " " << id; + std::cout << "\n"; + return 0; } \ No newline at end of file