Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ target_include_directories(pytokenizer_compile_config
# Wire up the project's subcomponents.
add_subdirectory(csrc/src/core)   # core tokenizer implementation (pytokenizer_core)
add_subdirectory(csrc/bindings)   # Python bindings
add_subdirectory(test)            # C++ tests / playground executable
add_subdirectory(benchmarks)      # benchmark suite

# Umbrella interface target that consumers link against.
add_library(pytokenizer INTERFACE)
target_link_libraries(pytokenizer INTERFACE pytokenizer_core)
Expand Down
8 changes: 2 additions & 6 deletions csrc/include/libtokenizer/core/tokenizer.hpp
Original file line number Diff line number Diff line change
@@ -1,17 +1,13 @@
#pragma once

#include <vector>
#include <map>
#include <utility>
#include <string>

namespace pytokenizer {
namespace core {

// Byte-pair-merge style tokenizer utilities.
//
// Fix: the original declared both `void foo();` and `std::string foo();`.
// C++ overloads may not differ by return type alone, so the class was
// ill-formed. Only the std::string overload — the one defined in
// tokenizer.cpp — is kept.
class Tokenizer {
public:
    /// Count how often each adjacent (left, right) token pair occurs in `ids`.
    /// @param ids token-id sequence
    /// @return map from pair to its occurrence count
    static std::map<std::pair<int, int>, int> get_stats(const std::vector<int>& ids);

    /// Replace every adjacent occurrence of `pair` in `ids` with token `idx`.
    /// @param ids  token-id sequence
    /// @param pair adjacent pair to collapse
    /// @param idx  id of the merged token
    /// @return new sequence with the merges applied
    static std::vector<int> merge(const std::vector<int>& ids, const std::pair<int, int>& pair, int idx);

    /// Smoke-test hook; the definition in tokenizer.cpp returns "foo".
    std::string foo();
};

} // namespace core
Expand Down
30 changes: 2 additions & 28 deletions csrc/src/core/tokenizer.cpp
Original file line number Diff line number Diff line change
@@ -1,36 +1,10 @@
#include <algorithm>
#include "libtokenizer/core/tokenizer.hpp"
#include <iostream>
#include <string>
#include <vector>
#include <map>
#include "libtokenizer/libtokenizer.hpp"

namespace pytokenizer {
namespace core {

std::map<std::pair<int, int>, int> Tokenizer::get_stats(const std::vector<int>& ids) {
std::map<std::pair<int, int>, int> counts;
for (size_t i = 0; i < ids.size() - 1; ++i) {
std::pair<int, int> pair = {ids[i], ids[i + 1]};
counts[pair]++;
}
return counts;
}

std::vector<int> Tokenizer::merge(const std::vector<int>& ids, const std::pair<int, int>& pair, int idx) {
std::vector<int> newids;
size_t i = 0;
while (i < ids.size()) {
if (i < ids.size() - 1 && ids[i] == pair.first && ids[i + 1] == pair.second) {
newids.push_back(idx);
i += 2;
} else {
newids.push_back(ids[i]);
i += 1;
}
}
return newids;
}
// Smoke-test hook used by the playground; always yields "foo".
std::string Tokenizer::foo() {
    std::string value{"foo"};
    return value;
}

} // namespace core
} // namespace pytokenizer
78 changes: 0 additions & 78 deletions src/pytokenizer/python_implementation.py

This file was deleted.

20 changes: 20 additions & 0 deletions test/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,27 @@
project(tests LANGUAGES CXX)

# FETCH CONTENT ---------------------------------
# Pull in GoogleTest, preferring an already-installed package over a clone.
include(FetchContent)
set(FETCHCONTENT_TRY_FIND_PACKAGE TRUE)

FetchContent_Declare(
googletest
GIT_REPOSITORY https://github.com/google/googletest.git
GIT_TAG v1.14.0
GIT_SHALLOW TRUE
FIND_PACKAGE_ARGS CONFIG
)
# Use the shared CRT on MSVC so gtest matches the rest of the build.
set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
# Don't install gtest alongside the project.
set(INSTALL_GTEST OFF CACHE BOOL "" FORCE)
FetchContent_MakeAvailable(googletest)

# EXECUTABLES ---------------------------------
enable_testing()

# Manual playground binary; links the umbrella pytokenizer target.
add_executable(playground playground.cpp)
target_link_libraries(playground
PRIVATE pytokenizer
)

include(GoogleTest)
# NOTE(review): googletest is fetched above but no target links it yet —
# confirm whether a gtest-based test executable is still planned.
# gtest_discover_tests(_tests PROPERTIES LABELS "library_tests")
26 changes: 2 additions & 24 deletions test/playground.cpp
Original file line number Diff line number Diff line change
@@ -1,33 +1,11 @@
// main entry point for the program. Ideally in the future we'll setup a testing
// framework like Gtest, but this will be okay for now just for quick testing.

#include <algorithm>
#include <iostream>
#include <string>
#include <vector>
#include "libtokenizer/libtokenizer.hpp"

// Quick manual exercise of the tokenizer: run three BPE-style merge
// rounds over a sample string, then print the resulting ids and the
// smoke-test hook's output.
int main() {
    using namespace pytokenizer::core;

    std::string text = "Wow, C++ is so hard!";
    std::vector<int> ids(text.begin(), text.end());

    int next_id = 256;  // first id beyond the raw byte range
    for (int round = 0; round < 3; ++round) {
        auto stats = Tokenizer::get_stats(ids);
        // Fix: the original dereferenced max_element's result
        // unconditionally; with fewer than two ids `stats` is empty and
        // `best == stats.end()`, making `best->` undefined behavior.
        if (stats.empty()) break;
        auto best = std::max_element(stats.begin(), stats.end(),
            [](const auto& a, const auto& b) { return a.second < b.second; });

        std::cout << "Merging (" << best->first.first << ", " << best->first.second
                  << ") -> " << next_id << " [freq=" << best->second << "]\n";

        ids = Tokenizer::merge(ids, best->first, next_id++);
    }

    std::cout << "Final ids:";
    for (int id : ids) std::cout << " " << id;
    std::cout << "\n";

    pytokenizer::core::Tokenizer tokenizer;
    std::cout << tokenizer.foo() << std::endl;
    return 0;
}
Loading