Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ target_include_directories(pytokenizer_compile_config
# Wire up the project's subcomponents.
add_subdirectory(csrc/src/core)   # core tokenizer implementation (pytokenizer_core)
add_subdirectory(csrc/bindings)   # Python bindings
add_subdirectory(test)            # C++ tests / playground executable
add_subdirectory(benchmarks)      # benchmark suite

# Umbrella interface target that consumers link against.
add_library(pytokenizer INTERFACE)
target_link_libraries(pytokenizer INTERFACE pytokenizer_core)
Expand Down
8 changes: 2 additions & 6 deletions csrc/include/libtokenizer/core/tokenizer.hpp
Original file line number Diff line number Diff line change
@@ -1,17 +1,13 @@
#pragma once

#include <vector>
#include <map>
#include <utility>
#include <string>

namespace pytokenizer {
namespace core {

// Byte-pair-merge style tokenizer utilities.
//
// Fix: the original declared both `void foo();` and `std::string foo();`.
// C++ overloads may not differ by return type alone, so the class was
// ill-formed. Only the std::string overload — the one defined in
// tokenizer.cpp — is kept.
class Tokenizer {
public:
    /// Count how often each adjacent (left, right) token pair occurs in `ids`.
    /// @param ids token-id sequence
    /// @return map from pair to its occurrence count
    static std::map<std::pair<int, int>, int> get_stats(const std::vector<int>& ids);

    /// Replace every adjacent occurrence of `pair` in `ids` with token `idx`.
    /// @param ids  token-id sequence
    /// @param pair adjacent pair to collapse
    /// @param idx  id of the merged token
    /// @return new sequence with the merges applied
    static std::vector<int> merge(const std::vector<int>& ids, const std::pair<int, int>& pair, int idx);

    /// Smoke-test hook; the definition in tokenizer.cpp returns "foo".
    std::string foo();
};

} // namespace core
Expand Down
30 changes: 2 additions & 28 deletions csrc/src/core/tokenizer.cpp
Original file line number Diff line number Diff line change
@@ -1,36 +1,10 @@
#include <algorithm>
#include "libtokenizer/core/tokenizer.hpp"
#include <iostream>
#include <string>
#include <vector>
#include <map>
#include "libtokenizer/libtokenizer.hpp"

namespace pytokenizer {
namespace core {

std::map<std::pair<int, int>, int> Tokenizer::get_stats(const std::vector<int>& ids) {
std::map<std::pair<int, int>, int> counts;
for (size_t i = 0; i < ids.size() - 1; ++i) {
std::pair<int, int> pair = {ids[i], ids[i + 1]};
counts[pair]++;
}
return counts;
}

std::vector<int> Tokenizer::merge(const std::vector<int>& ids, const std::pair<int, int>& pair, int idx) {
std::vector<int> newids;
size_t i = 0;
while (i < ids.size()) {
if (i < ids.size() - 1 && ids[i] == pair.first && ids[i + 1] == pair.second) {
newids.push_back(idx);
i += 2;
} else {
newids.push_back(ids[i]);
i += 1;
}
}
return newids;
}
// Smoke-test hook used by the playground; always yields "foo".
std::string Tokenizer::foo() {
    std::string value{"foo"};
    return value;
}

} // namespace core
} // namespace pytokenizer
78 changes: 0 additions & 78 deletions src/pytokenizer/python_implementation.py

This file was deleted.

20 changes: 20 additions & 0 deletions test/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,27 @@
project(tests LANGUAGES CXX)

# FETCH CONTENT ---------------------------------
# Pull in GoogleTest, preferring an already-installed package over a clone.
include(FetchContent)
set(FETCHCONTENT_TRY_FIND_PACKAGE TRUE)

FetchContent_Declare(
googletest
GIT_REPOSITORY https://github.com/google/googletest.git
GIT_TAG v1.14.0
GIT_SHALLOW TRUE
FIND_PACKAGE_ARGS CONFIG
)
# Use the shared CRT on MSVC so gtest matches the rest of the build.
set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
# Don't install gtest alongside the project.
set(INSTALL_GTEST OFF CACHE BOOL "" FORCE)
FetchContent_MakeAvailable(googletest)

# EXECUTABLES ---------------------------------
enable_testing()

# Manual playground binary; links the umbrella pytokenizer target.
add_executable(playground playground.cpp)
target_link_libraries(playground
PRIVATE pytokenizer
)

include(GoogleTest)
# NOTE(review): googletest is fetched above but no target links it yet —
# confirm whether a gtest-based test executable is still planned.
# gtest_discover_tests(_tests PROPERTIES LABELS "library_tests")
26 changes: 2 additions & 24 deletions test/playground.cpp
Original file line number Diff line number Diff line change
@@ -1,33 +1,11 @@
// main entry point for the program. Ideally in the future we'll setup a testing
// framework like Gtest, but this will be okay for now just for quick testing.

#include <algorithm>
#include <iostream>
#include <string>
#include <vector>
#include "libtokenizer/libtokenizer.hpp"

// Quick manual exercise of the tokenizer: run three BPE-style merge
// rounds over a sample string, then print the resulting ids and the
// smoke-test hook's output.
int main() {
    using namespace pytokenizer::core;

    std::string text = "Wow, C++ is so hard!";
    std::vector<int> ids(text.begin(), text.end());

    int next_id = 256;  // first id beyond the raw byte range
    for (int round = 0; round < 3; ++round) {
        auto stats = Tokenizer::get_stats(ids);
        // Fix: the original dereferenced max_element's result
        // unconditionally; with fewer than two ids `stats` is empty and
        // `best == stats.end()`, making `best->` undefined behavior.
        if (stats.empty()) break;
        auto best = std::max_element(stats.begin(), stats.end(),
            [](const auto& a, const auto& b) { return a.second < b.second; });

        std::cout << "Merging (" << best->first.first << ", " << best->first.second
                  << ") -> " << next_id << " [freq=" << best->second << "]\n";

        ids = Tokenizer::merge(ids, best->first, next_id++);
    }

    std::cout << "Final ids:";
    for (int id : ids) std::cout << " " << id;
    std::cout << "\n";

    pytokenizer::core::Tokenizer tokenizer;
    std::cout << tokenizer.foo() << std::endl;
    return 0;
}
Loading