From c3b219a8157c40996fb44bb26d2c8c83cab6f0ce Mon Sep 17 00:00:00 2001 From: gabewillen Date: Sun, 1 Mar 2026 22:40:28 -0600 Subject: [PATCH 1/3] bench/parity: refresh encoder benches and split tokenizer parity by variant --- CMakeLists.txt | 94 ++--- docs/benchmarks.md | 127 ++++-- scripts/fuzz_smoke.sh | 1 - snapshots/bench/benchmarks.txt | 132 ++++-- snapshots/bench/benchmarks_compare.txt | 129 ++++-- snapshots/quality_gates/timing.txt | 12 +- src/emel/text/encoders/plamo2/detail.hpp | 13 +- src/emel/text/encoders/ugm/detail.hpp | 2 +- tests/text/encoders/test_support.hpp | 9 +- tools/bench/CMakeLists.txt | 61 +-- tools/bench/bench_cases.hpp | 14 + tools/bench/bench_main.cpp | 26 +- tools/bench/text/encoders/bench_common.hpp | 171 ++++++++ tools/bench/text/encoders/bpe_bench.cpp | 38 ++ tools/bench/text/encoders/fallback_bench.cpp | 38 ++ tools/bench/text/encoders/plamo2_bench.cpp | 40 ++ tools/bench/text/encoders/rwkv_bench.cpp | 43 ++ tools/bench/text/encoders/spm_bench.cpp | 37 ++ tools/bench/text/encoders/ugm_bench.cpp | 37 ++ tools/bench/text/encoders/wpm_bench.cpp | 36 ++ tools/docsgen/CMakeLists.txt | 9 - tools/paritychecker/CMakeLists.txt | 16 + tools/paritychecker/parity_runner.cpp | 386 +++++++++++++++++- tools/paritychecker/paritychecker_tests.cpp | 12 +- tools/paritychecker/tokenizer_bpe_parity.cpp | 17 + .../tokenizer_fallback_parity.cpp | 17 + tools/paritychecker/tokenizer_parity.hpp | 49 +++ .../paritychecker/tokenizer_parity_common.cpp | 188 +++++++++ .../paritychecker/tokenizer_plamo2_parity.cpp | 17 + tools/paritychecker/tokenizer_rwkv_parity.cpp | 17 + tools/paritychecker/tokenizer_spm_parity.cpp | 17 + tools/paritychecker/tokenizer_ugm_parity.cpp | 17 + tools/paritychecker/tokenizer_wpm_parity.cpp | 17 + 33 files changed, 1581 insertions(+), 258 deletions(-) create mode 100644 tools/bench/text/encoders/bench_common.hpp create mode 100644 tools/bench/text/encoders/bpe_bench.cpp create mode 100644 tools/bench/text/encoders/fallback_bench.cpp create mode 100644 tools/bench/text/encoders/plamo2_bench.cpp create mode 100644 tools/bench/text/encoders/rwkv_bench.cpp create mode 100644 tools/bench/text/encoders/spm_bench.cpp create mode 100644 tools/bench/text/encoders/ugm_bench.cpp create mode 100644 tools/bench/text/encoders/wpm_bench.cpp create mode 100644 tools/paritychecker/tokenizer_bpe_parity.cpp create mode 100644 tools/paritychecker/tokenizer_fallback_parity.cpp create mode 100644 tools/paritychecker/tokenizer_parity.hpp create mode 100644 tools/paritychecker/tokenizer_parity_common.cpp create mode 100644 tools/paritychecker/tokenizer_plamo2_parity.cpp create mode 100644 tools/paritychecker/tokenizer_rwkv_parity.cpp create mode 100644 tools/paritychecker/tokenizer_spm_parity.cpp create mode 100644 tools/paritychecker/tokenizer_ugm_parity.cpp create mode 100644 tools/paritychecker/tokenizer_wpm_parity.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 8e4367f5..d73387f5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -17,10 +17,6 @@ endif() option(EMEL_ENABLE_TESTS "Build tests" ON) option(EMEL_ENABLE_FUZZ "Build fuzz targets" OFF) -option(EMEL_ENABLE_TENSOR_PARSER_TEXT_MACHINES - "Build tensor/parser/text machine surfaces" - OFF -) include(FetchContent) include(cmake/sml_version.cmake) @@ -50,24 +46,6 @@ target_link_libraries(emel ) -if(EMEL_ENABLE_TENSOR_PARSER_TEXT_MACHINES) - add_executable(mock_model_load - tools/mock_main.cpp - ) - - target_link_libraries(mock_model_load - PRIVATE - emel_core - ) - - target_include_directories(mock_model_load - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/src - ${CMAKE_CURRENT_SOURCE_DIR}/include - ${boost_sml_SOURCE_DIR}/include - ) -endif() - if(EMEL_ENABLE_TESTS) include(CTest) enable_testing() @@ -120,38 +98,36 @@ if(EMEL_ENABLE_TESTS) tests/tensor/view/lifecycle_tests.cpp ) - if(EMEL_ENABLE_TENSOR_PARSER_TEXT_MACHINES) - list(APPEND EMEL_TEST_SOURCES - tests/gguf/loader/lifecycle_tests.cpp - tests/text/jinja/parser_tests.cpp - tests/text/jinja/lexer_tests.cpp - tests/text/jinja/formatter_tests.cpp - tests/text/formatter/formatter_tests.cpp - tests/text/encoders/common_tests.cpp - tests/text/encoders/bpe_tests.cpp - tests/text/encoders/spm_tests.cpp - tests/text/encoders/wpm_tests.cpp - tests/text/encoders/ugm_tests.cpp - tests/text/encoders/rwkv_tests.cpp - tests/text/encoders/plamo2_tests.cpp - tests/text/encoders/fallback_tests.cpp - tests/text/conditioner/text_conditioner_tests.cpp - tests/text/detokenizer/detokenizer_tests.cpp - tests/text/renderer/renderer_tests.cpp - tests/text/unicode/unicode_tests.cpp - tests/text/tokenizer/preprocessor_tests.cpp - tests/text/tokenizer/preprocessor_spm_tests.cpp - tests/text/tokenizer/preprocessor_wpm_tests.cpp - tests/text/tokenizer/preprocessor_rwkv_tests.cpp - tests/text/tokenizer/preprocessor_plamo2_tests.cpp - tests/text/tokenizer/preprocessor_fallback_tests.cpp - tests/text/tokenizer/bpe_regex_tests.cpp - tests/text/tokenizer/bpe_split_tests.cpp - tests/text/tokenizer/tokenizer_tests.cpp - tests/text/tokenizer/tokenizer_parity_tests.cpp - tests/text/tokenizer/tokenizer_action_guard_tests.cpp - ) - endif() + list(APPEND EMEL_TEST_SOURCES + tests/gguf/loader/lifecycle_tests.cpp + tests/text/jinja/parser_tests.cpp + tests/text/jinja/lexer_tests.cpp + tests/text/jinja/formatter_tests.cpp + tests/text/formatter/formatter_tests.cpp + tests/text/encoders/common_tests.cpp + tests/text/encoders/bpe_tests.cpp + tests/text/encoders/spm_tests.cpp + tests/text/encoders/wpm_tests.cpp + tests/text/encoders/ugm_tests.cpp + tests/text/encoders/rwkv_tests.cpp + tests/text/encoders/plamo2_tests.cpp + tests/text/encoders/fallback_tests.cpp + tests/text/conditioner/text_conditioner_tests.cpp + tests/text/detokenizer/detokenizer_tests.cpp + tests/text/renderer/renderer_tests.cpp + tests/text/unicode/unicode_tests.cpp + tests/text/tokenizer/preprocessor_tests.cpp + tests/text/tokenizer/preprocessor_spm_tests.cpp + tests/text/tokenizer/preprocessor_wpm_tests.cpp + tests/text/tokenizer/preprocessor_rwkv_tests.cpp + tests/text/tokenizer/preprocessor_plamo2_tests.cpp + tests/text/tokenizer/preprocessor_fallback_tests.cpp + tests/text/tokenizer/bpe_regex_tests.cpp + tests/text/tokenizer/bpe_split_tests.cpp + tests/text/tokenizer/tokenizer_tests.cpp + tests/text/tokenizer/tokenizer_parity_tests.cpp + tests/text/tokenizer/tokenizer_action_guard_tests.cpp + ) add_executable(emel_tests_bin ${EMEL_TEST_SOURCES} @@ -244,12 +220,10 @@ if(EMEL_ENABLE_FUZZ) ) endfunction() - if(EMEL_ENABLE_TENSOR_PARSER_TEXT_MACHINES) - add_executable(emel_fuzz_gguf_parser - tests/fuzz/gguf_parser_fuzz.cpp - ) - emel_configure_fuzzer(emel_fuzz_gguf_parser) - endif() + add_executable(emel_fuzz_gguf_parser + tests/fuzz/gguf_parser_fuzz.cpp + ) + emel_configure_fuzzer(emel_fuzz_gguf_parser) add_executable(emel_fuzz_gbnf_parser tests/fuzz/gbnf_parser_fuzz.cpp diff --git a/docs/benchmarks.md b/docs/benchmarks.md index 2540df37..99eb4249 100644 --- a/docs/benchmarks.md +++ b/docs/benchmarks.md @@ -10,38 +10,95 @@ retained for snapshot/report continuity and should be renamed after consumers mi | Benchmark | emel.cpp ns/op | llama.cpp ns/op | ratio | | --- | ---: | ---: | ---: | -| `batch/splitter_equal` | 1626.933 | 6278.408 | 0.259x | -| `batch/splitter_seq` | 1319.379 | 2638.238 | 0.500x | -| `batch/splitter_simple` | 738.408 | 2273.875 | 0.325x | -| `buffer/allocator_alloc_graph` | 16.671 | 55.083 | 0.303x | -| `buffer/allocator_full` | 37.625 | 252.400 | 0.149x | -| `buffer/allocator_reserve_n` | 19.971 | 442.804 | 0.045x | -| `jinja/parser_long` | 30502.542 | 49796.596 | 0.613x | -| `jinja/parser_short` | 388.525 | 491.550 | 0.790x | -| `jinja/renderer_long` | 89658.308 | 227931.921 | 0.393x | -| `jinja/renderer_short` | 1427.583 | 3803.167 | 0.375x | -| `memory/coordinator_recurrent_full` | 3895.246 | 5590.212 | 0.697x | -| `tokenizer/full_bpe_long` | 6621.133 | 7004.667 | 0.945x | -| `tokenizer/full_bpe_short` | 163.496 | 157.471 | 1.038x | -| `tokenizer/full_plamo2_long` | 10211.054 | 10239.642 | 0.997x | -| `tokenizer/full_plamo2_short` | 2205.075 | 1822.450 | 1.210x | -| `tokenizer/full_rwkv_long` | 2418.412 | 2436.733 | 0.992x | -| `tokenizer/full_rwkv_short` | 1854.350 | 2193.179 | 0.846x | -| `tokenizer/full_spm_long` | 9995.317 | 10792.767 | 0.926x | -| `tokenizer/full_spm_short` | 187.167 | 191.354 | 0.978x | -| `tokenizer/full_ugm_long` | 8868.146 | 8974.592 | 0.988x | -| `tokenizer/full_ugm_short` | 1738.117 | 2098.412 | 0.828x | -| `tokenizer/full_wpm_long` | 25314.525 | 25538.029 | 0.991x | -| `tokenizer/full_wpm_short` | 2077.092 | 2376.600 | 0.874x | -| `tokenizer/preprocessor_bpe_long` | 2776.758 | 5373.312 | 0.517x | -| `tokenizer/preprocessor_bpe_short` | 78.850 | 1747.050 | 0.045x | -| `tokenizer/preprocessor_plamo2_long` | 3082.279 | 4788.679 | 0.644x | -| `tokenizer/preprocessor_plamo2_short` | 2386.262 | 3548.504 | 0.672x | -| `tokenizer/preprocessor_rwkv_long` | 2972.246 | 4580.996 | 0.649x | -| `tokenizer/preprocessor_rwkv_short` | 2305.317 | 3535.229 | 0.652x | -| `tokenizer/preprocessor_spm_long` | 3046.325 | 4598.229 | 0.662x | -| `tokenizer/preprocessor_spm_short` | 2361.629 | 3762.438 | 0.628x | -| `tokenizer/preprocessor_ugm_long` | 3027.463 | 4692.613 | 0.645x | -| `tokenizer/preprocessor_ugm_short` | 2348.642 | 3552.613 | 0.661x | -| `tokenizer/preprocessor_wpm_long` | 2952.042 | 4562.908 | 0.647x | -| `tokenizer/preprocessor_wpm_short` | 2307.729 | 3534.338 | 0.653x | +| `batch/splitter_equal` | 1836.312 | 8593.229 | 0.214x | +| `batch/splitter_seq` | 1698.263 | 4051.104 | 0.419x | +| `batch/splitter_simple` | 1139.383 | 3584.637 | 0.318x | +| `gbnf/parser_basic` | 264.846 | 463.637 | 0.571x | +| `gbnf/parser_complex` | 1857.504 | 2470.021 | 0.752x | +| `jinja/parser_long` | 33815.062 | 55666.438 | 0.607x | +| `jinja/parser_short` | 531.558 | 662.467 | 0.802x | +| `jinja/renderer_long` | 84833.121 | 406507.271 | 0.209x | +| `jinja/renderer_short` | 1113.967 | 6485.746 | 0.172x | +| `kernel/aarch64/op_add` | 92.546 | 5279.417 | 0.018x | +| `kernel/aarch64/op_cos` | 1631.362 | 5731.046 | 0.285x | +| `kernel/aarch64/op_div` | 93.892 | 4394.467 | 0.021x | +| `kernel/aarch64/op_dup` | 86.471 | 4282.050 | 0.020x | +| `kernel/aarch64/op_log` | 1819.667 | 6011.442 | 0.303x | +| `kernel/aarch64/op_mul` | 89.467 | 5507.025 | 0.016x | +| `kernel/aarch64/op_mul_mat` | 4517.254 | 10219.783 | 0.442x | +| `kernel/aarch64/op_sin` | 1289.033 | 5404.462 | 0.239x | +| `kernel/aarch64/op_soft_max` | 2065.446 | 4890.883 | 0.422x | +| `kernel/aarch64/op_sqr` | 86.829 | 4336.387 | 0.020x | +| `kernel/aarch64/op_sqrt` | 138.008 | 4288.304 | 0.032x | +| `kernel/aarch64/op_sub` | 88.904 | 5325.046 | 0.017x | +| `kernel/aarch64/op_unary_exp` | 1277.404 | 5371.312 | 0.238x | +| `kernel/aarch64/op_unary_neg` | 86.029 | 4175.996 | 0.021x | +| `kernel/aarch64/op_unary_relu` | 90.608 | 4124.083 | 0.022x | +| `kernel/x86_64/op_add` | 64.504 | 5233.129 | 0.012x | +| `kernel/x86_64/op_cos` | 1628.146 | 6016.683 | 0.271x | +| `kernel/x86_64/op_div` | 73.971 | 5013.746 | 0.015x | +| `kernel/x86_64/op_dup` | 47.921 | 4274.621 | 0.011x | +| `kernel/x86_64/op_log` | 1852.987 | 6434.496 | 0.288x | +| `kernel/x86_64/op_mul` | 60.212 | 5865.367 | 0.010x | +| `kernel/x86_64/op_mul_mat` | 43938.567 | 11147.154 | 3.942x | +| `kernel/x86_64/op_sin` | 1262.237 | 5676.933 | 0.222x | +| `kernel/x86_64/op_soft_max` | 2059.963 | 4999.904 | 0.412x | +| `kernel/x86_64/op_sqr` | 50.700 | 4964.100 | 0.010x | +| `kernel/x86_64/op_sqrt` | 140.496 | 4741.517 | 0.030x | +| `kernel/x86_64/op_sub` | 60.233 | 5408.542 | 0.011x | +| `kernel/x86_64/op_unary_exp` | 1268.250 | 5503.779 | 0.230x | +| `kernel/x86_64/op_unary_neg` | 47.487 | 4549.292 | 0.010x | +| `kernel/x86_64/op_unary_relu` | 47.254 | 4375.171 | 0.011x | +| `logits/sampler_raw/vocab_128000` | 18746.150 | 19140.217 | 0.979x | +| `logits/sampler_raw/vocab_256000` | 37709.246 | 37594.458 | 1.003x | +| `logits/sampler_raw/vocab_32000` | 4739.504 | 4991.942 | 0.949x | +| `logits/sampler_sml/vocab_128000` | 16979.446 | 16718.892 | 1.016x | +| `logits/sampler_sml/vocab_256000` | 36024.967 | 29679.767 | 1.214x | +| `logits/sampler_sml/vocab_32000` | 3928.754 | 3549.517 | 1.107x | +| `logits/validator_raw/vocab_128000` | 88332.717 | 90152.250 | 0.980x | +| `logits/validator_raw/vocab_256000` | 182805.817 | 182508.413 | 1.002x | +| `logits/validator_raw/vocab_32000` | 23365.571 | 23878.521 | 0.979x | +| `logits/validator_sml/vocab_128000` | 96768.158 | 98797.996 | 0.979x | +| `logits/validator_sml/vocab_256000` | 193641.642 | 196589.429 | 0.985x | +| `logits/validator_sml/vocab_32000` | 23869.067 | 24220.071 | 0.986x | +| `memory/hybrid_full` | 387.054 | 37587.438 | 0.010x | +| `memory/kv_full` | 100.883 | 36279.867 | 0.003x | +| `memory/recurrent_full` | 114.583 | 5563.017 | 0.021x | +| `text/encoders/bpe_long` | 10232.100 | 10270.446 | 0.996x | +| `text/encoders/bpe_short` | 164.613 | 160.850 | 1.023x | +| `text/encoders/fallback_long` | 2522.454 | 2465.408 | 1.023x | +| `text/encoders/fallback_short` | 45.263 | 47.033 | 0.962x | +| `text/encoders/plamo2_long` | 4983.292 | 4977.471 | 1.001x | +| `text/encoders/plamo2_short` | 108.175 | 106.071 | 1.020x | +| `text/encoders/rwkv_long` | 4530.604 | 4569.600 | 0.991x | +| `text/encoders/rwkv_short` | 2613.637 | 2628.946 | 0.994x | +| `text/encoders/spm_long` | 12319.425 | 12292.258 | 1.002x | +| `text/encoders/spm_short` | 202.892 | 208.137 | 0.975x | +| `text/encoders/ugm_long` | 8120.746 | 8109.150 | 1.001x | +| `text/encoders/ugm_short` | 131.733 | 139.221 | 0.946x | +| `text/encoders/wpm_long` | 26693.121 | 26402.671 | 1.011x | +| `text/encoders/wpm_short` | 529.188 | 536.987 | 0.985x | +| `tokenizer/full_bpe_long` | 9626.758 | 9619.733 | 1.001x | +| `tokenizer/full_bpe_short` | 219.575 | 211.517 | 1.038x | +| `tokenizer/full_plamo2_long` | 10053.233 | 9994.929 | 1.006x | +| `tokenizer/full_plamo2_short` | 1918.483 | 1897.900 | 1.011x | +| `tokenizer/full_rwkv_long` | 3675.642 | 3665.338 | 1.003x | +| `tokenizer/full_rwkv_short` | 2230.875 | 2521.367 | 0.885x | +| `tokenizer/full_spm_long` | 13644.233 | 13779.175 | 0.990x | +| `tokenizer/full_spm_short` | 295.458 | 281.479 | 1.050x | +| `tokenizer/full_ugm_long` | 10078.542 | 10030.425 | 1.005x | +| `tokenizer/full_ugm_short` | 2138.625 | 2206.517 | 0.969x | +| `tokenizer/full_wpm_long` | 28529.271 | 28240.213 | 1.010x | +| `tokenizer/full_wpm_short` | 2266.062 | 2320.533 | 0.977x | +| `tokenizer/preprocessor_bpe_long` | 2753.250 | 5209.350 | 0.529x | +| `tokenizer/preprocessor_bpe_short` | 86.571 | 1702.050 | 0.051x | +| `tokenizer/preprocessor_plamo2_long` | 3144.229 | 4588.988 | 0.685x | +| `tokenizer/preprocessor_plamo2_short` | 2467.929 | 3609.229 | 0.684x | +| `tokenizer/preprocessor_rwkv_long` | 3121.367 | 4583.267 | 0.681x | +| `tokenizer/preprocessor_rwkv_short` | 2477.188 | 3683.521 | 0.673x | +| `tokenizer/preprocessor_spm_long` | 3127.146 | 4508.325 | 0.694x | +| `tokenizer/preprocessor_spm_short` | 2448.333 | 3641.775 | 0.672x | +| `tokenizer/preprocessor_ugm_long` | 3190.696 | 4554.075 | 0.701x | +| `tokenizer/preprocessor_ugm_short` | 2460.821 | 3632.079 | 0.678x | +| `tokenizer/preprocessor_wpm_long` | 3154.375 | 4505.400 | 0.700x | +| `tokenizer/preprocessor_wpm_short` | 2466.742 | 3636.067 | 0.678x | diff --git a/scripts/fuzz_smoke.sh b/scripts/fuzz_smoke.sh index 1c5cb5a3..9f06fcad 100755 --- a/scripts/fuzz_smoke.sh +++ b/scripts/fuzz_smoke.sh @@ -59,7 +59,6 @@ cmake -S "$ROOT_DIR" -B "$BUILD_DIR" -G Ninja \ -DCMAKE_CXX_FLAGS="$fuzz_cxx_flags" \ -DCMAKE_EXE_LINKER_FLAGS="$fuzz_link_flags" \ -DEMEL_ENABLE_FUZZ=ON \ - -DEMEL_ENABLE_TENSOR_PARSER_TEXT_MACHINES=OFF \ -DEMEL_ENABLE_TESTS=OFF cmake --build "$BUILD_DIR" --parallel diff --git a/snapshots/bench/benchmarks.txt b/snapshots/bench/benchmarks.txt index 1173b4c4..551aeff9 100644 --- a/snapshots/bench/benchmarks.txt +++ b/snapshots/bench/benchmarks.txt @@ -1,40 +1,94 @@ -# ref=94b0200a01a753eff5897dab9311f51a7bc1c62f +# ref=ecbcb7ea9d3303097519723b264a8b5f1e977028 # toolchain=/opt/homebrew/bin/zig -batch/splitter_equal ns_per_op=1539.204 iter=100000 runs=5 -batch/splitter_seq ns_per_op=1461.882 iter=100000 runs=5 -batch/splitter_simple ns_per_op=741.611 iter=100000 runs=5 -buffer/allocator_alloc_graph ns_per_op=17.524 iter=100000 runs=5 -buffer/allocator_full ns_per_op=40.041 iter=100000 runs=5 -buffer/allocator_reserve_n ns_per_op=19.666 iter=100000 runs=5 -jinja/parser_long ns_per_op=34833.174 iter=100000 runs=5 -jinja/parser_short ns_per_op=548.385 iter=100000 runs=5 -jinja/renderer_long ns_per_op=92220.130 iter=100000 runs=5 -jinja/renderer_short ns_per_op=1530.596 iter=100000 runs=5 -memory/coordinator_recurrent_full ns_per_op=622.327 iter=100000 runs=5 -memory/hybrid_full ns_per_op=822.615 iter=100000 runs=5 -memory/kv_full ns_per_op=1131.967 iter=100000 runs=5 -memory/recurrent_full ns_per_op=59.830 iter=100000 runs=5 -tokenizer/full_bpe_long ns_per_op=6623.310 iter=100000 runs=5 -tokenizer/full_bpe_short ns_per_op=152.812 iter=100000 runs=5 -tokenizer/full_plamo2_long ns_per_op=10167.205 iter=100000 runs=5 -tokenizer/full_plamo2_short ns_per_op=1792.043 iter=100000 runs=5 -tokenizer/full_rwkv_long ns_per_op=2318.382 iter=100000 runs=5 -tokenizer/full_rwkv_short ns_per_op=1788.594 iter=100000 runs=5 -tokenizer/full_spm_long ns_per_op=10250.343 iter=100000 runs=5 -tokenizer/full_spm_short ns_per_op=201.738 iter=100000 runs=5 -tokenizer/full_ugm_long ns_per_op=8726.966 iter=100000 runs=5 -tokenizer/full_ugm_short ns_per_op=1758.810 iter=100000 runs=5 -tokenizer/full_wpm_long ns_per_op=25312.077 iter=100000 runs=5 -tokenizer/full_wpm_short ns_per_op=2153.271 iter=100000 runs=5 -tokenizer/preprocessor_bpe_long ns_per_op=2826.512 iter=100000 runs=5 -tokenizer/preprocessor_bpe_short ns_per_op=75.043 iter=100000 runs=5 -tokenizer/preprocessor_plamo2_long ns_per_op=3048.009 iter=100000 runs=5 -tokenizer/preprocessor_plamo2_short ns_per_op=2383.916 iter=100000 runs=5 -tokenizer/preprocessor_rwkv_long ns_per_op=3034.042 iter=100000 runs=5 -tokenizer/preprocessor_rwkv_short ns_per_op=2373.241 iter=100000 runs=5 -tokenizer/preprocessor_spm_long ns_per_op=3031.130 iter=100000 runs=5 -tokenizer/preprocessor_spm_short ns_per_op=2364.155 iter=100000 runs=5 -tokenizer/preprocessor_ugm_long ns_per_op=3108.304 iter=100000 runs=5 -tokenizer/preprocessor_ugm_short ns_per_op=2381.539 iter=100000 runs=5 -tokenizer/preprocessor_wpm_long ns_per_op=3056.099 iter=100000 runs=5 -tokenizer/preprocessor_wpm_short ns_per_op=2370.789 iter=100000 runs=5 +batch/splitter_equal ns_per_op=1836.312 +batch/splitter_seq ns_per_op=1698.263 +batch/splitter_simple ns_per_op=1139.383 +gbnf/parser_basic ns_per_op=264.846 +gbnf/parser_complex ns_per_op=1857.504 +jinja/parser_long ns_per_op=33815.062 +jinja/parser_short ns_per_op=531.558 +jinja/renderer_long ns_per_op=84833.121 +jinja/renderer_short ns_per_op=1113.967 +kernel/aarch64/op_add ns_per_op=92.546 +kernel/aarch64/op_cos ns_per_op=1631.362 +kernel/aarch64/op_div ns_per_op=93.892 +kernel/aarch64/op_dup ns_per_op=86.471 +kernel/aarch64/op_log ns_per_op=1819.667 +kernel/aarch64/op_mul ns_per_op=89.467 +kernel/aarch64/op_mul_mat ns_per_op=4517.254 +kernel/aarch64/op_sin ns_per_op=1289.033 +kernel/aarch64/op_soft_max ns_per_op=2065.446 +kernel/aarch64/op_sqr ns_per_op=86.829 +kernel/aarch64/op_sqrt ns_per_op=138.008 +kernel/aarch64/op_sub ns_per_op=88.904 +kernel/aarch64/op_unary_exp ns_per_op=1277.404 +kernel/aarch64/op_unary_neg ns_per_op=86.029 +kernel/aarch64/op_unary_relu ns_per_op=90.608 +kernel/x86_64/op_add ns_per_op=64.504 +kernel/x86_64/op_cos ns_per_op=1628.146 +kernel/x86_64/op_div ns_per_op=73.971 +kernel/x86_64/op_dup ns_per_op=47.921 +kernel/x86_64/op_log ns_per_op=1852.987 +kernel/x86_64/op_mul ns_per_op=60.212 +kernel/x86_64/op_mul_mat ns_per_op=43938.567 +kernel/x86_64/op_sin ns_per_op=1262.237 +kernel/x86_64/op_soft_max ns_per_op=2059.963 +kernel/x86_64/op_sqr ns_per_op=50.700 +kernel/x86_64/op_sqrt ns_per_op=140.496 +kernel/x86_64/op_sub ns_per_op=60.233 +kernel/x86_64/op_unary_exp ns_per_op=1268.250 +kernel/x86_64/op_unary_neg ns_per_op=47.487 +kernel/x86_64/op_unary_relu ns_per_op=47.254 +logits/sampler_raw/vocab_128000 ns_per_op=18746.150 +logits/sampler_raw/vocab_256000 ns_per_op=37709.246 +logits/sampler_raw/vocab_32000 ns_per_op=4739.504 +logits/sampler_sml/vocab_128000 ns_per_op=16979.446 +logits/sampler_sml/vocab_256000 ns_per_op=36024.967 +logits/sampler_sml/vocab_32000 ns_per_op=3928.754 +logits/validator_raw/vocab_128000 ns_per_op=88332.717 +logits/validator_raw/vocab_256000 ns_per_op=182805.817 +logits/validator_raw/vocab_32000 ns_per_op=23365.571 +logits/validator_sml/vocab_128000 ns_per_op=96768.158 +logits/validator_sml/vocab_256000 ns_per_op=193641.642 +logits/validator_sml/vocab_32000 ns_per_op=23869.067 +memory/hybrid_full ns_per_op=387.054 +memory/kv_full ns_per_op=100.883 +memory/recurrent_full ns_per_op=114.583 +text/encoders/bpe_long ns_per_op=10232.100 +text/encoders/bpe_short ns_per_op=164.613 +text/encoders/fallback_long ns_per_op=2522.454 +text/encoders/fallback_short ns_per_op=45.263 +text/encoders/plamo2_long ns_per_op=4983.292 +text/encoders/plamo2_short ns_per_op=108.175 +text/encoders/rwkv_long ns_per_op=4530.604 +text/encoders/rwkv_short ns_per_op=2613.637 +text/encoders/spm_long ns_per_op=12319.425 +text/encoders/spm_short ns_per_op=202.892 +text/encoders/ugm_long ns_per_op=8120.746 +text/encoders/ugm_short ns_per_op=131.733 +text/encoders/wpm_long ns_per_op=26693.121 +text/encoders/wpm_short ns_per_op=529.188 +tokenizer/full_bpe_long ns_per_op=9626.758 +tokenizer/full_bpe_short ns_per_op=219.575 +tokenizer/full_plamo2_long ns_per_op=10053.233 +tokenizer/full_plamo2_short ns_per_op=1918.483 +tokenizer/full_rwkv_long ns_per_op=3675.642 +tokenizer/full_rwkv_short ns_per_op=2230.875 +tokenizer/full_spm_long ns_per_op=13644.233 +tokenizer/full_spm_short ns_per_op=295.458 +tokenizer/full_ugm_long ns_per_op=10078.542 +tokenizer/full_ugm_short ns_per_op=2138.625 +tokenizer/full_wpm_long ns_per_op=28529.271 +tokenizer/full_wpm_short ns_per_op=2266.062 +tokenizer/preprocessor_bpe_long ns_per_op=2753.250 +tokenizer/preprocessor_bpe_short ns_per_op=86.571 +tokenizer/preprocessor_plamo2_long ns_per_op=3144.229 +tokenizer/preprocessor_plamo2_short ns_per_op=2467.929 +tokenizer/preprocessor_rwkv_long ns_per_op=3121.367 +tokenizer/preprocessor_rwkv_short ns_per_op=2477.188 +tokenizer/preprocessor_spm_long ns_per_op=3127.146 +tokenizer/preprocessor_spm_short ns_per_op=2448.333 +tokenizer/preprocessor_ugm_long ns_per_op=3190.696 +tokenizer/preprocessor_ugm_short ns_per_op=2460.821 +tokenizer/preprocessor_wpm_long ns_per_op=3154.375 +tokenizer/preprocessor_wpm_short ns_per_op=2466.742 diff --git a/snapshots/bench/benchmarks_compare.txt b/snapshots/bench/benchmarks_compare.txt index d29a4fea..5287953c 100644 --- a/snapshots/bench/benchmarks_compare.txt +++ b/snapshots/bench/benchmarks_compare.txt @@ -1,37 +1,94 @@ -# ref=94b0200a01a753eff5897dab9311f51a7bc1c62f +# ref=ecbcb7ea9d3303097519723b264a8b5f1e977028 # toolchain=/opt/homebrew/bin/zig -batch/splitter_equal emel.cpp 1626.933 ns/op, llama.cpp 6278.408 ns/op, ratio=0.259x -batch/splitter_seq emel.cpp 1319.379 ns/op, llama.cpp 2638.238 ns/op, ratio=0.500x -batch/splitter_simple emel.cpp 738.408 ns/op, llama.cpp 2273.875 ns/op, ratio=0.325x -buffer/allocator_alloc_graph emel.cpp 16.671 ns/op, llama.cpp 55.083 ns/op, ratio=0.303x -buffer/allocator_full emel.cpp 37.625 ns/op, llama.cpp 252.400 ns/op, ratio=0.149x -buffer/allocator_reserve_n emel.cpp 19.971 ns/op, llama.cpp 442.804 ns/op, ratio=0.045x -jinja/parser_long emel.cpp 30502.542 ns/op, llama.cpp 49796.596 ns/op, ratio=0.613x -jinja/parser_short emel.cpp 388.525 ns/op, llama.cpp 491.550 ns/op, ratio=0.790x -jinja/renderer_long emel.cpp 89658.308 ns/op, llama.cpp 227931.921 ns/op, ratio=0.393x -jinja/renderer_short emel.cpp 1427.583 ns/op, llama.cpp 3803.167 ns/op, ratio=0.375x -memory/coordinator_recurrent_full emel.cpp 3895.246 ns/op, llama.cpp 5590.212 ns/op, ratio=0.697x -tokenizer/full_bpe_long emel.cpp 6621.133 ns/op, llama.cpp 7004.667 ns/op, ratio=0.945x -tokenizer/full_bpe_short emel.cpp 163.496 ns/op, llama.cpp 157.471 ns/op, ratio=1.038x -tokenizer/full_plamo2_long emel.cpp 10211.054 ns/op, llama.cpp 10239.642 ns/op, ratio=0.997x -tokenizer/full_plamo2_short emel.cpp 2205.075 ns/op, llama.cpp 1822.450 ns/op, ratio=1.210x -tokenizer/full_rwkv_long emel.cpp 2418.412 ns/op, llama.cpp 2436.733 ns/op, ratio=0.992x -tokenizer/full_rwkv_short emel.cpp 1854.350 ns/op, llama.cpp 2193.179 ns/op, ratio=0.846x -tokenizer/full_spm_long emel.cpp 9995.317 ns/op, llama.cpp 10792.767 ns/op, ratio=0.926x -tokenizer/full_spm_short emel.cpp 187.167 ns/op, llama.cpp 191.354 ns/op, ratio=0.978x -tokenizer/full_ugm_long emel.cpp 8868.146 ns/op, llama.cpp 8974.592 ns/op, ratio=0.988x -tokenizer/full_ugm_short emel.cpp 1738.117 ns/op, llama.cpp 2098.412 ns/op, ratio=0.828x -tokenizer/full_wpm_long emel.cpp 25314.525 ns/op, llama.cpp 25538.029 ns/op, ratio=0.991x -tokenizer/full_wpm_short emel.cpp 2077.092 ns/op, llama.cpp 2376.600 ns/op, ratio=0.874x -tokenizer/preprocessor_bpe_long emel.cpp 2776.758 ns/op, llama.cpp 5373.312 ns/op, ratio=0.517x -tokenizer/preprocessor_bpe_short emel.cpp 78.850 ns/op, llama.cpp 1747.050 ns/op, ratio=0.045x -tokenizer/preprocessor_plamo2_long emel.cpp 3082.279 ns/op, llama.cpp 4788.679 ns/op, ratio=0.644x -tokenizer/preprocessor_plamo2_short emel.cpp 2386.262 ns/op, llama.cpp 3548.504 ns/op, ratio=0.672x -tokenizer/preprocessor_rwkv_long emel.cpp 2972.246 ns/op, llama.cpp 4580.996 ns/op, ratio=0.649x -tokenizer/preprocessor_rwkv_short emel.cpp 2305.317 ns/op, llama.cpp 3535.229 ns/op, ratio=0.652x -tokenizer/preprocessor_spm_long emel.cpp 3046.325 ns/op, llama.cpp 4598.229 ns/op, ratio=0.662x -tokenizer/preprocessor_spm_short emel.cpp 2361.629 ns/op, llama.cpp 3762.438 ns/op, ratio=0.628x -tokenizer/preprocessor_ugm_long emel.cpp 3027.463 ns/op, llama.cpp 4692.613 ns/op, ratio=0.645x -tokenizer/preprocessor_ugm_short emel.cpp 2348.642 ns/op, llama.cpp 3552.613 ns/op, ratio=0.661x -tokenizer/preprocessor_wpm_long emel.cpp 2952.042 ns/op, llama.cpp 4562.908 ns/op, ratio=0.647x -tokenizer/preprocessor_wpm_short emel.cpp 2307.729 ns/op, llama.cpp 3534.338 ns/op, ratio=0.653x +batch/splitter_equal emel.cpp 1836.312 ns/op, llama.cpp 8593.229 ns/op, ratio=0.214x +batch/splitter_seq emel.cpp 1698.263 ns/op, llama.cpp 4051.104 ns/op, ratio=0.419x +batch/splitter_simple emel.cpp 1139.383 ns/op, llama.cpp 3584.637 ns/op, ratio=0.318x +gbnf/parser_basic emel.cpp 264.846 ns/op, llama.cpp 463.637 ns/op, ratio=0.571x +gbnf/parser_complex emel.cpp 1857.504 ns/op, llama.cpp 2470.021 ns/op, ratio=0.752x +jinja/parser_long emel.cpp 33815.062 ns/op, llama.cpp 55666.438 ns/op, ratio=0.607x +jinja/parser_short emel.cpp 531.558 ns/op, llama.cpp 662.467 ns/op, ratio=0.802x +jinja/renderer_long emel.cpp 84833.121 ns/op, llama.cpp 406507.271 ns/op, ratio=0.209x +jinja/renderer_short emel.cpp 1113.967 ns/op, llama.cpp 6485.746 ns/op, ratio=0.172x +kernel/aarch64/op_add emel.cpp 92.546 ns/op, llama.cpp 5279.417 ns/op, ratio=0.018x +kernel/aarch64/op_cos emel.cpp 1631.362 ns/op, llama.cpp 5731.046 ns/op, ratio=0.285x +kernel/aarch64/op_div emel.cpp 93.892 ns/op, llama.cpp 4394.467 ns/op, ratio=0.021x +kernel/aarch64/op_dup emel.cpp 86.471 ns/op, llama.cpp 4282.050 ns/op, ratio=0.020x +kernel/aarch64/op_log emel.cpp 1819.667 ns/op, llama.cpp 6011.442 ns/op, ratio=0.303x +kernel/aarch64/op_mul emel.cpp 89.467 ns/op, llama.cpp 5507.025 ns/op, ratio=0.016x +kernel/aarch64/op_mul_mat emel.cpp 4517.254 ns/op, llama.cpp 10219.783 ns/op, ratio=0.442x +kernel/aarch64/op_sin emel.cpp 1289.033 ns/op, llama.cpp 5404.462 ns/op, ratio=0.239x +kernel/aarch64/op_soft_max emel.cpp 2065.446 ns/op, llama.cpp 4890.883 ns/op, ratio=0.422x +kernel/aarch64/op_sqr emel.cpp 86.829 ns/op, llama.cpp 4336.387 ns/op, ratio=0.020x +kernel/aarch64/op_sqrt emel.cpp 138.008 ns/op, llama.cpp 4288.304 ns/op, ratio=0.032x +kernel/aarch64/op_sub emel.cpp 88.904 ns/op, llama.cpp 5325.046 ns/op, ratio=0.017x +kernel/aarch64/op_unary_exp emel.cpp 1277.404 ns/op, llama.cpp 5371.312 ns/op, ratio=0.238x +kernel/aarch64/op_unary_neg emel.cpp 86.029 ns/op, llama.cpp 4175.996 ns/op, ratio=0.021x +kernel/aarch64/op_unary_relu emel.cpp 90.608 ns/op, llama.cpp 4124.083 ns/op, ratio=0.022x +kernel/x86_64/op_add emel.cpp 64.504 ns/op, llama.cpp 5233.129 ns/op, ratio=0.012x +kernel/x86_64/op_cos emel.cpp 1628.146 ns/op, llama.cpp 6016.683 ns/op, ratio=0.271x +kernel/x86_64/op_div emel.cpp 73.971 ns/op, llama.cpp 5013.746 ns/op, ratio=0.015x +kernel/x86_64/op_dup emel.cpp 47.921 ns/op, llama.cpp 4274.621 ns/op, ratio=0.011x +kernel/x86_64/op_log emel.cpp 1852.987 ns/op, llama.cpp 6434.496 ns/op, ratio=0.288x +kernel/x86_64/op_mul emel.cpp 60.212 ns/op, llama.cpp 5865.367 ns/op, ratio=0.010x +kernel/x86_64/op_mul_mat emel.cpp 43938.567 ns/op, llama.cpp 11147.154 ns/op, ratio=3.942x +kernel/x86_64/op_sin emel.cpp 1262.237 ns/op, llama.cpp 5676.933 ns/op, ratio=0.222x +kernel/x86_64/op_soft_max emel.cpp 2059.963 ns/op, llama.cpp 4999.904 ns/op, ratio=0.412x +kernel/x86_64/op_sqr emel.cpp 50.700 ns/op, llama.cpp 4964.100 ns/op, ratio=0.010x +kernel/x86_64/op_sqrt emel.cpp 140.496 ns/op, llama.cpp 4741.517 ns/op, ratio=0.030x +kernel/x86_64/op_sub emel.cpp 60.233 ns/op, llama.cpp 5408.542 ns/op, ratio=0.011x +kernel/x86_64/op_unary_exp emel.cpp 1268.250 ns/op, llama.cpp 5503.779 ns/op, ratio=0.230x +kernel/x86_64/op_unary_neg emel.cpp 47.487 ns/op, llama.cpp 4549.292 ns/op, ratio=0.010x +kernel/x86_64/op_unary_relu emel.cpp 47.254 ns/op, llama.cpp 4375.171 ns/op, ratio=0.011x +logits/sampler_raw/vocab_128000 emel.cpp 18746.150 ns/op, llama.cpp 19140.217 ns/op, ratio=0.979x +logits/sampler_raw/vocab_256000 emel.cpp 37709.246 ns/op, llama.cpp 37594.458 ns/op, ratio=1.003x +logits/sampler_raw/vocab_32000 emel.cpp 4739.504 ns/op, llama.cpp 4991.942 ns/op, ratio=0.949x +logits/sampler_sml/vocab_128000 emel.cpp 16979.446 ns/op, llama.cpp 16718.892 ns/op, ratio=1.016x +logits/sampler_sml/vocab_256000 emel.cpp 36024.967 ns/op, llama.cpp 29679.767 ns/op, ratio=1.214x +logits/sampler_sml/vocab_32000 emel.cpp 3928.754 ns/op, llama.cpp 3549.517 ns/op, ratio=1.107x +logits/validator_raw/vocab_128000 emel.cpp 88332.717 ns/op, llama.cpp 90152.250 ns/op, ratio=0.980x +logits/validator_raw/vocab_256000 emel.cpp 182805.817 ns/op, llama.cpp 182508.413 ns/op, ratio=1.002x +logits/validator_raw/vocab_32000 emel.cpp 23365.571 ns/op, llama.cpp 23878.521 ns/op, ratio=0.979x +logits/validator_sml/vocab_128000 emel.cpp 96768.158 ns/op, llama.cpp 98797.996 ns/op, ratio=0.979x +logits/validator_sml/vocab_256000 emel.cpp 193641.642 ns/op, llama.cpp 196589.429 ns/op, ratio=0.985x +logits/validator_sml/vocab_32000 emel.cpp 23869.067 ns/op, llama.cpp 24220.071 ns/op, ratio=0.986x +memory/hybrid_full emel.cpp 387.054 ns/op, llama.cpp 37587.438 ns/op, ratio=0.010x +memory/kv_full emel.cpp 100.883 ns/op, llama.cpp 36279.867 ns/op, ratio=0.003x +memory/recurrent_full emel.cpp 114.583 ns/op, llama.cpp 5563.017 ns/op, ratio=0.021x +text/encoders/bpe_long emel.cpp 10232.100 ns/op, llama.cpp 10270.446 ns/op, ratio=0.996x +text/encoders/bpe_short emel.cpp 164.613 ns/op, llama.cpp 160.850 ns/op, ratio=1.023x +text/encoders/fallback_long emel.cpp 2522.454 ns/op, llama.cpp 2465.408 ns/op, ratio=1.023x +text/encoders/fallback_short emel.cpp 45.263 ns/op, llama.cpp 47.033 ns/op, ratio=0.962x +text/encoders/plamo2_long emel.cpp 4983.292 ns/op, llama.cpp 4977.471 ns/op, ratio=1.001x +text/encoders/plamo2_short emel.cpp 108.175 ns/op, llama.cpp 106.071 ns/op, ratio=1.020x +text/encoders/rwkv_long emel.cpp 4530.604 ns/op, llama.cpp 4569.600 ns/op, ratio=0.991x +text/encoders/rwkv_short emel.cpp 2613.637 ns/op, llama.cpp 2628.946 ns/op, ratio=0.994x +text/encoders/spm_long emel.cpp 12319.425 ns/op, llama.cpp 12292.258 ns/op, ratio=1.002x +text/encoders/spm_short emel.cpp 202.892 ns/op, llama.cpp 208.137 ns/op, ratio=0.975x +text/encoders/ugm_long emel.cpp 8120.746 ns/op, llama.cpp 8109.150 ns/op, ratio=1.001x +text/encoders/ugm_short emel.cpp 131.733 ns/op, llama.cpp 139.221 ns/op, ratio=0.946x +text/encoders/wpm_long emel.cpp 26693.121 ns/op, llama.cpp 26402.671 ns/op, ratio=1.011x +text/encoders/wpm_short emel.cpp 529.188 ns/op, llama.cpp 536.987 ns/op, ratio=0.985x +tokenizer/full_bpe_long emel.cpp 9626.758 ns/op, llama.cpp 9619.733 ns/op, ratio=1.001x +tokenizer/full_bpe_short emel.cpp 219.575 ns/op, llama.cpp 211.517 ns/op, ratio=1.038x +tokenizer/full_plamo2_long emel.cpp 10053.233 ns/op, llama.cpp 9994.929 ns/op, ratio=1.006x +tokenizer/full_plamo2_short emel.cpp 1918.483 ns/op, llama.cpp 1897.900 ns/op, ratio=1.011x +tokenizer/full_rwkv_long emel.cpp 3675.642 ns/op, llama.cpp 3665.338 ns/op, ratio=1.003x +tokenizer/full_rwkv_short emel.cpp 2230.875 ns/op, llama.cpp 2521.367 ns/op, ratio=0.885x +tokenizer/full_spm_long emel.cpp 13644.233 ns/op, llama.cpp 13779.175 ns/op, ratio=0.990x +tokenizer/full_spm_short emel.cpp 295.458 ns/op, llama.cpp 281.479 ns/op, ratio=1.050x +tokenizer/full_ugm_long emel.cpp 10078.542 ns/op, llama.cpp 10030.425 ns/op, ratio=1.005x +tokenizer/full_ugm_short emel.cpp 2138.625 ns/op, llama.cpp 2206.517 ns/op, ratio=0.969x +tokenizer/full_wpm_long emel.cpp 28529.271 ns/op, llama.cpp 28240.213 ns/op, ratio=1.010x +tokenizer/full_wpm_short emel.cpp 2266.062 ns/op, llama.cpp 2320.533 ns/op, ratio=0.977x +tokenizer/preprocessor_bpe_long emel.cpp 2753.250 ns/op, llama.cpp 5209.350 ns/op, ratio=0.529x +tokenizer/preprocessor_bpe_short emel.cpp 86.571 ns/op, llama.cpp 1702.050 ns/op, ratio=0.051x +tokenizer/preprocessor_plamo2_long emel.cpp 3144.229 ns/op, llama.cpp 4588.988 ns/op, ratio=0.685x +tokenizer/preprocessor_plamo2_short emel.cpp 2467.929 ns/op, llama.cpp 3609.229 ns/op, ratio=0.684x +tokenizer/preprocessor_rwkv_long emel.cpp 3121.367 ns/op, llama.cpp 4583.267 ns/op, ratio=0.681x +tokenizer/preprocessor_rwkv_short emel.cpp 2477.188 ns/op, llama.cpp 3683.521 ns/op, ratio=0.673x +tokenizer/preprocessor_spm_long emel.cpp 3127.146 ns/op, llama.cpp 4508.325 ns/op, ratio=0.694x +tokenizer/preprocessor_spm_short emel.cpp 2448.333 ns/op, llama.cpp 3641.775 ns/op, ratio=0.672x +tokenizer/preprocessor_ugm_long emel.cpp 3190.696 ns/op, llama.cpp 4554.075 ns/op, ratio=0.701x +tokenizer/preprocessor_ugm_short emel.cpp 2460.821 ns/op, llama.cpp 3632.079 ns/op, ratio=0.678x +tokenizer/preprocessor_wpm_long emel.cpp 3154.375 ns/op, llama.cpp 4505.400 ns/op, ratio=0.700x +tokenizer/preprocessor_wpm_short emel.cpp 2466.742 ns/op, llama.cpp 3636.067 ns/op, ratio=0.678x diff --git a/snapshots/quality_gates/timing.txt b/snapshots/quality_gates/timing.txt index b5f7b0f8..08dc627c 100644 --- a/snapshots/quality_gates/timing.txt +++ b/snapshots/quality_gates/timing.txt @@ -1,8 +1,8 @@ # quality_gates timing (seconds) build_with_zig 0 -test_with_coverage 46 -paritychecker 4 -fuzz_smoke 17 -bench_snapshot 51 -generate_docs 24 -total 142 +test_with_coverage 71 +paritychecker 5 +fuzz_smoke 28 +bench_snapshot 87 +generate_docs 30 +total 222 diff --git a/src/emel/text/encoders/plamo2/detail.hpp b/src/emel/text/encoders/plamo2/detail.hpp index 03fff7e6..304a1dbc 100644 --- a/src/emel/text/encoders/plamo2/detail.hpp +++ b/src/emel/text/encoders/plamo2/detail.hpp @@ -219,9 +219,12 @@ inline bool ensure_plamo2_tables(emel::text::encoders::plamo2::action::context & for (bool emit_piece = has_score; emit_piece; emit_piece = false) { auto token_it = token_to_id.find(piece); const bool has_token = token_it != token_to_id.end(); + int32_t token_id = -1; + for (bool use_token = has_token; use_token; use_token = false) { + token_id = token_it->second; + } ctx.table[static_cast(table_idx)].piece_length = piece_len; - ctx.table[static_cast(table_idx)].token_id = - select_i32(has_token, token_it->second, -1); + ctx.table[static_cast(table_idx)].token_id = token_id; const float score = score_it->second; const int32_t rounded = static_cast(std::round(score * 1e4f)); ctx.table[static_cast(table_idx)].score = @@ -290,7 +293,11 @@ inline encode_result encode_plamo2(const event::encode &ev, static_cast(ctx.table[p].piece_id); const auto it = ctx.suffix_map.find(piece_code); const bool found = it != ctx.suffix_map.end(); - suffix_id = select_i32(found, it->second, 0); + int32_t found_suffix_id = 0; + for (bool use_suffix = found; use_suffix; use_suffix = false) { + found_suffix_id = it->second; + } + suffix_id = found_suffix_id; const bool stop = suffix_id > 0 || ctx.table[p].score == k_unknown_score; for (bool stop_scan = stop; stop_scan; stop_scan = false) { p = ctx.table.size(); diff --git a/src/emel/text/encoders/ugm/detail.hpp b/src/emel/text/encoders/ugm/detail.hpp index 1f22c3ac..2dfed49d 100644 --- a/src/emel/text/encoders/ugm/detail.hpp +++ b/src/emel/text/encoders/ugm/detail.hpp @@ -332,7 +332,7 @@ inline normalization_result normalize_prefix(const emel::model::data::vocab &voc return {replacement, replacement_len, longest_prefix_length}; } - constexpr std::array replacement = {'\xEF', '\xBF', '\xBD'}; + static constexpr std::array replacement = {'\xEF', '\xBF', '\xBD'}; const uint8_t first = static_cast(input[input_offset]); const bool continuation = (first & 0xC0u) == 0x80u; const size_t len_raw = ugm_utf8_len(static_cast(first)); diff --git a/tests/text/encoders/test_support.hpp b/tests/text/encoders/test_support.hpp index a7ba4960..d6d20756 100644 --- a/tests/text/encoders/test_support.hpp +++ b/tests/text/encoders/test_support.hpp @@ -39,7 +39,7 @@ emel::model::data::vocab & vocab_storage() { return storage; } -size_t sum_offsets(const std::vector & offsets) { +[[maybe_unused]] size_t sum_offsets(const std::vector & offsets) { size_t sum = 0; for (const size_t value : offsets) { sum += value; @@ -52,7 +52,8 @@ struct dispatch_recorder { int error_count = 0; }; -bool record_done(void * owner, const emel::text::encoders::events::encoding_done &) { +[[maybe_unused]] bool record_done(void * owner, + const emel::text::encoders::events::encoding_done &) { if (owner == nullptr) { return false; } @@ -60,7 +61,8 @@ bool record_done(void * owner, const emel::text::encoders::events::encoding_done return true; } -bool record_error(void * owner, const emel::text::encoders::events::encoding_error &) { +[[maybe_unused]] bool record_error(void * owner, + const emel::text::encoders::events::encoding_error &) { if (owner == nullptr) { return false; } @@ -183,4 +185,3 @@ struct vocab_builder { }; } // namespace - diff --git a/tools/bench/CMakeLists.txt b/tools/bench/CMakeLists.txt index d5275fe6..6f1dfadf 100644 --- a/tools/bench/CMakeLists.txt +++ b/tools/bench/CMakeLists.txt @@ -3,10 +3,6 @@ project(emel_bench_tools C CXX) set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_STANDARD_REQUIRED ON) -option(EMEL_ENABLE_TENSOR_PARSER_TEXT_MACHINES - "Build tensor/parser/text machine surfaces" - OFF -) if(NOT EMEL_ROOT) get_filename_component(EMEL_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/../.." ABSOLUTE) @@ -107,9 +103,6 @@ add_subdirectory(${reference_impl_SOURCE_DIR}/ggml ggml) add_subdirectory(${reference_impl_SOURCE_DIR}/src llama_src) set(EMEL_ENABLE_TESTS OFF CACHE BOOL "" FORCE) -set(EMEL_ENABLE_TENSOR_PARSER_TEXT_MACHINES - ${EMEL_ENABLE_TENSOR_PARSER_TEXT_MACHINES} CACHE BOOL "" FORCE -) add_subdirectory(${EMEL_ROOT} emel) find_path(NLOHMANN_JSON_INCLUDE_DIR nlohmann/json.hpp) @@ -123,31 +116,33 @@ set(BENCH_RUNNER_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/memory/recurrent_bench.cpp ${CMAKE_CURRENT_SOURCE_DIR}/memory/hybrid_bench.cpp ${CMAKE_CURRENT_SOURCE_DIR}/sm_any_bench.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/text/jinja/parser_bench.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/text/jinja/formatter_bench.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/text/tokenizer/preprocessor/bpe_bench.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/text/tokenizer/preprocessor/spm_bench.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/text/tokenizer/preprocessor/ugm_bench.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/text/tokenizer/preprocessor/wpm_bench.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/text/tokenizer/preprocessor/rwkv_bench.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/text/tokenizer/preprocessor/plamo2_bench.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/text/encoders/bpe_bench.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/text/encoders/spm_bench.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/text/encoders/wpm_bench.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/text/encoders/ugm_bench.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/text/encoders/rwkv_bench.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/text/encoders/plamo2_bench.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/text/encoders/fallback_bench.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/text/tokenizer/tokenizer_bench.cpp + ${reference_impl_SOURCE_DIR}/common/jinja/lexer.cpp + ${reference_impl_SOURCE_DIR}/common/jinja/parser.cpp + ${reference_impl_SOURCE_DIR}/common/jinja/runtime.cpp + ${reference_impl_SOURCE_DIR}/common/jinja/value.cpp + ${reference_impl_SOURCE_DIR}/common/jinja/string.cpp + ${reference_impl_SOURCE_DIR}/common/jinja/caps.cpp ${CMAKE_CURRENT_SOURCE_DIR}/bench_cases.hpp ${CMAKE_CURRENT_SOURCE_DIR}/bench_common.hpp ${CMAKE_CURRENT_SOURCE_DIR}/bench_main.cpp ) -if(EMEL_ENABLE_TENSOR_PARSER_TEXT_MACHINES) - list(APPEND BENCH_RUNNER_SOURCES - ${CMAKE_CURRENT_SOURCE_DIR}/text/jinja/parser_bench.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/text/jinja/formatter_bench.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/text/tokenizer/preprocessor/bpe_bench.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/text/tokenizer/preprocessor/spm_bench.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/text/tokenizer/preprocessor/ugm_bench.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/text/tokenizer/preprocessor/wpm_bench.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/text/tokenizer/preprocessor/rwkv_bench.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/text/tokenizer/preprocessor/plamo2_bench.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/text/tokenizer/tokenizer_bench.cpp - ${reference_impl_SOURCE_DIR}/common/jinja/lexer.cpp - ${reference_impl_SOURCE_DIR}/common/jinja/parser.cpp - ${reference_impl_SOURCE_DIR}/common/jinja/runtime.cpp - ${reference_impl_SOURCE_DIR}/common/jinja/value.cpp - ${reference_impl_SOURCE_DIR}/common/jinja/string.cpp - ${reference_impl_SOURCE_DIR}/common/jinja/caps.cpp - ) -endif() - add_executable(bench_runner ${BENCH_RUNNER_SOURCES} ) @@ -172,18 +167,6 @@ target_include_directories(bench_runner ${NLOHMANN_JSON_INCLUDE_DIR} ) -if(EMEL_ENABLE_TENSOR_PARSER_TEXT_MACHINES) - target_compile_definitions(bench_runner - PRIVATE - EMEL_ENABLE_TENSOR_PARSER_TEXT_MACHINES=1 - ) -else() - target_compile_definitions(bench_runner - PRIVATE - EMEL_ENABLE_TENSOR_PARSER_TEXT_MACHINES=0 - ) -endif() - add_executable(gbnf_bench_runner ${CMAKE_CURRENT_SOURCE_DIR}/gbnf/parser_bench.cpp ${CMAKE_CURRENT_SOURCE_DIR}/gbnf/bench_main.cpp diff --git a/tools/bench/bench_cases.hpp b/tools/bench/bench_cases.hpp index 683cd156..98cfc8d6 100644 --- a/tools/bench/bench_cases.hpp +++ b/tools/bench/bench_cases.hpp @@ -50,6 +50,20 @@ void append_emel_tokenizer_preprocessor_plamo2_cases(std::vector & resul const config & cfg); void append_reference_tokenizer_preprocessor_plamo2_cases(std::vector & results, const config & cfg); +void append_emel_encoder_bpe_cases(std::vector & results, const config & cfg); +void append_reference_encoder_bpe_cases(std::vector & results, const config & cfg); +void append_emel_encoder_spm_cases(std::vector & results, const config & cfg); +void append_reference_encoder_spm_cases(std::vector & results, const config & cfg); +void append_emel_encoder_wpm_cases(std::vector & results, const config & cfg); +void append_reference_encoder_wpm_cases(std::vector & results, const config & cfg); +void append_emel_encoder_ugm_cases(std::vector & results, const config & cfg); +void append_reference_encoder_ugm_cases(std::vector & results, const config & cfg); +void append_emel_encoder_rwkv_cases(std::vector & results, const config & cfg); +void append_reference_encoder_rwkv_cases(std::vector & results, const config & cfg); +void append_emel_encoder_plamo2_cases(std::vector & results, const config & cfg); +void append_reference_encoder_plamo2_cases(std::vector & results, const config & cfg); +void append_emel_encoder_fallback_cases(std::vector & results, const config & cfg); +void append_reference_encoder_fallback_cases(std::vector & results, const config & cfg); void append_emel_tokenizer_cases(std::vector & results, const config & cfg); void append_reference_tokenizer_cases(std::vector & results, const config & cfg); diff --git a/tools/bench/bench_main.cpp b/tools/bench/bench_main.cpp index 8a074360..0aefdc31 100644 --- a/tools/bench/bench_main.cpp +++ b/tools/bench/bench_main.cpp @@ -50,27 +50,28 @@ std::vector run_emel_benchmarks(const bench::config & cfg, bench::append_emel_memory_kv_cases(results, cfg); bench::append_emel_memory_recurrent_cases(results, cfg); bench::append_emel_memory_hybrid_cases(results, cfg); -#if EMEL_ENABLE_TENSOR_PARSER_TEXT_MACHINES bench::append_emel_jinja_parser_cases(results, cfg); bench::append_emel_jinja_formatter_cases(results, cfg); -#endif bench::append_emel_gbnf_parser_cases(results, cfg); bench::append_emel_logits_cases(results, cfg); bench::append_emel_kernel_cases(results, cfg); bench::append_emel_sm_any_cases(results, cfg); -#if EMEL_ENABLE_TENSOR_PARSER_TEXT_MACHINES bench::append_emel_tokenizer_preprocessor_bpe_cases(results, cfg); bench::append_emel_tokenizer_preprocessor_spm_cases(results, cfg); bench::append_emel_tokenizer_preprocessor_ugm_cases(results, cfg); bench::append_emel_tokenizer_preprocessor_wpm_cases(results, cfg); bench::append_emel_tokenizer_preprocessor_rwkv_cases(results, cfg); bench::append_emel_tokenizer_preprocessor_plamo2_cases(results, cfg); + bench::append_emel_encoder_bpe_cases(results, cfg); + bench::append_emel_encoder_spm_cases(results, cfg); + bench::append_emel_encoder_wpm_cases(results, cfg); + bench::append_emel_encoder_ugm_cases(results, cfg); + bench::append_emel_encoder_rwkv_cases(results, cfg); + bench::append_emel_encoder_plamo2_cases(results, cfg); + bench::append_emel_encoder_fallback_cases(results, cfg); if (include_tokenizer) { bench::append_emel_tokenizer_cases(results, cfg); } -#else - (void)include_tokenizer; -#endif return results; } @@ -82,27 +83,28 @@ std::vector run_reference_benchmarks(const bench::config & cfg, bench::append_reference_memory_kv_cases(results, cfg); bench::append_reference_memory_recurrent_cases(results, cfg); bench::append_reference_memory_hybrid_cases(results, cfg); -#if EMEL_ENABLE_TENSOR_PARSER_TEXT_MACHINES bench::append_reference_jinja_parser_cases(results, cfg); bench::append_reference_jinja_formatter_cases(results, cfg); -#endif bench::append_reference_gbnf_parser_cases(results, cfg); bench::append_reference_logits_cases(results, cfg); bench::append_reference_kernel_cases(results, cfg); bench::append_reference_sm_any_cases(results, cfg); -#if EMEL_ENABLE_TENSOR_PARSER_TEXT_MACHINES bench::append_reference_tokenizer_preprocessor_bpe_cases(results, cfg); bench::append_reference_tokenizer_preprocessor_spm_cases(results, cfg); bench::append_reference_tokenizer_preprocessor_ugm_cases(results, cfg); bench::append_reference_tokenizer_preprocessor_wpm_cases(results, cfg); bench::append_reference_tokenizer_preprocessor_rwkv_cases(results, cfg); bench::append_reference_tokenizer_preprocessor_plamo2_cases(results, cfg); + bench::append_reference_encoder_bpe_cases(results, cfg); + bench::append_reference_encoder_spm_cases(results, cfg); + bench::append_reference_encoder_wpm_cases(results, cfg); + bench::append_reference_encoder_ugm_cases(results, cfg); + bench::append_reference_encoder_rwkv_cases(results, cfg); + bench::append_reference_encoder_plamo2_cases(results, cfg); + bench::append_reference_encoder_fallback_cases(results, cfg); if (include_tokenizer) { bench::append_reference_tokenizer_cases(results, cfg); } -#else - (void)include_tokenizer; -#endif return results; } diff --git a/tools/bench/text/encoders/bench_common.hpp b/tools/bench/text/encoders/bench_common.hpp new file mode 100644 index 00000000..1b6201a7 --- /dev/null +++ b/tools/bench/text/encoders/bench_common.hpp @@ -0,0 +1,171 @@ +#pragma once + +#include "../../bench_common.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "emel/emel.h" +#include "emel/model/data.hpp" +#include "emel/text/encoders/events.hpp" +#include "emel/text/unicode.hpp" + +namespace emel::bench::encoder_bench { + +constexpr size_t k_token_capacity = 4096; + +inline int32_t add_token(emel::model::data::vocab & vocab, + const char * text, + const uint32_t len, + const float score, + const int32_t type) { + const uint32_t offset = vocab.token_bytes_used; + std::memcpy(vocab.token_storage.data() + offset, text, len); + const uint32_t id = vocab.n_tokens; + vocab.entries[id].text_offset = offset; + vocab.entries[id].text_length = len; + vocab.entries[id].score = score; + vocab.entries[id].type = type; + vocab.token_bytes_used += len; + vocab.n_tokens = id + 1; + return static_cast(id); +} + +inline int32_t add_token(emel::model::data::vocab & vocab, + const char * text, + const float score, + const int32_t type) { + return add_token(vocab, text, static_cast(std::strlen(text)), score, type); +} + +inline int32_t add_byte_token(emel::model::data::vocab & vocab, const uint8_t value) { + const std::string token = emel::text::unicode_byte_to_utf8(value); + return add_token(vocab, + token.c_str(), + static_cast(token.size()), + 0.0f, + 6); +} + +inline int32_t add_raw_byte_token(emel::model::data::vocab & vocab, const uint8_t value) { + const char byte = static_cast(value); + return add_token(vocab, &byte, 1u, 0.0f, 6); +} + +inline int32_t add_plamo2_byte_token(emel::model::data::vocab & vocab, const uint8_t value) { + char token[7] = {}; + std::snprintf(token, sizeof(token), "<0x%02X>", value); + return add_token(vocab, token, 0.0f, 6); +} + +inline void add_all_byte_tokens(emel::model::data::vocab & vocab) { + for (int value = 0; value < 256; ++value) { + (void)add_byte_token(vocab, static_cast(value)); + } +} + +inline void add_all_raw_byte_tokens(emel::model::data::vocab & vocab) { + for (int value = 0; value < 256; ++value) { + (void)add_raw_byte_token(vocab, static_cast(value)); + } +} + +inline void add_all_plamo2_byte_tokens(emel::model::data::vocab & vocab) { + for (int value = 0; value < 256; ++value) { + (void)add_plamo2_byte_token(vocab, static_cast(value)); + } +} + +inline std::string make_repeated_text(const int repeats) { + std::string out; + out.reserve(static_cast(repeats) * 12); + for (int i = 0; i < repeats; ++i) { + if (i > 0) { + out += ' '; + } + out += "hello world"; + } + return out; +} + +template +inline bool run_encode(machine_type & machine, + emel::text::encoders::event::encode & request, + int32_t & token_count, + int32_t & err) { + token_count = 0; + err = EMEL_OK; + const bool accepted = machine.process_event(request); + return accepted && err == EMEL_OK; +} + +template +inline void ensure_encodes(machine_type & machine, + emel::text::encoders::event::encode & request, + const char * label) { + int32_t token_count = 0; + int32_t err = EMEL_OK; + if (!run_encode(machine, request, token_count, err)) { + std::fprintf(stderr, + "error: encoder failed to process text (%s, err=%d)\n", + label, + err); + std::abort(); + } +} + +template +inline void append_emel_encoder_cases(std::vector & results, + const config & cfg, + const char * short_name, + const char * long_name, + build_vocab_fn build_vocab, + const bool preprocessed, + const int short_repeats = 1, + const int long_repeats = 64) { + const std::string short_text = make_repeated_text(short_repeats); + const std::string long_text = make_repeated_text(long_repeats); + auto vocab = build_vocab(); + + machine_type machine{}; + std::array tokens = {}; + int32_t token_count = 0; + int32_t err = EMEL_OK; + + emel::text::encoders::event::encode short_request{ + .vocab = *vocab, + .text = short_text, + .preprocessed = preprocessed, + .token_ids = std::span(tokens.data(), tokens.size()), + .token_count_out = &token_count, + .error_out = &err, + }; + emel::text::encoders::event::encode long_request{ + .vocab = *vocab, + .text = long_text, + .preprocessed = preprocessed, + .token_ids = std::span(tokens.data(), tokens.size()), + .token_count_out = &token_count, + .error_out = &err, + }; + + ensure_encodes(machine, short_request, short_name); + ensure_encodes(machine, long_request, long_name); + + auto short_fn = [&]() { (void)run_encode(machine, short_request, token_count, err); }; + auto long_fn = [&]() { (void)run_encode(machine, long_request, token_count, err); }; + + results.push_back(measure_case(short_name, cfg, short_fn)); + results.push_back(measure_case(long_name, cfg, long_fn)); +} + +} // namespace emel::bench::encoder_bench diff --git a/tools/bench/text/encoders/bpe_bench.cpp b/tools/bench/text/encoders/bpe_bench.cpp new file mode 100644 index 00000000..3ae199fb --- /dev/null +++ b/tools/bench/text/encoders/bpe_bench.cpp @@ -0,0 +1,38 @@ +#include "bench_cases.hpp" +#include "bench_common.hpp" + +#include +#include + +#include "emel/model/data.hpp" +#include "emel/text/encoders/bpe/sm.hpp" + +namespace { + +std::unique_ptr make_bpe_vocab() { + auto vocab = std::make_unique(); + vocab->tokenizer_model_id = emel::model::data::tokenizer_model::BPE; + vocab->tokenizer_pre_id = emel::model::data::tokenizer_pre::GPT2; + vocab->ignore_merges = true; + emel::bench::encoder_bench::add_all_byte_tokens(*vocab); + (void)emel::bench::encoder_bench::add_token(*vocab, "hello", 0.5f, 1); + (void)emel::bench::encoder_bench::add_token(*vocab, "world", 0.5f, 1); + (void)emel::bench::encoder_bench::add_token(*vocab, " ", 0.1f, 1); + return vocab; +} + +} // namespace + +namespace emel::bench { + +void append_emel_encoder_bpe_cases(std::vector & results, const config & cfg) { + encoder_bench::append_emel_encoder_cases( + results, cfg, "text/encoders/bpe_short", "text/encoders/bpe_long", make_bpe_vocab, true); +} + +void append_reference_encoder_bpe_cases(std::vector & results, const config & cfg) { + // Reference encoder benchmarks reuse the EMEL path until llama.cpp parity is wired. + append_emel_encoder_bpe_cases(results, cfg); +} + +} // namespace emel::bench diff --git a/tools/bench/text/encoders/fallback_bench.cpp b/tools/bench/text/encoders/fallback_bench.cpp new file mode 100644 index 00000000..74f0c197 --- /dev/null +++ b/tools/bench/text/encoders/fallback_bench.cpp @@ -0,0 +1,38 @@ +#include "bench_cases.hpp" +#include "bench_common.hpp" + +#include +#include + +#include "emel/model/data.hpp" +#include "emel/text/encoders/fallback/sm.hpp" + +namespace { + +std::unique_ptr make_fallback_vocab() { + auto vocab = std::make_unique(); + vocab->tokenizer_model_id = emel::model::data::tokenizer_model::UNKNOWN; + emel::bench::encoder_bench::add_all_raw_byte_tokens(*vocab); + return vocab; +} + +} // namespace + +namespace emel::bench { + +void append_emel_encoder_fallback_cases(std::vector & results, const config & cfg) { + encoder_bench::append_emel_encoder_cases( + results, + cfg, + "text/encoders/fallback_short", + "text/encoders/fallback_long", + make_fallback_vocab, + false); +} + +void append_reference_encoder_fallback_cases(std::vector & results, const config & cfg) { + // Reference encoder benchmarks reuse the EMEL path until llama.cpp parity is wired. + append_emel_encoder_fallback_cases(results, cfg); +} + +} // namespace emel::bench diff --git a/tools/bench/text/encoders/plamo2_bench.cpp b/tools/bench/text/encoders/plamo2_bench.cpp new file mode 100644 index 00000000..0d50bacf --- /dev/null +++ b/tools/bench/text/encoders/plamo2_bench.cpp @@ -0,0 +1,40 @@ +#include "bench_cases.hpp" +#include "bench_common.hpp" + +#include +#include + +#include "emel/model/data.hpp" +#include "emel/text/encoders/plamo2/sm.hpp" + +namespace { + +std::unique_ptr make_plamo2_vocab() { + auto vocab = std::make_unique(); + vocab->tokenizer_model_id = emel::model::data::tokenizer_model::PLAMO2; + // Keep id=0 non-byte so parsed byte ids are all non-zero in the table-complete check. + (void)emel::bench::encoder_bench::add_token(*vocab, "", 0.0f, 1); + emel::bench::encoder_bench::add_all_plamo2_byte_tokens(*vocab); + return vocab; +} + +} // namespace + +namespace emel::bench { + +void append_emel_encoder_plamo2_cases(std::vector & results, const config & cfg) { + encoder_bench::append_emel_encoder_cases( + results, + cfg, + "text/encoders/plamo2_short", + "text/encoders/plamo2_long", + make_plamo2_vocab, + false); +} + +void append_reference_encoder_plamo2_cases(std::vector & results, const config & cfg) { + // Reference encoder benchmarks reuse the EMEL path until llama.cpp parity is wired. + append_emel_encoder_plamo2_cases(results, cfg); +} + +} // namespace emel::bench diff --git a/tools/bench/text/encoders/rwkv_bench.cpp b/tools/bench/text/encoders/rwkv_bench.cpp new file mode 100644 index 00000000..b26599cc --- /dev/null +++ b/tools/bench/text/encoders/rwkv_bench.cpp @@ -0,0 +1,43 @@ +#include "bench_cases.hpp" +#include "bench_common.hpp" + +#include +#include + +#include "emel/model/data.hpp" +#include "emel/text/encoders/rwkv/sm.hpp" + +namespace { + +std::unique_ptr make_rwkv_vocab() { + auto vocab = std::make_unique(); + vocab->tokenizer_model_id = emel::model::data::tokenizer_model::RWKV; + emel::bench::encoder_bench::add_all_byte_tokens(*vocab); + (void)emel::bench::encoder_bench::add_token(*vocab, "hello", 0.5f, 1); + (void)emel::bench::encoder_bench::add_token(*vocab, "world", 0.5f, 1); + (void)emel::bench::encoder_bench::add_token(*vocab, " ", 0.1f, 1); + return vocab; +} + +} // namespace + +namespace emel::bench { + +void append_emel_encoder_rwkv_cases(std::vector & results, const config & cfg) { + encoder_bench::append_emel_encoder_cases( + results, + cfg, + "text/encoders/rwkv_short", + "text/encoders/rwkv_long", + make_rwkv_vocab, + false, + 16, + 64); +} + +void append_reference_encoder_rwkv_cases(std::vector & results, const config & cfg) { + // Reference encoder benchmarks reuse the EMEL path until llama.cpp parity is wired. + append_emel_encoder_rwkv_cases(results, cfg); +} + +} // namespace emel::bench diff --git a/tools/bench/text/encoders/spm_bench.cpp b/tools/bench/text/encoders/spm_bench.cpp new file mode 100644 index 00000000..12967f35 --- /dev/null +++ b/tools/bench/text/encoders/spm_bench.cpp @@ -0,0 +1,37 @@ +#include "bench_cases.hpp" +#include "bench_common.hpp" + +#include +#include + +#include "emel/model/data.hpp" +#include "emel/text/encoders/spm/sm.hpp" + +namespace { + +std::unique_ptr make_spm_vocab() { + auto vocab = std::make_unique(); + vocab->tokenizer_model_id = emel::model::data::tokenizer_model::SPM; + vocab->add_space_prefix = true; + emel::bench::encoder_bench::add_all_plamo2_byte_tokens(*vocab); + (void)emel::bench::encoder_bench::add_token(*vocab, "\xE2\x96\x81" "hello", 0.5f, 1); + (void)emel::bench::encoder_bench::add_token(*vocab, "\xE2\x96\x81" "world", 0.5f, 1); + (void)emel::bench::encoder_bench::add_token(*vocab, "\xE2\x96\x81", 0.1f, 1); + return vocab; +} + +} // namespace + +namespace emel::bench { + +void append_emel_encoder_spm_cases(std::vector & results, const config & cfg) { + encoder_bench::append_emel_encoder_cases( + results, cfg, "text/encoders/spm_short", "text/encoders/spm_long", make_spm_vocab, false); +} + +void append_reference_encoder_spm_cases(std::vector & results, const config & cfg) { + // Reference encoder benchmarks reuse the EMEL path until llama.cpp parity is wired. + append_emel_encoder_spm_cases(results, cfg); +} + +} // namespace emel::bench diff --git a/tools/bench/text/encoders/ugm_bench.cpp b/tools/bench/text/encoders/ugm_bench.cpp new file mode 100644 index 00000000..79e55e45 --- /dev/null +++ b/tools/bench/text/encoders/ugm_bench.cpp @@ -0,0 +1,37 @@ +#include "bench_cases.hpp" +#include "bench_common.hpp" + +#include +#include + +#include "emel/model/data.hpp" +#include "emel/text/encoders/ugm/sm.hpp" + +namespace { + +std::unique_ptr make_ugm_vocab() { + auto vocab = std::make_unique(); + vocab->tokenizer_model_id = emel::model::data::tokenizer_model::UGM; + vocab->add_space_prefix = true; + const int32_t unk_id = emel::bench::encoder_bench::add_token(*vocab, "", 0.0f, 2); + vocab->unk_id = unk_id; + (void)emel::bench::encoder_bench::add_token(*vocab, "\xE2\x96\x81" "hello", 0.5f, 1); + (void)emel::bench::encoder_bench::add_token(*vocab, "\xE2\x96\x81" "world", 0.5f, 1); + return vocab; +} + +} // namespace + +namespace emel::bench { + +void append_emel_encoder_ugm_cases(std::vector & results, const config & cfg) { + encoder_bench::append_emel_encoder_cases( + results, cfg, "text/encoders/ugm_short", "text/encoders/ugm_long", make_ugm_vocab, false); +} + +void append_reference_encoder_ugm_cases(std::vector & results, const config & cfg) { + // Reference encoder benchmarks reuse the EMEL path until llama.cpp parity is wired. + append_emel_encoder_ugm_cases(results, cfg); +} + +} // namespace emel::bench diff --git a/tools/bench/text/encoders/wpm_bench.cpp b/tools/bench/text/encoders/wpm_bench.cpp new file mode 100644 index 00000000..6d67da27 --- /dev/null +++ b/tools/bench/text/encoders/wpm_bench.cpp @@ -0,0 +1,36 @@ +#include "bench_cases.hpp" +#include "bench_common.hpp" + +#include +#include + +#include "emel/model/data.hpp" +#include "emel/text/encoders/wpm/sm.hpp" + +namespace { + +std::unique_ptr make_wpm_vocab() { + auto vocab = std::make_unique(); + vocab->tokenizer_model_id = emel::model::data::tokenizer_model::WPM; + const int32_t unk_id = emel::bench::encoder_bench::add_token(*vocab, "", 0.0f, 2); + vocab->unk_id = unk_id; + (void)emel::bench::encoder_bench::add_token(*vocab, "\xE2\x96\x81" "hello", 0.5f, 1); + (void)emel::bench::encoder_bench::add_token(*vocab, "\xE2\x96\x81" "world", 0.5f, 1); + return vocab; +} + +} // namespace + +namespace emel::bench { + +void append_emel_encoder_wpm_cases(std::vector & results, const config & cfg) { + encoder_bench::append_emel_encoder_cases( + results, cfg, "text/encoders/wpm_short", "text/encoders/wpm_long", make_wpm_vocab, false); +} + +void append_reference_encoder_wpm_cases(std::vector & results, const config & cfg) { + // Reference encoder benchmarks reuse the EMEL path until llama.cpp parity is wired. + append_emel_encoder_wpm_cases(results, cfg); +} + +} // namespace emel::bench diff --git a/tools/docsgen/CMakeLists.txt b/tools/docsgen/CMakeLists.txt index 19eb89fb..e122cca9 100644 --- a/tools/docsgen/CMakeLists.txt +++ b/tools/docsgen/CMakeLists.txt @@ -4,10 +4,6 @@ project(docsgen LANGUAGES CXX) set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) -option(EMEL_ENABLE_TENSOR_PARSER_TEXT_MACHINES - "Build tensor/parser/text machine surfaces" - OFF -) include(FetchContent) include("${CMAKE_CURRENT_LIST_DIR}/../../cmake/sml_version.cmake") @@ -22,11 +18,6 @@ set(ROOT_DIR "${CMAKE_CURRENT_LIST_DIR}/../..") file(GLOB_RECURSE SM_HEADERS CONFIGURE_DEPENDS "${ROOT_DIR}/src/emel/**/sm.hpp") list(FILTER SM_HEADERS EXCLUDE REGEX ".*/src/emel/sm.hpp$") -if(NOT EMEL_ENABLE_TENSOR_PARSER_TEXT_MACHINES) - list(FILTER SM_HEADERS EXCLUDE REGEX ".*/src/emel/parser/.*") - list(FILTER SM_HEADERS EXCLUDE REGEX ".*/src/emel/text/.*") - list(FILTER SM_HEADERS EXCLUDE REGEX ".*/src/emel/tensor/.*") -endif() list(SORT SM_HEADERS) set(DOCS_ENABLED_SM_HEADERS "") diff --git a/tools/paritychecker/CMakeLists.txt b/tools/paritychecker/CMakeLists.txt index 51414f0c..ef554076 100644 --- a/tools/paritychecker/CMakeLists.txt +++ b/tools/paritychecker/CMakeLists.txt @@ -108,6 +108,14 @@ add_subdirectory(${EMEL_ROOT} emel) add_executable(paritychecker ${CMAKE_CURRENT_SOURCE_DIR}/parity_main.cpp ${CMAKE_CURRENT_SOURCE_DIR}/parity_runner.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/tokenizer_parity_common.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/tokenizer_spm_parity.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/tokenizer_bpe_parity.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/tokenizer_wpm_parity.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/tokenizer_ugm_parity.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/tokenizer_rwkv_parity.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/tokenizer_plamo2_parity.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/tokenizer_fallback_parity.cpp ) target_link_libraries(paritychecker @@ -135,6 +143,14 @@ endif() add_executable(paritychecker_tests ${CMAKE_CURRENT_SOURCE_DIR}/paritychecker_tests.cpp ${CMAKE_CURRENT_SOURCE_DIR}/parity_runner.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/tokenizer_parity_common.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/tokenizer_spm_parity.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/tokenizer_bpe_parity.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/tokenizer_wpm_parity.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/tokenizer_ugm_parity.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/tokenizer_rwkv_parity.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/tokenizer_plamo2_parity.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/tokenizer_fallback_parity.cpp ) target_link_libraries(paritychecker_tests diff --git a/tools/paritychecker/parity_runner.cpp b/tools/paritychecker/parity_runner.cpp index 1e11beef..542f9e01 100644 --- a/tools/paritychecker/parity_runner.cpp +++ b/tools/paritychecker/parity_runner.cpp @@ -1,8 +1,12 @@ #include "parity_runner.hpp" +#include "tokenizer_parity.hpp" +#include +#include #include #include #include +#include #include #include #include @@ -15,10 +19,12 @@ #include "emel/kernel/events.hpp" #include "emel/kernel/x86_64/context.hpp" #include "emel/kernel/x86_64/detail.hpp" +#include "emel/model/data.hpp" #include "ggml-cpu.h" #include "ggml.h" #include "llama-grammar.h" +#include "llama-vocab.h" namespace { @@ -173,6 +179,330 @@ void dump_llama_grammar(const llama_grammar_rules & rules) { } } +struct llama_backend_guard { + llama_backend_guard() { + llama_backend_init(); + } + + ~llama_backend_guard() { + llama_backend_free(); + } +}; + +template +void copy_name(std::array & dst, const std::string & value) { + static_assert(k_array_size > 0, "copy_name requires non-empty destination"); + dst.fill('\0'); + const size_t copy_len = std::min(value.size(), k_array_size - 1); + if (copy_len > 0) { + std::memcpy(dst.data(), value.data(), copy_len); + } +} + +template +void set_token_flag(std::array & flags, const uint32_t token_id) { + const uint32_t byte_index = token_id >> 3u; + if (byte_index >= k_array_size) { + return; + } + const uint8_t bit = static_cast(1u << (token_id & 7u)); + flags[byte_index] = static_cast(flags[byte_index] | bit); +} + +bool attr_has(const llama_token_attr attr, const llama_token_attr flag) { + const uint32_t attr_bits = static_cast(attr); + const uint32_t flag_bits = static_cast(flag); + return (attr_bits & flag_bits) != 0u; +} + +int32_t token_type_from_attr(const llama_token_attr attr) { + if (attr_has(attr, LLAMA_TOKEN_ATTR_UNKNOWN)) { + return static_cast(LLAMA_TOKEN_TYPE_UNKNOWN); + } + if (attr_has(attr, LLAMA_TOKEN_ATTR_CONTROL)) { + return static_cast(LLAMA_TOKEN_TYPE_CONTROL); + } + if (attr_has(attr, LLAMA_TOKEN_ATTR_USER_DEFINED)) { + return static_cast(LLAMA_TOKEN_TYPE_USER_DEFINED); + } + if (attr_has(attr, LLAMA_TOKEN_ATTR_UNUSED)) { + return static_cast(LLAMA_TOKEN_TYPE_UNUSED); + } + if (attr_has(attr, LLAMA_TOKEN_ATTR_BYTE)) { + return static_cast(LLAMA_TOKEN_TYPE_BYTE); + } + if (attr_has(attr, LLAMA_TOKEN_ATTR_NORMAL)) { + return static_cast(LLAMA_TOKEN_TYPE_NORMAL); + } + return static_cast(LLAMA_TOKEN_TYPE_UNDEFINED); +} + +emel::model::data::tokenizer_model to_emel_tokenizer_model( + const enum llama_vocab_type type) { + using tokenizer_model = emel::model::data::tokenizer_model; + + switch (type) { + case LLAMA_VOCAB_TYPE_NONE: + return tokenizer_model::NONE; + case LLAMA_VOCAB_TYPE_SPM: + return tokenizer_model::SPM; + case LLAMA_VOCAB_TYPE_BPE: + return tokenizer_model::BPE; + case LLAMA_VOCAB_TYPE_WPM: + return tokenizer_model::WPM; + case LLAMA_VOCAB_TYPE_UGM: + return tokenizer_model::UGM; + case LLAMA_VOCAB_TYPE_RWKV: + return tokenizer_model::RWKV; + case LLAMA_VOCAB_TYPE_PLAMO2: + return tokenizer_model::PLAMO2; + default: + return tokenizer_model::UNKNOWN; + } +} + +emel::model::data::tokenizer_pre to_emel_tokenizer_pre( + const llama_vocab_pre_type type) { + using tokenizer_pre = emel::model::data::tokenizer_pre; + + switch (type) { + case LLAMA_VOCAB_PRE_TYPE_DEFAULT: + return tokenizer_pre::DEFAULT; + case LLAMA_VOCAB_PRE_TYPE_LLAMA3: + return tokenizer_pre::LLAMA3; + case LLAMA_VOCAB_PRE_TYPE_JAIS2: + return tokenizer_pre::JAIS2; + case LLAMA_VOCAB_PRE_TYPE_DBRX: + return tokenizer_pre::DBRX; + case LLAMA_VOCAB_PRE_TYPE_SMAUG: + return tokenizer_pre::SMAUG; + case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM: + return tokenizer_pre::DEEPSEEK_LLM; + case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER: + return tokenizer_pre::DEEPSEEK_CODER; + case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM: + return tokenizer_pre::DEEPSEEK3_LLM; + case LLAMA_VOCAB_PRE_TYPE_YOUTU: + return tokenizer_pre::YOUTU; + case LLAMA_VOCAB_PRE_TYPE_FALCON: + return tokenizer_pre::FALCON; + case LLAMA_VOCAB_PRE_TYPE_MPT: + return tokenizer_pre::MPT; + case LLAMA_VOCAB_PRE_TYPE_STARCODER: + return tokenizer_pre::STARCODER; + case LLAMA_VOCAB_PRE_TYPE_GPT2: + return tokenizer_pre::GPT2; + case LLAMA_VOCAB_PRE_TYPE_JAIS: + return tokenizer_pre::JAIS; + case LLAMA_VOCAB_PRE_TYPE_REFACT: + return tokenizer_pre::REFACT; + case LLAMA_VOCAB_PRE_TYPE_COMMAND_R: + return tokenizer_pre::COMMAND_R; + case LLAMA_VOCAB_PRE_TYPE_QWEN2: + return tokenizer_pre::QWEN2; + case LLAMA_VOCAB_PRE_TYPE_QWEN35: + return tokenizer_pre::QWEN35; + case LLAMA_VOCAB_PRE_TYPE_STABLELM2: + return tokenizer_pre::STABLELM2; + case LLAMA_VOCAB_PRE_TYPE_OLMO: + return tokenizer_pre::OLMO; + case LLAMA_VOCAB_PRE_TYPE_PORO: + return tokenizer_pre::PORO; + case LLAMA_VOCAB_PRE_TYPE_CHATGLM4: + return tokenizer_pre::CHATGLM4; + case LLAMA_VOCAB_PRE_TYPE_VIKING: + return tokenizer_pre::VIKING; + case LLAMA_VOCAB_PRE_TYPE_TEKKEN: + return tokenizer_pre::TEKKEN; + case LLAMA_VOCAB_PRE_TYPE_SMOLLM: + return tokenizer_pre::SMOLLM; + case LLAMA_VOCAB_PRE_TYPE_CODESHELL: + return tokenizer_pre::CODESHELL; + case LLAMA_VOCAB_PRE_TYPE_BLOOM: + return tokenizer_pre::BLOOM; + case LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH: + return tokenizer_pre::GPT3_FINNISH; + case LLAMA_VOCAB_PRE_TYPE_EXAONE: + return tokenizer_pre::EXAONE; + case LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE: + return tokenizer_pre::EXAONE_MOE; + case LLAMA_VOCAB_PRE_TYPE_CHAMELEON: + return tokenizer_pre::CHAMELEON; + case LLAMA_VOCAB_PRE_TYPE_MINERVA: + return tokenizer_pre::MINERVA; + case LLAMA_VOCAB_PRE_TYPE_GPT4O: + return tokenizer_pre::GPT4O; + case LLAMA_VOCAB_PRE_TYPE_TINY_AYA: + return tokenizer_pre::TINY_AYA; + case LLAMA_VOCAB_PRE_TYPE_SUPERBPE: + return tokenizer_pre::SUPERBPE; + case LLAMA_VOCAB_PRE_TYPE_TRILLION: + return tokenizer_pre::TRILLION; + case LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING: + return tokenizer_pre::GRANITE_DOCLING; + case LLAMA_VOCAB_PRE_TYPE_BAILINGMOE: + return tokenizer_pre::BAILINGMOE; + case LLAMA_VOCAB_PRE_TYPE_SEED_CODER: + return tokenizer_pre::SEED_CODER; + case LLAMA_VOCAB_PRE_TYPE_HUNYUAN: + return tokenizer_pre::HUNYUAN; + case LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE: + return tokenizer_pre::HUNYUAN_DENSE; + case LLAMA_VOCAB_PRE_TYPE_JOYAI_LLM: + return tokenizer_pre::JOYAI_LLM; + case LLAMA_VOCAB_PRE_TYPE_KIMI_K2: + return tokenizer_pre::KIMI_K2; + case LLAMA_VOCAB_PRE_TYPE_GROK_2: + return tokenizer_pre::GROK_2; + case LLAMA_VOCAB_PRE_TYPE_AFMOE: + return tokenizer_pre::AFMOE; + case LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2: + return tokenizer_pre::MINIMAX_M2; + case LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN: + return tokenizer_pre::SOLAR_OPEN; + case LLAMA_VOCAB_PRE_TYPE_CHATGLM3: + case LLAMA_VOCAB_PRE_TYPE_LLAMA4: + case LLAMA_VOCAB_PRE_TYPE_PIXTRAL: + return tokenizer_pre::UNKNOWN; + default: + return tokenizer_pre::UNKNOWN; + } +} + +bool load_emel_vocab_from_llama(const llama_vocab & src, emel::model::data::vocab & dst) { + dst = {}; + dst.tokenizer_model_id = to_emel_tokenizer_model(src.get_type()); + dst.tokenizer_pre_id = to_emel_tokenizer_pre(src.get_pre_type()); + copy_name(dst.tokenizer_model_name, src.get_tokenizer_model()); + copy_name(dst.tokenizer_pre_name, src.get_tokenizer_pre()); + + const uint32_t token_count = src.n_tokens(); + if (token_count > emel::model::data::k_max_vocab_tokens) { + std::fprintf(stderr, + "vocab token count exceeds emel capacity: %u > %d\n", + token_count, + emel::model::data::k_max_vocab_tokens); + return false; + } + dst.n_tokens = token_count; + dst.n_token_types = src.n_token_types(); + + uint32_t token_bytes_used = 0; + for (uint32_t token_id = 0; token_id < token_count; ++token_id) { + const llama_token llama_id = static_cast(token_id); + const auto & token = src.get_token_data(llama_id); + const uint32_t token_len = static_cast(token.text.size()); + if (token_bytes_used + token_len > emel::model::data::k_max_vocab_bytes) { + std::fprintf(stderr, + "token storage exceeds emel capacity at token %u (%u + %u > %d)\n", + token_id, + token_bytes_used, + token_len, + emel::model::data::k_max_vocab_bytes); + return false; + } + + if (token_len > 0) { + std::memcpy(dst.token_storage.data() + token_bytes_used, + token.text.data(), + token_len); + } + + emel::model::data::vocab_entry & entry = dst.entries[token_id]; + entry.text_offset = token_bytes_used; + entry.text_length = token_len; + entry.score = token.score; + entry.type = token_type_from_attr(token.attr); + token_bytes_used += token_len; + + if (attr_has(token.attr, LLAMA_TOKEN_ATTR_LSTRIP)) { + set_token_flag(dst.lstrip_flags, token_id); + } + if (attr_has(token.attr, LLAMA_TOKEN_ATTR_RSTRIP)) { + set_token_flag(dst.rstrip_flags, token_id); + } + } + dst.token_bytes_used = token_bytes_used; + + const std::vector merges = src.get_bpe_merges(); + if (merges.size() > emel::model::data::k_max_merges) { + std::fprintf(stderr, + "merge count exceeds emel capacity: %zu > %d\n", + merges.size(), + emel::model::data::k_max_merges); + return false; + } + + uint32_t merge_bytes_used = 0; + for (size_t i = 0; i < merges.size(); ++i) { + const std::string & merge = merges[i]; + const uint32_t merge_len = static_cast(merge.size()); + if (merge_bytes_used + merge_len > emel::model::data::k_max_merge_bytes) { + std::fprintf(stderr, + "merge storage exceeds emel capacity at merge %zu (%u + %u > %d)\n", + i, + merge_bytes_used, + merge_len, + emel::model::data::k_max_merge_bytes); + return false; + } + if (merge_len > 0) { + std::memcpy(dst.merge_storage.data() + merge_bytes_used, + merge.data(), + merge_len); + } + dst.merge_offsets[i] = merge_bytes_used; + dst.merge_lengths[i] = merge_len; + merge_bytes_used += merge_len; + } + dst.n_merges = static_cast(merges.size()); + dst.merge_bytes_used = merge_bytes_used; + + const std::vector precompiled_charsmap = src.get_precompiled_charsmap(); + if (precompiled_charsmap.size() > emel::model::data::k_max_precompiled_charsmap_bytes) { + std::fprintf(stderr, + "precompiled charsmap exceeds emel capacity: %zu > %d\n", + precompiled_charsmap.size(), + emel::model::data::k_max_precompiled_charsmap_bytes); + return false; + } + if (!precompiled_charsmap.empty()) { + std::memcpy(dst.precompiled_charsmap.data(), + precompiled_charsmap.data(), + precompiled_charsmap.size()); + } + dst.precompiled_charsmap_size = static_cast(precompiled_charsmap.size()); + + dst.bos_id = src.token_bos(); + dst.eos_id = src.token_eos(); + dst.eot_id = src.token_eot(); + dst.eom_id = src.token_eom(); + dst.unk_id = src.token_unk(); + dst.sep_id = src.token_sep(); + dst.pad_id = src.token_pad(); + dst.mask_id = src.token_mask(); + dst.prefix_id = src.token_prefix(); + dst.suffix_id = src.token_suffix(); + dst.middle_id = src.token_middle(); + dst.fim_pre_id = src.token_fim_pre(); + dst.fim_suf_id = src.token_fim_suf(); + dst.fim_mid_id = src.token_fim_mid(); + dst.fim_pad_id = src.token_fim_pad(); + dst.fim_rep_id = src.token_fim_rep(); + dst.fim_sep_id = src.token_fim_sep(); + + dst.add_bos = src.get_add_bos(); + dst.add_eos = src.get_add_eos(); + dst.add_sep = src.get_add_sep(); + dst.add_space_prefix = src.get_add_space_prefix(); + dst.ignore_merges = src.get_ignore_merges(); + dst.remove_extra_whitespaces = src.get_remove_extra_whitespaces(); + dst.escape_whitespaces = src.get_escape_whitespaces(); + dst.treat_whitespace_as_suffix = src.get_treat_whitespace_as_suffix(); + + return true; +} + constexpr double k_f32_rtol = 1e-5; constexpr double k_f32_atol = 1e-6; @@ -739,9 +1069,59 @@ int run_kernel_parity(const emel::paritychecker::parity_options &) { return 1; } -int run_tokenizer_parity(const emel::paritychecker::parity_options &) { - std::fprintf(stderr, "tokenizer parity is scaffolded\n"); - return 1; +int run_tokenizer_parity(const emel::paritychecker::parity_options & opts) { + llama_backend_guard backend_guard{}; + + llama_model_params model_params = llama_model_default_params(); + model_params.vocab_only = true; + model_params.check_tensors = false; + + std::unique_ptr model( + llama_model_load_from_file(opts.model_path.c_str(), model_params), + llama_model_free); + if (model == nullptr) { + std::fprintf(stderr, "failed to load model: %s\n", opts.model_path.c_str()); + return 1; + } + + const llama_vocab * llama_vocab_ptr = llama_model_get_vocab(model.get()); + if (llama_vocab_ptr == nullptr) { + std::fprintf(stderr, "model has no vocabulary: %s\n", opts.model_path.c_str()); + return 1; + } + + auto emel_vocab = std::make_unique(); + if (!load_emel_vocab_from_llama(*llama_vocab_ptr, *emel_vocab)) { + std::fprintf(stderr, "failed to map llama vocab into emel layout\n"); + return 1; + } + + using tokenizer_model = emel::model::data::tokenizer_model; + switch (emel_vocab->tokenizer_model_id) { + case tokenizer_model::SPM: + return emel::paritychecker::run_tokenizer_spm_parity( + opts, *llama_vocab_ptr, *emel_vocab); + case tokenizer_model::BPE: + return emel::paritychecker::run_tokenizer_bpe_parity( + opts, *llama_vocab_ptr, *emel_vocab); + case tokenizer_model::WPM: + return emel::paritychecker::run_tokenizer_wpm_parity( + opts, *llama_vocab_ptr, *emel_vocab); + case tokenizer_model::UGM: + return emel::paritychecker::run_tokenizer_ugm_parity( + opts, *llama_vocab_ptr, *emel_vocab); + case tokenizer_model::RWKV: + return emel::paritychecker::run_tokenizer_rwkv_parity( + opts, *llama_vocab_ptr, *emel_vocab); + case tokenizer_model::PLAMO2: + return emel::paritychecker::run_tokenizer_plamo2_parity( + opts, *llama_vocab_ptr, *emel_vocab); + case tokenizer_model::NONE: + case tokenizer_model::UNKNOWN: + default: + return emel::paritychecker::run_tokenizer_fallback_parity( + opts, *llama_vocab_ptr, *emel_vocab); + } } int run_gbnf_parser_parity(const emel::paritychecker::parity_options & opts) { diff --git a/tools/paritychecker/paritychecker_tests.cpp b/tools/paritychecker/paritychecker_tests.cpp index 6fe0ab86..60e2c482 100644 --- a/tools/paritychecker/paritychecker_tests.cpp +++ b/tools/paritychecker/paritychecker_tests.cpp @@ -226,11 +226,7 @@ TEST_CASE("paritychecker matches llama tokens across tiny models") { for (const auto & test_case : cases) { INFO("case: " << test_case.label); REQUIRE(file_exists(test_case.text_path)); - if (!run_paritychecker_process(model, test_case)) { - WARN("paritychecker skipped (emel parser scaffolded)"); - return; - } - CHECK(true); + CHECK(run_paritychecker_process(model, test_case)); } const std::string special_text = special_text_for_model(model); if (!special_text.empty()) { @@ -241,11 +237,7 @@ TEST_CASE("paritychecker matches llama tokens across tiny models") { special_case.text_path = special_text; special_case.add_special = true; special_case.parse_special = true; - if (!run_paritychecker_process(model, special_case)) { - WARN("paritychecker skipped (emel parser scaffolded)"); - return; - } - CHECK(true); + CHECK(run_paritychecker_process(model, special_case)); } } } diff --git a/tools/paritychecker/tokenizer_bpe_parity.cpp b/tools/paritychecker/tokenizer_bpe_parity.cpp new file mode 100644 index 00000000..28dd6146 --- /dev/null +++ b/tools/paritychecker/tokenizer_bpe_parity.cpp @@ -0,0 +1,17 @@ +#include "tokenizer_parity.hpp" + +namespace emel::paritychecker { + +int run_tokenizer_bpe_parity(const parity_options & opts, + const llama_vocab & llama_vocab_ref, + const emel::model::data::vocab & emel_vocab) { + return run_tokenizer_variant_parity( + opts, + llama_vocab_ref, + emel_vocab, + emel::text::tokenizer::preprocessor::preprocessor_kind::bpe, + emel::text::encoders::encoder_kind::bpe, + "bpe"); +} + +} // namespace emel::paritychecker diff --git a/tools/paritychecker/tokenizer_fallback_parity.cpp b/tools/paritychecker/tokenizer_fallback_parity.cpp new file mode 100644 index 00000000..dde0c302 --- /dev/null +++ b/tools/paritychecker/tokenizer_fallback_parity.cpp @@ -0,0 +1,17 @@ +#include "tokenizer_parity.hpp" + +namespace emel::paritychecker { + +int run_tokenizer_fallback_parity(const parity_options & opts, + const llama_vocab & llama_vocab_ref, + const emel::model::data::vocab & emel_vocab) { + return run_tokenizer_variant_parity( + opts, + llama_vocab_ref, + emel_vocab, + emel::text::tokenizer::preprocessor::preprocessor_kind::fallback, + emel::text::encoders::encoder_kind::fallback, + "fallback"); +} + +} // namespace emel::paritychecker diff --git a/tools/paritychecker/tokenizer_parity.hpp b/tools/paritychecker/tokenizer_parity.hpp new file mode 100644 index 00000000..e477dabb --- /dev/null +++ b/tools/paritychecker/tokenizer_parity.hpp @@ -0,0 +1,49 @@ +#pragma once + +#include "parity_runner.hpp" + +#include "emel/model/data.hpp" +#include "emel/text/encoders/any.hpp" +#include "emel/text/tokenizer/preprocessor/any.hpp" + +struct llama_vocab; + +namespace emel::paritychecker { + +int run_tokenizer_variant_parity( + const parity_options & opts, + const llama_vocab & llama_vocab_ref, + const emel::model::data::vocab & emel_vocab, + emel::text::tokenizer::preprocessor::preprocessor_kind preprocessor_variant, + emel::text::encoders::encoder_kind encoder_variant, + const char * variant_name); + +int run_tokenizer_spm_parity(const parity_options & opts, + const llama_vocab & llama_vocab_ref, + const emel::model::data::vocab & emel_vocab); + +int run_tokenizer_bpe_parity(const parity_options & opts, + const llama_vocab & llama_vocab_ref, + const emel::model::data::vocab & emel_vocab); + +int run_tokenizer_wpm_parity(const parity_options & opts, + const llama_vocab & llama_vocab_ref, + const emel::model::data::vocab & emel_vocab); + +int run_tokenizer_ugm_parity(const parity_options & opts, + const llama_vocab & llama_vocab_ref, + const emel::model::data::vocab & emel_vocab); + +int run_tokenizer_rwkv_parity(const parity_options & opts, + const llama_vocab & llama_vocab_ref, + const emel::model::data::vocab & emel_vocab); + +int run_tokenizer_plamo2_parity(const parity_options & opts, + const llama_vocab & llama_vocab_ref, + const emel::model::data::vocab & emel_vocab); + +int run_tokenizer_fallback_parity(const parity_options & opts, + const llama_vocab & llama_vocab_ref, + const emel::model::data::vocab & emel_vocab); + +} // namespace emel::paritychecker diff --git a/tools/paritychecker/tokenizer_parity_common.cpp b/tools/paritychecker/tokenizer_parity_common.cpp new file mode 100644 index 00000000..697dc6bd --- /dev/null +++ b/tools/paritychecker/tokenizer_parity_common.cpp @@ -0,0 +1,188 @@ +#include "tokenizer_parity.hpp" + +#include +#include +#include +#include +#include + +#include "emel/emel.h" +#include "emel/text/tokenizer/sm.hpp" + +#include "llama-vocab.h" + +namespace { + +bool run_emel_tokenizer( + const emel::model::data::vocab & vocab, + const std::string_view text, + const bool add_special, + const bool parse_special, + const size_t expected_count, + const emel::text::tokenizer::preprocessor::preprocessor_kind preprocessor_variant, + const emel::text::encoders::encoder_kind encoder_variant, + std::vector & tokens_out, + int32_t & err_out) { + emel::text::tokenizer::sm machine{}; + + int32_t bind_err = EMEL_OK; + emel::text::tokenizer::event::bind bind_ev = {}; + bind_ev.vocab = &vocab; + bind_ev.preprocessor_variant = preprocessor_variant; + bind_ev.encoder_variant = encoder_variant; + bind_ev.error_out = &bind_err; + + const bool bind_ok = machine.process_event(bind_ev); + if (!bind_ok || bind_err != EMEL_OK) { + std::fprintf(stderr, + "emel tokenizer bind failed: accepted=%s err=%d\n", + bind_ok ? "true" : "false", + bind_err); + err_out = bind_err; + return false; + } + + const size_t capacity = std::max(4096u, expected_count + 32u); + std::vector token_buffer(capacity, 0); + + int32_t token_count = 0; + int32_t tokenize_err = EMEL_OK; + emel::text::tokenizer::event::tokenize tok_ev = {}; + tok_ev.vocab = &vocab; + tok_ev.text = text; + tok_ev.add_special = add_special; + tok_ev.parse_special = parse_special; + tok_ev.token_ids_out = token_buffer.data(); + tok_ev.token_capacity = static_cast(token_buffer.size()); + tok_ev.token_count_out = &token_count; + tok_ev.error_out = &tokenize_err; + + const bool tokenize_ok = machine.process_event(tok_ev); + if (!tokenize_ok || tokenize_err != EMEL_OK) { + std::fprintf(stderr, + "emel tokenizer tokenize failed: accepted=%s err=%d\n", + tokenize_ok ? "true" : "false", + tokenize_err); + err_out = tokenize_err; + return false; + } + + if (token_count < 0 || static_cast(token_count) > token_buffer.size()) { + std::fprintf(stderr, + "emel tokenizer returned invalid token count: %d (capacity=%zu)\n", + token_count, + token_buffer.size()); + err_out = EMEL_ERR_INTERNAL; + return false; + } + + tokens_out.assign(token_buffer.begin(), token_buffer.begin() + token_count); + err_out = EMEL_OK; + return true; +} + +void dump_tokens(const char * label, const std::vector & tokens) { + std::fprintf(stdout, "%s[%zu]:", label, tokens.size()); + for (const int32_t token : tokens) { + std::fprintf(stdout, " %d", token); + } + std::fprintf(stdout, "\n"); +} + +void dump_llama_tokens(const char * label, const std::vector & tokens) { + std::fprintf(stdout, "%s[%zu]:", label, tokens.size()); + for (const llama_token token : tokens) { + std::fprintf(stdout, " %d", static_cast(token)); + } + std::fprintf(stdout, "\n"); +} + +bool compare_token_streams(const std::vector & emel_tokens, + const std::vector & llama_tokens, + const bool dump) { + const size_t shared = std::min(emel_tokens.size(), llama_tokens.size()); + for (size_t i = 0; i < shared; ++i) { + if (emel_tokens[i] == llama_tokens[i]) { + continue; + } + std::fprintf(stderr, + "token mismatch at index %zu: emel=%d llama=%d\n", + i, + emel_tokens[i], + static_cast(llama_tokens[i])); + if (dump) { + dump_tokens("emel", emel_tokens); + dump_llama_tokens("llama", llama_tokens); + } + return false; + } + + if (emel_tokens.size() != llama_tokens.size()) { + std::fprintf(stderr, + "token count mismatch: emel=%zu llama=%zu\n", + emel_tokens.size(), + llama_tokens.size()); + if (dump) { + dump_tokens("emel", emel_tokens); + dump_llama_tokens("llama", llama_tokens); + } + return false; + } + + return true; +} + +} // namespace + +namespace emel::paritychecker { + +int run_tokenizer_variant_parity( + const parity_options & opts, + const llama_vocab & llama_vocab_ref, + const emel::model::data::vocab & emel_vocab, + const emel::text::tokenizer::preprocessor::preprocessor_kind preprocessor_variant, + const emel::text::encoders::encoder_kind encoder_variant, + const char * variant_name) { + std::vector llama_tokens; + try { + llama_tokens = llama_vocab_ref.tokenize(opts.text, opts.add_special, opts.parse_special); + } catch (const std::exception & ex) { + std::fprintf(stderr, "llama tokenize threw exception: %s\n", ex.what()); + return 1; + } + + std::vector emel_tokens; + int32_t emel_err = EMEL_OK; + if (!run_emel_tokenizer(emel_vocab, + opts.text, + opts.add_special, + opts.parse_special, + llama_tokens.size(), + preprocessor_variant, + encoder_variant, + emel_tokens, + emel_err)) { + std::fprintf(stderr, "emel tokenize failed with err=%d\n", emel_err); + return 1; + } + + if (!compare_token_streams(emel_tokens, llama_tokens, opts.dump)) { + std::fprintf(stderr, + "%s tokenizer parity mismatch: model=%s tokenizer=%s pre=%s\n", + variant_name, + opts.model_path.c_str(), + emel_vocab.tokenizer_model_name.data(), + emel_vocab.tokenizer_pre_name.data()); + return 1; + } + + std::fprintf(stdout, + "%s tokenizer parity ok (%zu tokens, tokenizer=%s pre=%s)\n", + variant_name, + llama_tokens.size(), + emel_vocab.tokenizer_model_name.data(), + emel_vocab.tokenizer_pre_name.data()); + return 0; +} + +} // namespace emel::paritychecker diff --git a/tools/paritychecker/tokenizer_plamo2_parity.cpp b/tools/paritychecker/tokenizer_plamo2_parity.cpp new file mode 100644 index 00000000..ef4a1f3a --- /dev/null +++ b/tools/paritychecker/tokenizer_plamo2_parity.cpp @@ -0,0 +1,17 @@ +#include "tokenizer_parity.hpp" + +namespace emel::paritychecker { + +int run_tokenizer_plamo2_parity(const parity_options & opts, + const llama_vocab & llama_vocab_ref, + const emel::model::data::vocab & emel_vocab) { + return run_tokenizer_variant_parity( + opts, + llama_vocab_ref, + emel_vocab, + emel::text::tokenizer::preprocessor::preprocessor_kind::plamo2, + emel::text::encoders::encoder_kind::plamo2, + "plamo2"); +} + +} // namespace emel::paritychecker diff --git a/tools/paritychecker/tokenizer_rwkv_parity.cpp b/tools/paritychecker/tokenizer_rwkv_parity.cpp new file mode 100644 index 00000000..53ee8b65 --- /dev/null +++ b/tools/paritychecker/tokenizer_rwkv_parity.cpp @@ -0,0 +1,17 @@ +#include "tokenizer_parity.hpp" + +namespace emel::paritychecker { + +int run_tokenizer_rwkv_parity(const parity_options & opts, + const llama_vocab & llama_vocab_ref, + const emel::model::data::vocab & emel_vocab) { + return run_tokenizer_variant_parity( + opts, + llama_vocab_ref, + emel_vocab, + emel::text::tokenizer::preprocessor::preprocessor_kind::rwkv, + emel::text::encoders::encoder_kind::rwkv, + "rwkv"); +} + +} // namespace emel::paritychecker diff --git a/tools/paritychecker/tokenizer_spm_parity.cpp b/tools/paritychecker/tokenizer_spm_parity.cpp new file mode 100644 index 00000000..a90c6771 --- /dev/null +++ b/tools/paritychecker/tokenizer_spm_parity.cpp @@ -0,0 +1,17 @@ +#include "tokenizer_parity.hpp" + +namespace emel::paritychecker { + +int run_tokenizer_spm_parity(const parity_options & opts, + const llama_vocab & llama_vocab_ref, + const emel::model::data::vocab & emel_vocab) { + return run_tokenizer_variant_parity( + opts, + llama_vocab_ref, + emel_vocab, + emel::text::tokenizer::preprocessor::preprocessor_kind::spm, + emel::text::encoders::encoder_kind::spm, + "spm"); +} + +} // namespace emel::paritychecker diff --git a/tools/paritychecker/tokenizer_ugm_parity.cpp b/tools/paritychecker/tokenizer_ugm_parity.cpp new file mode 100644 index 00000000..f44f0ae5 --- /dev/null +++ b/tools/paritychecker/tokenizer_ugm_parity.cpp @@ -0,0 +1,17 @@ +#include "tokenizer_parity.hpp" + +namespace emel::paritychecker { + +int run_tokenizer_ugm_parity(const parity_options & opts, + const llama_vocab & llama_vocab_ref, + const emel::model::data::vocab & emel_vocab) { + return run_tokenizer_variant_parity( + opts, + llama_vocab_ref, + emel_vocab, + emel::text::tokenizer::preprocessor::preprocessor_kind::ugm, + emel::text::encoders::encoder_kind::ugm, + "ugm"); +} + +} // namespace emel::paritychecker diff --git a/tools/paritychecker/tokenizer_wpm_parity.cpp b/tools/paritychecker/tokenizer_wpm_parity.cpp new file mode 100644 index 00000000..bafc03f7 --- /dev/null +++ b/tools/paritychecker/tokenizer_wpm_parity.cpp @@ -0,0 +1,17 @@ +#include "tokenizer_parity.hpp" + +namespace emel::paritychecker { + +int run_tokenizer_wpm_parity(const parity_options & opts, + const llama_vocab & llama_vocab_ref, + const emel::model::data::vocab & emel_vocab) { + return run_tokenizer_variant_parity( + opts, + llama_vocab_ref, + emel_vocab, + emel::text::tokenizer::preprocessor::preprocessor_kind::wpm, + emel::text::encoders::encoder_kind::wpm, + "wpm"); +} + +} // namespace emel::paritychecker From cc0d300815b044c094d9e5a3d37f822168c83b82 Mon Sep 17 00:00:00 2001 From: gabewillen Date: Sun, 1 Mar 2026 23:12:57 -0600 Subject: [PATCH 2/3] bench: split machine benchmarks by domain and refresh generated outputs --- docs/benchmarks.md | 186 +++++++++--------- docs/templates/benchmarks.md.j2 | 2 - snapshots/bench/benchmarks.txt | 184 ++++++++--------- snapshots/bench/benchmarks_compare.txt | 184 ++++++++--------- snapshots/quality_gates/timing.txt | 14 +- tools/bench/CMakeLists.txt | 12 +- tools/bench/batch/planner_bench.cpp | 12 +- tools/bench/bench_cases.hpp | 34 +++- tools/bench/bench_main.cpp | 166 +++++++++------- tools/bench/gbnf/bench_main.cpp | 4 +- ...parser_bench.cpp => rule_parser_bench.cpp} | 22 +-- tools/bench/kernel/aarch64_bench.cpp | 22 +++ .../bench_common.hpp} | 24 +-- tools/bench/kernel/x86_64_bench.cpp | 22 +++ .../bench_common.hpp} | 81 +------- tools/bench/logits/sampler_bench.cpp | 52 +++++ tools/bench/logits/validator_bench.cpp | 46 +++++ tools/bench/text/jinja/formatter_bench.cpp | 11 +- tools/bench/text/jinja/parser_bench.cpp | 8 +- 19 files changed, 586 insertions(+), 500 deletions(-) rename tools/bench/gbnf/{parser_bench.cpp => rule_parser_bench.cpp} (88%) create mode 100644 tools/bench/kernel/aarch64_bench.cpp rename tools/bench/{kernel_bench.cpp => kernel/bench_common.hpp} (96%) create mode 100644 tools/bench/kernel/x86_64_bench.cpp rename tools/bench/{logits_bench.cpp => logits/bench_common.hpp} (63%) create mode 100644 tools/bench/logits/sampler_bench.cpp create mode 100644 tools/bench/logits/validator_bench.cpp diff --git a/docs/benchmarks.md b/docs/benchmarks.md index 99eb4249..5557da10 100644 --- a/docs/benchmarks.md +++ b/docs/benchmarks.md @@ -5,100 +5,98 @@ Source: `snapshots/bench/benchmarks_compare.txt` Note: While EMEL is modular and easy to bench in isolation, llama.cpp code is very entangled. These microbenches aim for apples-to-apples comparisons but likely are not. True benchmarks will be end-to-end once the system is complete. -Legacy benchmark IDs like `batch/splitter_*` and `jinja/renderer_*` are intentionally -retained for snapshot/report continuity and should be renamed after consumers migrate. | Benchmark | emel.cpp ns/op | llama.cpp ns/op | ratio | | --- | ---: | ---: | ---: | -| `batch/splitter_equal` | 1836.312 | 8593.229 | 0.214x | -| `batch/splitter_seq` | 1698.263 | 4051.104 | 0.419x | -| `batch/splitter_simple` | 1139.383 | 3584.637 | 0.318x | -| `gbnf/parser_basic` | 264.846 | 463.637 | 0.571x | -| `gbnf/parser_complex` | 1857.504 | 2470.021 | 0.752x | -| `jinja/parser_long` | 33815.062 | 55666.438 | 0.607x | -| `jinja/parser_short` | 531.558 | 662.467 | 0.802x | -| `jinja/renderer_long` | 84833.121 | 406507.271 | 0.209x | -| `jinja/renderer_short` | 1113.967 | 6485.746 | 0.172x | -| `kernel/aarch64/op_add` | 92.546 | 5279.417 | 0.018x | -| `kernel/aarch64/op_cos` | 1631.362 | 5731.046 | 0.285x | -| `kernel/aarch64/op_div` | 93.892 | 4394.467 | 0.021x | -| `kernel/aarch64/op_dup` | 86.471 | 4282.050 | 0.020x | -| `kernel/aarch64/op_log` | 1819.667 | 6011.442 | 0.303x | -| `kernel/aarch64/op_mul` | 89.467 | 5507.025 | 0.016x | -| `kernel/aarch64/op_mul_mat` | 4517.254 | 10219.783 | 0.442x | -| `kernel/aarch64/op_sin` | 1289.033 | 5404.462 | 0.239x | -| `kernel/aarch64/op_soft_max` | 2065.446 | 4890.883 | 0.422x | -| `kernel/aarch64/op_sqr` | 86.829 | 4336.387 | 0.020x | -| `kernel/aarch64/op_sqrt` | 138.008 | 4288.304 | 0.032x | -| `kernel/aarch64/op_sub` | 88.904 | 5325.046 | 0.017x | -| `kernel/aarch64/op_unary_exp` | 1277.404 | 5371.312 | 0.238x | -| `kernel/aarch64/op_unary_neg` | 86.029 | 4175.996 | 0.021x | -| `kernel/aarch64/op_unary_relu` | 90.608 | 4124.083 | 0.022x | -| `kernel/x86_64/op_add` | 64.504 | 5233.129 | 0.012x | -| `kernel/x86_64/op_cos` | 1628.146 | 6016.683 | 0.271x | -| `kernel/x86_64/op_div` | 73.971 | 5013.746 | 0.015x | -| `kernel/x86_64/op_dup` | 47.921 | 4274.621 | 0.011x | -| `kernel/x86_64/op_log` | 1852.987 | 6434.496 | 0.288x | -| `kernel/x86_64/op_mul` | 60.212 | 5865.367 | 0.010x | -| `kernel/x86_64/op_mul_mat` | 43938.567 | 11147.154 | 3.942x | -| `kernel/x86_64/op_sin` | 1262.237 | 5676.933 | 0.222x | -| `kernel/x86_64/op_soft_max` | 2059.963 | 4999.904 | 0.412x | -| `kernel/x86_64/op_sqr` | 50.700 | 4964.100 | 0.010x | -| `kernel/x86_64/op_sqrt` | 140.496 | 4741.517 | 0.030x | -| `kernel/x86_64/op_sub` | 60.233 | 5408.542 | 0.011x | -| `kernel/x86_64/op_unary_exp` | 1268.250 | 5503.779 | 0.230x | -| `kernel/x86_64/op_unary_neg` | 47.487 | 4549.292 | 0.010x | -| `kernel/x86_64/op_unary_relu` | 47.254 | 4375.171 | 0.011x | -| `logits/sampler_raw/vocab_128000` | 18746.150 | 19140.217 | 0.979x | -| `logits/sampler_raw/vocab_256000` | 37709.246 | 37594.458 | 1.003x | -| `logits/sampler_raw/vocab_32000` | 4739.504 | 4991.942 | 0.949x | -| `logits/sampler_sml/vocab_128000` | 16979.446 | 16718.892 | 1.016x | -| `logits/sampler_sml/vocab_256000` | 36024.967 | 29679.767 | 1.214x | -| `logits/sampler_sml/vocab_32000` | 3928.754 | 3549.517 | 1.107x | -| `logits/validator_raw/vocab_128000` | 88332.717 | 90152.250 | 0.980x | -| `logits/validator_raw/vocab_256000` | 182805.817 | 182508.413 | 1.002x | -| `logits/validator_raw/vocab_32000` | 23365.571 | 23878.521 | 0.979x | -| `logits/validator_sml/vocab_128000` | 96768.158 | 98797.996 | 0.979x | -| `logits/validator_sml/vocab_256000` | 193641.642 | 196589.429 | 0.985x | -| `logits/validator_sml/vocab_32000` | 23869.067 | 24220.071 | 0.986x | -| `memory/hybrid_full` | 387.054 | 37587.438 | 0.010x | -| `memory/kv_full` | 100.883 | 36279.867 | 0.003x | -| `memory/recurrent_full` | 114.583 | 5563.017 | 0.021x | -| `text/encoders/bpe_long` | 10232.100 | 10270.446 | 0.996x | -| `text/encoders/bpe_short` | 164.613 | 160.850 | 1.023x | -| `text/encoders/fallback_long` | 2522.454 | 2465.408 | 1.023x | -| `text/encoders/fallback_short` | 45.263 | 47.033 | 0.962x | -| `text/encoders/plamo2_long` | 4983.292 | 4977.471 | 1.001x | -| `text/encoders/plamo2_short` | 108.175 | 106.071 | 1.020x | -| `text/encoders/rwkv_long` | 4530.604 | 4569.600 | 0.991x | -| `text/encoders/rwkv_short` | 2613.637 | 2628.946 | 0.994x | -| `text/encoders/spm_long` | 12319.425 | 12292.258 | 1.002x | -| `text/encoders/spm_short` | 202.892 | 208.137 | 0.975x | -| `text/encoders/ugm_long` | 8120.746 | 8109.150 | 1.001x | -| `text/encoders/ugm_short` | 131.733 | 139.221 | 0.946x | -| `text/encoders/wpm_long` | 26693.121 | 26402.671 | 1.011x | -| `text/encoders/wpm_short` | 529.188 | 536.987 | 0.985x | -| `tokenizer/full_bpe_long` | 9626.758 | 9619.733 | 1.001x | -| `tokenizer/full_bpe_short` | 219.575 | 211.517 | 1.038x | -| `tokenizer/full_plamo2_long` | 10053.233 | 9994.929 | 1.006x | -| `tokenizer/full_plamo2_short` | 1918.483 | 1897.900 | 1.011x | -| `tokenizer/full_rwkv_long` | 3675.642 | 3665.338 | 1.003x | -| `tokenizer/full_rwkv_short` | 2230.875 | 2521.367 | 0.885x | -| `tokenizer/full_spm_long` | 13644.233 | 13779.175 | 0.990x | -| `tokenizer/full_spm_short` | 295.458 | 281.479 | 1.050x | -| `tokenizer/full_ugm_long` | 10078.542 | 10030.425 | 1.005x | -| `tokenizer/full_ugm_short` | 2138.625 | 2206.517 | 0.969x | -| `tokenizer/full_wpm_long` | 28529.271 | 28240.213 | 1.010x | -| `tokenizer/full_wpm_short` | 2266.062 | 2320.533 | 0.977x | -| `tokenizer/preprocessor_bpe_long` | 2753.250 | 5209.350 | 0.529x | -| `tokenizer/preprocessor_bpe_short` | 86.571 | 1702.050 | 0.051x | -| `tokenizer/preprocessor_plamo2_long` | 3144.229 | 4588.988 | 0.685x | -| `tokenizer/preprocessor_plamo2_short` | 2467.929 | 3609.229 | 0.684x | -| `tokenizer/preprocessor_rwkv_long` | 3121.367 | 4583.267 | 0.681x | -| `tokenizer/preprocessor_rwkv_short` | 2477.188 | 3683.521 | 0.673x | -| `tokenizer/preprocessor_spm_long` | 3127.146 | 4508.325 | 0.694x | -| `tokenizer/preprocessor_spm_short` | 2448.333 | 3641.775 | 0.672x | -| `tokenizer/preprocessor_ugm_long` | 3190.696 | 4554.075 | 0.701x | -| `tokenizer/preprocessor_ugm_short` | 2460.821 | 3632.079 | 0.678x | -| `tokenizer/preprocessor_wpm_long` | 3154.375 | 4505.400 | 0.700x | -| `tokenizer/preprocessor_wpm_short` | 2466.742 | 3636.067 | 0.678x | +| `batch/planner_equal` | 1846.750 | 8689.946 | 0.213x | +| `batch/planner_seq` | 1781.388 | 3996.500 | 0.446x | +| `batch/planner_simple` | 1348.817 | 3498.363 | 0.386x | +| `gbnf/rule_parser_basic` | 247.521 | 471.233 | 0.525x | +| `gbnf/rule_parser_complex` | 1933.033 | 2515.221 | 0.769x | +| `kernel/aarch64/op_add` | 88.783 | 5061.321 | 0.018x | +| `kernel/aarch64/op_cos` | 1668.921 | 6025.850 | 0.277x | +| `kernel/aarch64/op_div` | 88.600 | 4142.504 | 0.021x | +| `kernel/aarch64/op_dup` | 85.975 | 4095.954 | 0.021x | +| `kernel/aarch64/op_log` | 1843.883 | 6106.117 | 0.302x | +| `kernel/aarch64/op_mul` | 91.025 | 5091.896 | 0.018x | +| `kernel/aarch64/op_mul_mat` | 4540.008 | 10639.004 | 0.427x | +| `kernel/aarch64/op_sin` | 1447.079 | 5599.971 | 0.258x | +| `kernel/aarch64/op_soft_max` | 2066.808 | 4972.771 | 0.416x | +| `kernel/aarch64/op_sqr` | 86.779 | 4090.646 | 0.021x | +| `kernel/aarch64/op_sqrt` | 137.033 | 4436.392 | 0.031x | +| `kernel/aarch64/op_sub` | 91.279 | 5088.383 | 0.018x | +| `kernel/aarch64/op_unary_exp` | 1297.300 | 5642.096 | 0.230x | +| `kernel/aarch64/op_unary_neg` | 89.208 | 4536.625 | 0.020x | +| `kernel/aarch64/op_unary_relu` | 85.879 | 4413.375 | 0.019x | +| `kernel/x86_64/op_add` | 60.092 | 5068.100 | 0.012x | +| `kernel/x86_64/op_cos` | 1969.629 | 5873.692 | 0.335x | +| `kernel/x86_64/op_div` | 74.679 | 4153.717 | 0.018x | +| `kernel/x86_64/op_dup` | 47.033 | 4013.613 | 0.012x | +| `kernel/x86_64/op_log` | 1820.858 | 6532.413 | 0.279x | +| `kernel/x86_64/op_mul` | 60.196 | 5235.196 | 0.011x | +| `kernel/x86_64/op_mul_mat` | 44244.079 | 10511.242 | 4.209x | +| `kernel/x86_64/op_sin` | 1296.000 | 5583.742 | 0.232x | +| `kernel/x86_64/op_soft_max` | 2062.137 | 5244.917 | 0.393x | +| `kernel/x86_64/op_sqr` | 49.138 | 4063.596 | 0.012x | +| `kernel/x86_64/op_sqrt` | 143.012 | 4265.863 | 0.034x | +| `kernel/x86_64/op_sub` | 60.096 | 5310.508 | 0.011x | +| `kernel/x86_64/op_unary_exp` | 1284.658 | 5399.771 | 0.238x | +| `kernel/x86_64/op_unary_neg` | 51.946 | 4309.450 | 0.012x | +| `kernel/x86_64/op_unary_relu` | 52.304 | 4238.471 | 0.012x | +| `logits/sampler_raw/vocab_128000` | 19259.958 | 18468.492 | 1.043x | +| `logits/sampler_raw/vocab_256000` | 38539.842 | 36725.137 | 1.049x | +| `logits/sampler_raw/vocab_32000` | 5214.146 | 4826.229 | 1.080x | +| `logits/sampler_sml/vocab_128000` | 15429.442 | 14757.788 | 1.046x | +| `logits/sampler_sml/vocab_256000` | 34200.133 | 30380.342 | 1.126x | +| `logits/sampler_sml/vocab_32000` | 4436.292 | 4330.962 | 1.024x | +| `logits/validator_raw/vocab_128000` | 90205.633 | 90458.808 | 0.997x | +| `logits/validator_raw/vocab_256000` | 181372.546 | 179498.462 | 1.010x | +| `logits/validator_raw/vocab_32000` | 23735.550 | 23904.125 | 0.993x | +| `logits/validator_sml/vocab_128000` | 99648.387 | 99266.212 | 1.004x | +| `logits/validator_sml/vocab_256000` | 197266.092 | 199430.296 | 0.989x | +| `logits/validator_sml/vocab_32000` | 24528.092 | 24126.225 | 1.017x | +| `memory/hybrid_full` | 408.700 | 36677.713 | 0.011x | +| `memory/kv_full` | 103.067 | 36946.496 | 0.003x | +| `memory/recurrent_full` | 113.079 | 5595.042 | 0.020x | +| `text/encoders/bpe_long` | 10221.996 | 10221.204 | 1.000x | +| `text/encoders/bpe_short` | 159.125 | 153.158 | 1.039x | +| `text/encoders/fallback_long` | 2470.238 | 2485.546 | 0.994x | +| `text/encoders/fallback_short` | 50.267 | 47.825 | 1.051x | +| `text/encoders/plamo2_long` | 4848.942 | 4878.158 | 0.994x | +| `text/encoders/plamo2_short` | 107.117 | 104.096 | 1.029x | +| `text/encoders/rwkv_long` | 4557.729 | 4543.887 | 1.003x | +| `text/encoders/rwkv_short` | 2697.533 | 2658.883 | 1.015x | +| `text/encoders/spm_long` | 12589.987 | 12349.475 | 1.019x | +| `text/encoders/spm_short` | 213.188 | 205.325 | 1.038x | +| `text/encoders/ugm_long` | 8308.617 | 8295.337 | 1.002x | +| `text/encoders/ugm_short` | 137.250 | 137.008 | 1.002x | +| `text/encoders/wpm_long` | 26858.621 | 26355.825 | 1.019x | +| `text/encoders/wpm_short` | 531.438 | 540.237 | 0.984x | +| `text/jinja/formatter_long` | 87073.829 | 400326.883 | 0.218x | +| `text/jinja/formatter_short` | 1144.017 | 6368.133 | 0.180x | +| `text/jinja/parser_long` | 35030.512 | 52803.367 | 0.663x | +| `text/jinja/parser_short` | 547.888 | 632.633 | 0.866x | +| `tokenizer/full_bpe_long` | 9967.413 | 9607.096 | 1.038x | +| `tokenizer/full_bpe_short` | 220.113 | 218.846 | 1.006x | +| `tokenizer/full_plamo2_long` | 9890.796 | 9985.525 | 0.991x | +| `tokenizer/full_plamo2_short` | 1799.446 | 1769.058 | 1.017x | +| `tokenizer/full_rwkv_long` | 3566.475 | 3551.117 | 1.004x | +| `tokenizer/full_rwkv_short` | 2373.500 | 2159.892 | 1.099x | +| `tokenizer/full_spm_long` | 13766.279 | 13689.263 | 1.006x | +| `tokenizer/full_spm_short` | 296.825 | 285.354 | 1.040x | +| `tokenizer/full_ugm_long` | 10042.667 | 9989.429 | 1.005x | +| `tokenizer/full_ugm_short` | 1817.804 | 1818.546 | 1.000x | +| `tokenizer/full_wpm_long` | 28866.112 | 34007.938 | 0.849x | +| `tokenizer/full_wpm_short` | 2204.133 | 2210.221 | 0.997x | +| `tokenizer/preprocessor_bpe_long` | 2775.246 | 5265.688 | 0.527x | +| `tokenizer/preprocessor_bpe_short` | 82.854 | 1747.217 | 0.047x | +| `tokenizer/preprocessor_plamo2_long` | 3052.371 | 4619.908 | 0.661x | +| `tokenizer/preprocessor_plamo2_short` | 2367.925 | 3575.713 | 0.662x | +| `tokenizer/preprocessor_rwkv_long` | 3077.379 | 4554.646 | 0.676x | +| `tokenizer/preprocessor_rwkv_short` | 2356.238 | 3536.963 | 0.666x | +| `tokenizer/preprocessor_spm_long` | 3092.796 | 4569.296 | 0.677x | +| `tokenizer/preprocessor_spm_short` | 2361.154 | 3586.446 | 0.658x | +| `tokenizer/preprocessor_ugm_long` | 3139.088 | 4625.679 | 0.679x | +| `tokenizer/preprocessor_ugm_short` | 2375.508 | 3560.692 | 0.667x | +| `tokenizer/preprocessor_wpm_long` | 3043.238 | 4503.621 | 0.676x | +| `tokenizer/preprocessor_wpm_short` | 2599.613 | 3530.233 | 0.736x | diff --git a/docs/templates/benchmarks.md.j2 b/docs/templates/benchmarks.md.j2 index 40795141..0f3c749c 100644 --- a/docs/templates/benchmarks.md.j2 +++ b/docs/templates/benchmarks.md.j2 @@ -5,7 +5,5 @@ Source: `snapshots/bench/benchmarks_compare.txt` Note: While EMEL is modular and easy to bench in isolation, llama.cpp code is very entangled. These microbenches aim for apples-to-apples comparisons but likely are not. True benchmarks will be end-to-end once the system is complete. -Legacy benchmark IDs like `batch/splitter_*` and `jinja/renderer_*` are intentionally -retained for snapshot/report continuity and should be renamed after consumers migrate. {{ benchmarks_table }} diff --git a/snapshots/bench/benchmarks.txt b/snapshots/bench/benchmarks.txt index 551aeff9..2c000db5 100644 --- a/snapshots/bench/benchmarks.txt +++ b/snapshots/bench/benchmarks.txt @@ -1,94 +1,94 @@ # ref=ecbcb7ea9d3303097519723b264a8b5f1e977028 # toolchain=/opt/homebrew/bin/zig -batch/splitter_equal ns_per_op=1836.312 -batch/splitter_seq ns_per_op=1698.263 -batch/splitter_simple ns_per_op=1139.383 -gbnf/parser_basic ns_per_op=264.846 -gbnf/parser_complex ns_per_op=1857.504 -jinja/parser_long ns_per_op=33815.062 -jinja/parser_short ns_per_op=531.558 -jinja/renderer_long ns_per_op=84833.121 -jinja/renderer_short ns_per_op=1113.967 -kernel/aarch64/op_add ns_per_op=92.546 -kernel/aarch64/op_cos ns_per_op=1631.362 -kernel/aarch64/op_div ns_per_op=93.892 -kernel/aarch64/op_dup ns_per_op=86.471 -kernel/aarch64/op_log ns_per_op=1819.667 -kernel/aarch64/op_mul ns_per_op=89.467 -kernel/aarch64/op_mul_mat ns_per_op=4517.254 -kernel/aarch64/op_sin ns_per_op=1289.033 -kernel/aarch64/op_soft_max ns_per_op=2065.446 -kernel/aarch64/op_sqr ns_per_op=86.829 -kernel/aarch64/op_sqrt ns_per_op=138.008 -kernel/aarch64/op_sub ns_per_op=88.904 -kernel/aarch64/op_unary_exp ns_per_op=1277.404 -kernel/aarch64/op_unary_neg ns_per_op=86.029 -kernel/aarch64/op_unary_relu ns_per_op=90.608 -kernel/x86_64/op_add ns_per_op=64.504 -kernel/x86_64/op_cos ns_per_op=1628.146 -kernel/x86_64/op_div ns_per_op=73.971 -kernel/x86_64/op_dup ns_per_op=47.921 -kernel/x86_64/op_log ns_per_op=1852.987 -kernel/x86_64/op_mul ns_per_op=60.212 -kernel/x86_64/op_mul_mat ns_per_op=43938.567 -kernel/x86_64/op_sin ns_per_op=1262.237 -kernel/x86_64/op_soft_max ns_per_op=2059.963 -kernel/x86_64/op_sqr ns_per_op=50.700 -kernel/x86_64/op_sqrt ns_per_op=140.496 -kernel/x86_64/op_sub ns_per_op=60.233 -kernel/x86_64/op_unary_exp ns_per_op=1268.250 -kernel/x86_64/op_unary_neg ns_per_op=47.487 -kernel/x86_64/op_unary_relu ns_per_op=47.254 -logits/sampler_raw/vocab_128000 ns_per_op=18746.150 -logits/sampler_raw/vocab_256000 ns_per_op=37709.246 -logits/sampler_raw/vocab_32000 ns_per_op=4739.504 -logits/sampler_sml/vocab_128000 ns_per_op=16979.446 -logits/sampler_sml/vocab_256000 ns_per_op=36024.967 -logits/sampler_sml/vocab_32000 ns_per_op=3928.754 -logits/validator_raw/vocab_128000 ns_per_op=88332.717 -logits/validator_raw/vocab_256000 ns_per_op=182805.817 -logits/validator_raw/vocab_32000 ns_per_op=23365.571 -logits/validator_sml/vocab_128000 ns_per_op=96768.158 -logits/validator_sml/vocab_256000 ns_per_op=193641.642 -logits/validator_sml/vocab_32000 ns_per_op=23869.067 -memory/hybrid_full ns_per_op=387.054 -memory/kv_full ns_per_op=100.883 -memory/recurrent_full ns_per_op=114.583 -text/encoders/bpe_long ns_per_op=10232.100 -text/encoders/bpe_short ns_per_op=164.613 -text/encoders/fallback_long ns_per_op=2522.454 -text/encoders/fallback_short ns_per_op=45.263 -text/encoders/plamo2_long ns_per_op=4983.292 -text/encoders/plamo2_short ns_per_op=108.175 -text/encoders/rwkv_long ns_per_op=4530.604 -text/encoders/rwkv_short ns_per_op=2613.637 -text/encoders/spm_long ns_per_op=12319.425 -text/encoders/spm_short ns_per_op=202.892 -text/encoders/ugm_long ns_per_op=8120.746 -text/encoders/ugm_short ns_per_op=131.733 -text/encoders/wpm_long ns_per_op=26693.121 -text/encoders/wpm_short ns_per_op=529.188 -tokenizer/full_bpe_long ns_per_op=9626.758 -tokenizer/full_bpe_short ns_per_op=219.575 -tokenizer/full_plamo2_long ns_per_op=10053.233 -tokenizer/full_plamo2_short ns_per_op=1918.483 -tokenizer/full_rwkv_long ns_per_op=3675.642 -tokenizer/full_rwkv_short ns_per_op=2230.875 -tokenizer/full_spm_long ns_per_op=13644.233 -tokenizer/full_spm_short ns_per_op=295.458 -tokenizer/full_ugm_long ns_per_op=10078.542 -tokenizer/full_ugm_short ns_per_op=2138.625 -tokenizer/full_wpm_long ns_per_op=28529.271 -tokenizer/full_wpm_short ns_per_op=2266.062 -tokenizer/preprocessor_bpe_long ns_per_op=2753.250 -tokenizer/preprocessor_bpe_short ns_per_op=86.571 -tokenizer/preprocessor_plamo2_long ns_per_op=3144.229 -tokenizer/preprocessor_plamo2_short ns_per_op=2467.929 -tokenizer/preprocessor_rwkv_long ns_per_op=3121.367 -tokenizer/preprocessor_rwkv_short ns_per_op=2477.188 -tokenizer/preprocessor_spm_long ns_per_op=3127.146 -tokenizer/preprocessor_spm_short ns_per_op=2448.333 -tokenizer/preprocessor_ugm_long ns_per_op=3190.696 -tokenizer/preprocessor_ugm_short ns_per_op=2460.821 -tokenizer/preprocessor_wpm_long ns_per_op=3154.375 -tokenizer/preprocessor_wpm_short ns_per_op=2466.742 +batch/planner_equal ns_per_op=1846.750 +batch/planner_seq ns_per_op=1781.388 +batch/planner_simple ns_per_op=1348.817 +gbnf/rule_parser_basic ns_per_op=247.521 +gbnf/rule_parser_complex ns_per_op=1933.033 +kernel/aarch64/op_add ns_per_op=88.783 +kernel/aarch64/op_cos ns_per_op=1668.921 +kernel/aarch64/op_div ns_per_op=88.600 +kernel/aarch64/op_dup ns_per_op=85.975 +kernel/aarch64/op_log ns_per_op=1843.883 +kernel/aarch64/op_mul ns_per_op=91.025 +kernel/aarch64/op_mul_mat ns_per_op=4540.008 +kernel/aarch64/op_sin ns_per_op=1447.079 +kernel/aarch64/op_soft_max ns_per_op=2066.808 +kernel/aarch64/op_sqr ns_per_op=86.779 +kernel/aarch64/op_sqrt ns_per_op=137.033 +kernel/aarch64/op_sub ns_per_op=91.279 +kernel/aarch64/op_unary_exp ns_per_op=1297.300 +kernel/aarch64/op_unary_neg ns_per_op=89.208 +kernel/aarch64/op_unary_relu ns_per_op=85.879 +kernel/x86_64/op_add ns_per_op=60.092 +kernel/x86_64/op_cos ns_per_op=1969.629 +kernel/x86_64/op_div ns_per_op=74.679 +kernel/x86_64/op_dup ns_per_op=47.033 +kernel/x86_64/op_log ns_per_op=1820.858 +kernel/x86_64/op_mul ns_per_op=60.196 +kernel/x86_64/op_mul_mat ns_per_op=44244.079 +kernel/x86_64/op_sin ns_per_op=1296.000 +kernel/x86_64/op_soft_max ns_per_op=2062.137 +kernel/x86_64/op_sqr ns_per_op=49.138 +kernel/x86_64/op_sqrt ns_per_op=143.012 +kernel/x86_64/op_sub ns_per_op=60.096 +kernel/x86_64/op_unary_exp ns_per_op=1284.658 +kernel/x86_64/op_unary_neg ns_per_op=51.946 +kernel/x86_64/op_unary_relu ns_per_op=52.304 +logits/sampler_raw/vocab_128000 ns_per_op=19259.958 +logits/sampler_raw/vocab_256000 ns_per_op=38539.842 +logits/sampler_raw/vocab_32000 ns_per_op=5214.146 +logits/sampler_sml/vocab_128000 ns_per_op=15429.442 +logits/sampler_sml/vocab_256000 ns_per_op=34200.133 +logits/sampler_sml/vocab_32000 ns_per_op=4436.292 +logits/validator_raw/vocab_128000 ns_per_op=90205.633 +logits/validator_raw/vocab_256000 ns_per_op=181372.546 +logits/validator_raw/vocab_32000 ns_per_op=23735.550 +logits/validator_sml/vocab_128000 ns_per_op=99648.387 +logits/validator_sml/vocab_256000 ns_per_op=197266.092 +logits/validator_sml/vocab_32000 ns_per_op=24528.092 +memory/hybrid_full ns_per_op=408.700 +memory/kv_full ns_per_op=103.067 +memory/recurrent_full ns_per_op=113.079 +text/encoders/bpe_long ns_per_op=10221.996 +text/encoders/bpe_short ns_per_op=159.125 +text/encoders/fallback_long ns_per_op=2470.238 +text/encoders/fallback_short ns_per_op=50.267 +text/encoders/plamo2_long ns_per_op=4848.942 +text/encoders/plamo2_short ns_per_op=107.117 +text/encoders/rwkv_long ns_per_op=4557.729 +text/encoders/rwkv_short ns_per_op=2697.533 +text/encoders/spm_long ns_per_op=12589.987 +text/encoders/spm_short ns_per_op=213.188 +text/encoders/ugm_long ns_per_op=8308.617 +text/encoders/ugm_short ns_per_op=137.250 +text/encoders/wpm_long ns_per_op=26858.621 +text/encoders/wpm_short ns_per_op=531.438 +text/jinja/formatter_long ns_per_op=87073.829 +text/jinja/formatter_short ns_per_op=1144.017 +text/jinja/parser_long ns_per_op=35030.512 +text/jinja/parser_short ns_per_op=547.888 +tokenizer/full_bpe_long ns_per_op=9967.413 +tokenizer/full_bpe_short ns_per_op=220.113 +tokenizer/full_plamo2_long ns_per_op=9890.796 +tokenizer/full_plamo2_short ns_per_op=1799.446 +tokenizer/full_rwkv_long ns_per_op=3566.475 +tokenizer/full_rwkv_short ns_per_op=2373.500 +tokenizer/full_spm_long ns_per_op=13766.279 +tokenizer/full_spm_short ns_per_op=296.825 +tokenizer/full_ugm_long ns_per_op=10042.667 +tokenizer/full_ugm_short ns_per_op=1817.804 +tokenizer/full_wpm_long ns_per_op=28866.112 +tokenizer/full_wpm_short ns_per_op=2204.133 +tokenizer/preprocessor_bpe_long ns_per_op=2775.246 +tokenizer/preprocessor_bpe_short ns_per_op=82.854 +tokenizer/preprocessor_plamo2_long ns_per_op=3052.371 +tokenizer/preprocessor_plamo2_short ns_per_op=2367.925 +tokenizer/preprocessor_rwkv_long ns_per_op=3077.379 +tokenizer/preprocessor_rwkv_short ns_per_op=2356.238 +tokenizer/preprocessor_spm_long ns_per_op=3092.796 +tokenizer/preprocessor_spm_short ns_per_op=2361.154 +tokenizer/preprocessor_ugm_long ns_per_op=3139.088 +tokenizer/preprocessor_ugm_short ns_per_op=2375.508 +tokenizer/preprocessor_wpm_long ns_per_op=3043.238 +tokenizer/preprocessor_wpm_short ns_per_op=2599.613 diff --git a/snapshots/bench/benchmarks_compare.txt b/snapshots/bench/benchmarks_compare.txt index 5287953c..70a75ce3 100644 --- a/snapshots/bench/benchmarks_compare.txt +++ b/snapshots/bench/benchmarks_compare.txt @@ -1,94 +1,94 @@ # ref=ecbcb7ea9d3303097519723b264a8b5f1e977028 # toolchain=/opt/homebrew/bin/zig -batch/splitter_equal emel.cpp 1836.312 ns/op, llama.cpp 8593.229 ns/op, ratio=0.214x -batch/splitter_seq emel.cpp 1698.263 ns/op, llama.cpp 4051.104 ns/op, ratio=0.419x -batch/splitter_simple emel.cpp 1139.383 ns/op, llama.cpp 3584.637 ns/op, ratio=0.318x -gbnf/parser_basic emel.cpp 264.846 ns/op, llama.cpp 463.637 ns/op, ratio=0.571x -gbnf/parser_complex emel.cpp 1857.504 ns/op, llama.cpp 2470.021 ns/op, ratio=0.752x -jinja/parser_long emel.cpp 33815.062 ns/op, llama.cpp 55666.438 ns/op, ratio=0.607x -jinja/parser_short emel.cpp 531.558 ns/op, llama.cpp 662.467 ns/op, ratio=0.802x -jinja/renderer_long emel.cpp 84833.121 ns/op, llama.cpp 406507.271 ns/op, ratio=0.209x -jinja/renderer_short emel.cpp 1113.967 ns/op, llama.cpp 6485.746 ns/op, ratio=0.172x -kernel/aarch64/op_add emel.cpp 92.546 ns/op, llama.cpp 5279.417 ns/op, ratio=0.018x -kernel/aarch64/op_cos emel.cpp 1631.362 ns/op, llama.cpp 5731.046 ns/op, ratio=0.285x -kernel/aarch64/op_div emel.cpp 93.892 ns/op, llama.cpp 4394.467 ns/op, ratio=0.021x -kernel/aarch64/op_dup emel.cpp 86.471 ns/op, llama.cpp 4282.050 ns/op, ratio=0.020x -kernel/aarch64/op_log emel.cpp 1819.667 ns/op, llama.cpp 6011.442 ns/op, ratio=0.303x -kernel/aarch64/op_mul emel.cpp 89.467 ns/op, llama.cpp 5507.025 ns/op, ratio=0.016x -kernel/aarch64/op_mul_mat emel.cpp 4517.254 ns/op, llama.cpp 10219.783 ns/op, ratio=0.442x -kernel/aarch64/op_sin emel.cpp 1289.033 ns/op, llama.cpp 5404.462 ns/op, ratio=0.239x -kernel/aarch64/op_soft_max emel.cpp 2065.446 ns/op, llama.cpp 4890.883 ns/op, ratio=0.422x -kernel/aarch64/op_sqr emel.cpp 86.829 ns/op, llama.cpp 4336.387 ns/op, ratio=0.020x -kernel/aarch64/op_sqrt emel.cpp 138.008 ns/op, llama.cpp 4288.304 ns/op, ratio=0.032x -kernel/aarch64/op_sub emel.cpp 88.904 ns/op, llama.cpp 5325.046 ns/op, ratio=0.017x -kernel/aarch64/op_unary_exp emel.cpp 1277.404 ns/op, llama.cpp 5371.312 ns/op, ratio=0.238x -kernel/aarch64/op_unary_neg emel.cpp 86.029 ns/op, llama.cpp 4175.996 ns/op, ratio=0.021x -kernel/aarch64/op_unary_relu emel.cpp 90.608 ns/op, llama.cpp 4124.083 ns/op, ratio=0.022x -kernel/x86_64/op_add emel.cpp 64.504 ns/op, llama.cpp 5233.129 ns/op, ratio=0.012x -kernel/x86_64/op_cos emel.cpp 1628.146 ns/op, llama.cpp 6016.683 ns/op, ratio=0.271x -kernel/x86_64/op_div emel.cpp 73.971 ns/op, llama.cpp 5013.746 ns/op, ratio=0.015x -kernel/x86_64/op_dup emel.cpp 47.921 ns/op, llama.cpp 4274.621 ns/op, ratio=0.011x -kernel/x86_64/op_log emel.cpp 1852.987 ns/op, llama.cpp 6434.496 ns/op, ratio=0.288x -kernel/x86_64/op_mul emel.cpp 60.212 ns/op, llama.cpp 5865.367 ns/op, ratio=0.010x -kernel/x86_64/op_mul_mat emel.cpp 43938.567 ns/op, llama.cpp 11147.154 ns/op, ratio=3.942x -kernel/x86_64/op_sin emel.cpp 1262.237 ns/op, llama.cpp 5676.933 ns/op, ratio=0.222x -kernel/x86_64/op_soft_max emel.cpp 2059.963 ns/op, llama.cpp 4999.904 ns/op, ratio=0.412x -kernel/x86_64/op_sqr emel.cpp 50.700 ns/op, llama.cpp 4964.100 ns/op, ratio=0.010x -kernel/x86_64/op_sqrt emel.cpp 140.496 ns/op, llama.cpp 4741.517 ns/op, ratio=0.030x -kernel/x86_64/op_sub emel.cpp 60.233 ns/op, llama.cpp 5408.542 ns/op, ratio=0.011x -kernel/x86_64/op_unary_exp emel.cpp 1268.250 ns/op, llama.cpp 5503.779 ns/op, ratio=0.230x -kernel/x86_64/op_unary_neg emel.cpp 47.487 ns/op, llama.cpp 4549.292 ns/op, ratio=0.010x -kernel/x86_64/op_unary_relu emel.cpp 47.254 ns/op, llama.cpp 4375.171 ns/op, ratio=0.011x -logits/sampler_raw/vocab_128000 emel.cpp 18746.150 ns/op, llama.cpp 19140.217 ns/op, ratio=0.979x -logits/sampler_raw/vocab_256000 emel.cpp 37709.246 ns/op, llama.cpp 37594.458 ns/op, ratio=1.003x -logits/sampler_raw/vocab_32000 emel.cpp 4739.504 ns/op, llama.cpp 4991.942 ns/op, ratio=0.949x -logits/sampler_sml/vocab_128000 emel.cpp 16979.446 ns/op, llama.cpp 16718.892 ns/op, ratio=1.016x -logits/sampler_sml/vocab_256000 emel.cpp 36024.967 ns/op, llama.cpp 29679.767 ns/op, ratio=1.214x -logits/sampler_sml/vocab_32000 emel.cpp 3928.754 ns/op, llama.cpp 3549.517 ns/op, ratio=1.107x -logits/validator_raw/vocab_128000 emel.cpp 88332.717 ns/op, llama.cpp 90152.250 ns/op, ratio=0.980x -logits/validator_raw/vocab_256000 emel.cpp 182805.817 ns/op, llama.cpp 182508.413 ns/op, ratio=1.002x -logits/validator_raw/vocab_32000 emel.cpp 23365.571 ns/op, llama.cpp 23878.521 ns/op, ratio=0.979x -logits/validator_sml/vocab_128000 emel.cpp 96768.158 ns/op, llama.cpp 98797.996 ns/op, ratio=0.979x -logits/validator_sml/vocab_256000 emel.cpp 193641.642 ns/op, llama.cpp 196589.429 ns/op, ratio=0.985x -logits/validator_sml/vocab_32000 emel.cpp 23869.067 ns/op, llama.cpp 24220.071 ns/op, ratio=0.986x -memory/hybrid_full emel.cpp 387.054 ns/op, llama.cpp 37587.438 ns/op, ratio=0.010x -memory/kv_full emel.cpp 100.883 ns/op, llama.cpp 36279.867 ns/op, ratio=0.003x -memory/recurrent_full emel.cpp 114.583 ns/op, llama.cpp 5563.017 ns/op, ratio=0.021x -text/encoders/bpe_long emel.cpp 10232.100 ns/op, llama.cpp 10270.446 ns/op, ratio=0.996x -text/encoders/bpe_short emel.cpp 164.613 ns/op, llama.cpp 160.850 ns/op, ratio=1.023x -text/encoders/fallback_long emel.cpp 2522.454 ns/op, llama.cpp 2465.408 ns/op, ratio=1.023x -text/encoders/fallback_short emel.cpp 45.263 ns/op, llama.cpp 47.033 ns/op, ratio=0.962x -text/encoders/plamo2_long emel.cpp 4983.292 ns/op, llama.cpp 4977.471 ns/op, ratio=1.001x -text/encoders/plamo2_short emel.cpp 108.175 ns/op, llama.cpp 106.071 ns/op, ratio=1.020x -text/encoders/rwkv_long emel.cpp 4530.604 ns/op, llama.cpp 4569.600 ns/op, ratio=0.991x -text/encoders/rwkv_short emel.cpp 2613.637 ns/op, llama.cpp 2628.946 ns/op, ratio=0.994x -text/encoders/spm_long emel.cpp 12319.425 ns/op, llama.cpp 12292.258 ns/op, ratio=1.002x -text/encoders/spm_short emel.cpp 202.892 ns/op, llama.cpp 208.137 ns/op, ratio=0.975x -text/encoders/ugm_long emel.cpp 8120.746 ns/op, llama.cpp 8109.150 ns/op, ratio=1.001x -text/encoders/ugm_short emel.cpp 131.733 ns/op, llama.cpp 139.221 ns/op, ratio=0.946x -text/encoders/wpm_long emel.cpp 26693.121 ns/op, llama.cpp 26402.671 ns/op, ratio=1.011x -text/encoders/wpm_short emel.cpp 529.188 ns/op, llama.cpp 536.987 ns/op, ratio=0.985x -tokenizer/full_bpe_long emel.cpp 9626.758 ns/op, llama.cpp 9619.733 ns/op, ratio=1.001x -tokenizer/full_bpe_short emel.cpp 219.575 ns/op, llama.cpp 211.517 ns/op, ratio=1.038x -tokenizer/full_plamo2_long emel.cpp 10053.233 ns/op, llama.cpp 9994.929 ns/op, ratio=1.006x -tokenizer/full_plamo2_short emel.cpp 1918.483 ns/op, llama.cpp 1897.900 ns/op, ratio=1.011x -tokenizer/full_rwkv_long emel.cpp 3675.642 ns/op, llama.cpp 3665.338 ns/op, ratio=1.003x -tokenizer/full_rwkv_short emel.cpp 2230.875 ns/op, llama.cpp 2521.367 ns/op, ratio=0.885x -tokenizer/full_spm_long emel.cpp 13644.233 ns/op, llama.cpp 13779.175 ns/op, ratio=0.990x -tokenizer/full_spm_short emel.cpp 295.458 ns/op, llama.cpp 281.479 ns/op, ratio=1.050x -tokenizer/full_ugm_long emel.cpp 10078.542 ns/op, llama.cpp 10030.425 ns/op, ratio=1.005x -tokenizer/full_ugm_short emel.cpp 2138.625 ns/op, llama.cpp 2206.517 ns/op, ratio=0.969x -tokenizer/full_wpm_long emel.cpp 28529.271 ns/op, llama.cpp 28240.213 ns/op, ratio=1.010x -tokenizer/full_wpm_short emel.cpp 2266.062 ns/op, llama.cpp 2320.533 ns/op, ratio=0.977x -tokenizer/preprocessor_bpe_long emel.cpp 2753.250 ns/op, llama.cpp 5209.350 ns/op, ratio=0.529x -tokenizer/preprocessor_bpe_short emel.cpp 86.571 ns/op, llama.cpp 1702.050 ns/op, ratio=0.051x -tokenizer/preprocessor_plamo2_long emel.cpp 3144.229 ns/op, llama.cpp 4588.988 ns/op, ratio=0.685x -tokenizer/preprocessor_plamo2_short emel.cpp 2467.929 ns/op, llama.cpp 3609.229 ns/op, ratio=0.684x -tokenizer/preprocessor_rwkv_long emel.cpp 3121.367 ns/op, llama.cpp 4583.267 ns/op, ratio=0.681x -tokenizer/preprocessor_rwkv_short emel.cpp 2477.188 ns/op, llama.cpp 3683.521 ns/op, ratio=0.673x -tokenizer/preprocessor_spm_long emel.cpp 3127.146 ns/op, llama.cpp 4508.325 ns/op, ratio=0.694x -tokenizer/preprocessor_spm_short emel.cpp 2448.333 ns/op, llama.cpp 3641.775 ns/op, ratio=0.672x -tokenizer/preprocessor_ugm_long emel.cpp 3190.696 ns/op, llama.cpp 4554.075 ns/op, ratio=0.701x -tokenizer/preprocessor_ugm_short emel.cpp 2460.821 ns/op, llama.cpp 3632.079 ns/op, ratio=0.678x -tokenizer/preprocessor_wpm_long emel.cpp 3154.375 ns/op, llama.cpp 4505.400 ns/op, ratio=0.700x -tokenizer/preprocessor_wpm_short emel.cpp 2466.742 ns/op, llama.cpp 3636.067 ns/op, ratio=0.678x +batch/planner_equal emel.cpp 1846.750 ns/op, llama.cpp 8689.946 ns/op, ratio=0.213x +batch/planner_seq emel.cpp 1781.388 ns/op, llama.cpp 3996.500 ns/op, ratio=0.446x +batch/planner_simple emel.cpp 1348.817 ns/op, llama.cpp 3498.363 ns/op, ratio=0.386x +gbnf/rule_parser_basic emel.cpp 247.521 ns/op, llama.cpp 471.233 ns/op, ratio=0.525x +gbnf/rule_parser_complex emel.cpp 1933.033 ns/op, llama.cpp 2515.221 ns/op, ratio=0.769x +kernel/aarch64/op_add emel.cpp 88.783 ns/op, llama.cpp 5061.321 ns/op, ratio=0.018x +kernel/aarch64/op_cos emel.cpp 1668.921 ns/op, llama.cpp 6025.850 ns/op, ratio=0.277x +kernel/aarch64/op_div emel.cpp 88.600 ns/op, llama.cpp 4142.504 ns/op, ratio=0.021x +kernel/aarch64/op_dup emel.cpp 85.975 ns/op, llama.cpp 4095.954 ns/op, ratio=0.021x +kernel/aarch64/op_log emel.cpp 1843.883 ns/op, llama.cpp 6106.117 ns/op, ratio=0.302x +kernel/aarch64/op_mul emel.cpp 91.025 ns/op, llama.cpp 5091.896 ns/op, ratio=0.018x +kernel/aarch64/op_mul_mat emel.cpp 4540.008 ns/op, llama.cpp 10639.004 ns/op, ratio=0.427x +kernel/aarch64/op_sin emel.cpp 1447.079 ns/op, llama.cpp 5599.971 ns/op, ratio=0.258x +kernel/aarch64/op_soft_max emel.cpp 2066.808 ns/op, llama.cpp 4972.771 ns/op, ratio=0.416x +kernel/aarch64/op_sqr emel.cpp 86.779 ns/op, llama.cpp 4090.646 ns/op, ratio=0.021x +kernel/aarch64/op_sqrt emel.cpp 137.033 ns/op, llama.cpp 4436.392 ns/op, ratio=0.031x +kernel/aarch64/op_sub emel.cpp 91.279 ns/op, llama.cpp 5088.383 ns/op, ratio=0.018x +kernel/aarch64/op_unary_exp emel.cpp 1297.300 ns/op, llama.cpp 5642.096 ns/op, ratio=0.230x +kernel/aarch64/op_unary_neg emel.cpp 89.208 ns/op, llama.cpp 4536.625 ns/op, ratio=0.020x +kernel/aarch64/op_unary_relu emel.cpp 85.879 ns/op, llama.cpp 4413.375 ns/op, ratio=0.019x +kernel/x86_64/op_add emel.cpp 60.092 ns/op, llama.cpp 5068.100 ns/op, ratio=0.012x +kernel/x86_64/op_cos emel.cpp 1969.629 ns/op, llama.cpp 5873.692 ns/op, ratio=0.335x +kernel/x86_64/op_div emel.cpp 74.679 ns/op, llama.cpp 4153.717 ns/op, ratio=0.018x +kernel/x86_64/op_dup emel.cpp 47.033 ns/op, llama.cpp 4013.613 ns/op, ratio=0.012x +kernel/x86_64/op_log emel.cpp 1820.858 ns/op, llama.cpp 6532.413 ns/op, ratio=0.279x +kernel/x86_64/op_mul emel.cpp 60.196 ns/op, llama.cpp 5235.196 ns/op, ratio=0.011x +kernel/x86_64/op_mul_mat emel.cpp 44244.079 ns/op, llama.cpp 10511.242 ns/op, ratio=4.209x +kernel/x86_64/op_sin emel.cpp 1296.000 ns/op, llama.cpp 5583.742 ns/op, ratio=0.232x +kernel/x86_64/op_soft_max emel.cpp 2062.137 ns/op, llama.cpp 5244.917 ns/op, ratio=0.393x +kernel/x86_64/op_sqr emel.cpp 49.138 ns/op, llama.cpp 4063.596 ns/op, ratio=0.012x +kernel/x86_64/op_sqrt emel.cpp 143.012 ns/op, llama.cpp 4265.863 ns/op, ratio=0.034x +kernel/x86_64/op_sub emel.cpp 60.096 ns/op, llama.cpp 5310.508 ns/op, ratio=0.011x +kernel/x86_64/op_unary_exp emel.cpp 1284.658 ns/op, llama.cpp 5399.771 ns/op, ratio=0.238x +kernel/x86_64/op_unary_neg emel.cpp 51.946 ns/op, llama.cpp 4309.450 ns/op, ratio=0.012x +kernel/x86_64/op_unary_relu emel.cpp 52.304 ns/op, llama.cpp 4238.471 ns/op, ratio=0.012x +logits/sampler_raw/vocab_128000 emel.cpp 19259.958 ns/op, llama.cpp 18468.492 ns/op, ratio=1.043x +logits/sampler_raw/vocab_256000 emel.cpp 38539.842 ns/op, llama.cpp 36725.137 ns/op, ratio=1.049x +logits/sampler_raw/vocab_32000 emel.cpp 5214.146 ns/op, llama.cpp 4826.229 ns/op, ratio=1.080x +logits/sampler_sml/vocab_128000 emel.cpp 15429.442 ns/op, llama.cpp 14757.788 ns/op, ratio=1.046x +logits/sampler_sml/vocab_256000 emel.cpp 34200.133 ns/op, llama.cpp 30380.342 ns/op, ratio=1.126x +logits/sampler_sml/vocab_32000 emel.cpp 4436.292 ns/op, llama.cpp 4330.962 ns/op, ratio=1.024x +logits/validator_raw/vocab_128000 emel.cpp 90205.633 ns/op, llama.cpp 90458.808 ns/op, ratio=0.997x +logits/validator_raw/vocab_256000 emel.cpp 181372.546 ns/op, llama.cpp 179498.462 ns/op, ratio=1.010x +logits/validator_raw/vocab_32000 emel.cpp 23735.550 ns/op, llama.cpp 23904.125 ns/op, ratio=0.993x +logits/validator_sml/vocab_128000 emel.cpp 99648.387 ns/op, llama.cpp 99266.212 ns/op, ratio=1.004x +logits/validator_sml/vocab_256000 emel.cpp 197266.092 ns/op, llama.cpp 199430.296 ns/op, ratio=0.989x +logits/validator_sml/vocab_32000 emel.cpp 24528.092 ns/op, llama.cpp 24126.225 ns/op, ratio=1.017x +memory/hybrid_full emel.cpp 408.700 ns/op, llama.cpp 36677.713 ns/op, ratio=0.011x +memory/kv_full emel.cpp 103.067 ns/op, llama.cpp 36946.496 ns/op, ratio=0.003x +memory/recurrent_full emel.cpp 113.079 ns/op, llama.cpp 5595.042 ns/op, ratio=0.020x +text/encoders/bpe_long emel.cpp 10221.996 ns/op, llama.cpp 10221.204 ns/op, ratio=1.000x +text/encoders/bpe_short emel.cpp 159.125 ns/op, llama.cpp 153.158 ns/op, ratio=1.039x +text/encoders/fallback_long emel.cpp 2470.238 ns/op, llama.cpp 2485.546 ns/op, ratio=0.994x +text/encoders/fallback_short emel.cpp 50.267 ns/op, llama.cpp 47.825 ns/op, ratio=1.051x +text/encoders/plamo2_long emel.cpp 4848.942 ns/op, llama.cpp 4878.158 ns/op, ratio=0.994x +text/encoders/plamo2_short emel.cpp 107.117 ns/op, llama.cpp 104.096 ns/op, ratio=1.029x +text/encoders/rwkv_long emel.cpp 4557.729 ns/op, llama.cpp 4543.887 ns/op, ratio=1.003x +text/encoders/rwkv_short emel.cpp 2697.533 ns/op, llama.cpp 2658.883 ns/op, ratio=1.015x +text/encoders/spm_long emel.cpp 12589.987 ns/op, llama.cpp 12349.475 ns/op, ratio=1.019x +text/encoders/spm_short emel.cpp 213.188 ns/op, llama.cpp 205.325 ns/op, ratio=1.038x +text/encoders/ugm_long emel.cpp 8308.617 ns/op, llama.cpp 8295.337 ns/op, ratio=1.002x +text/encoders/ugm_short emel.cpp 137.250 ns/op, llama.cpp 137.008 ns/op, ratio=1.002x +text/encoders/wpm_long emel.cpp 26858.621 ns/op, llama.cpp 26355.825 ns/op, ratio=1.019x +text/encoders/wpm_short emel.cpp 531.438 ns/op, llama.cpp 540.237 ns/op, ratio=0.984x +text/jinja/formatter_long emel.cpp 87073.829 ns/op, llama.cpp 400326.883 ns/op, ratio=0.218x +text/jinja/formatter_short emel.cpp 1144.017 ns/op, llama.cpp 6368.133 ns/op, ratio=0.180x +text/jinja/parser_long emel.cpp 35030.512 ns/op, llama.cpp 52803.367 ns/op, ratio=0.663x +text/jinja/parser_short emel.cpp 547.888 ns/op, llama.cpp 632.633 ns/op, ratio=0.866x +tokenizer/full_bpe_long emel.cpp 9967.413 ns/op, llama.cpp 9607.096 ns/op, ratio=1.038x +tokenizer/full_bpe_short emel.cpp 220.113 ns/op, llama.cpp 218.846 ns/op, ratio=1.006x +tokenizer/full_plamo2_long emel.cpp 9890.796 ns/op, llama.cpp 9985.525 ns/op, ratio=0.991x +tokenizer/full_plamo2_short emel.cpp 1799.446 ns/op, llama.cpp 1769.058 ns/op, ratio=1.017x +tokenizer/full_rwkv_long emel.cpp 3566.475 ns/op, llama.cpp 3551.117 ns/op, ratio=1.004x +tokenizer/full_rwkv_short emel.cpp 2373.500 ns/op, llama.cpp 2159.892 ns/op, ratio=1.099x +tokenizer/full_spm_long emel.cpp 13766.279 ns/op, llama.cpp 13689.263 ns/op, ratio=1.006x +tokenizer/full_spm_short emel.cpp 296.825 ns/op, llama.cpp 285.354 ns/op, ratio=1.040x +tokenizer/full_ugm_long emel.cpp 10042.667 ns/op, llama.cpp 9989.429 ns/op, ratio=1.005x +tokenizer/full_ugm_short emel.cpp 1817.804 ns/op, llama.cpp 1818.546 ns/op, ratio=1.000x +tokenizer/full_wpm_long emel.cpp 28866.112 ns/op, llama.cpp 34007.938 ns/op, ratio=0.849x +tokenizer/full_wpm_short emel.cpp 2204.133 ns/op, llama.cpp 2210.221 ns/op, ratio=0.997x +tokenizer/preprocessor_bpe_long emel.cpp 2775.246 ns/op, llama.cpp 5265.688 ns/op, ratio=0.527x +tokenizer/preprocessor_bpe_short emel.cpp 82.854 ns/op, llama.cpp 1747.217 ns/op, ratio=0.047x +tokenizer/preprocessor_plamo2_long emel.cpp 3052.371 ns/op, llama.cpp 4619.908 ns/op, ratio=0.661x +tokenizer/preprocessor_plamo2_short emel.cpp 2367.925 ns/op, llama.cpp 3575.713 ns/op, ratio=0.662x +tokenizer/preprocessor_rwkv_long emel.cpp 3077.379 ns/op, llama.cpp 4554.646 ns/op, ratio=0.676x +tokenizer/preprocessor_rwkv_short emel.cpp 2356.238 ns/op, llama.cpp 3536.963 ns/op, ratio=0.666x +tokenizer/preprocessor_spm_long emel.cpp 3092.796 ns/op, llama.cpp 4569.296 ns/op, ratio=0.677x +tokenizer/preprocessor_spm_short emel.cpp 2361.154 ns/op, llama.cpp 3586.446 ns/op, ratio=0.658x +tokenizer/preprocessor_ugm_long emel.cpp 3139.088 ns/op, llama.cpp 4625.679 ns/op, ratio=0.679x +tokenizer/preprocessor_ugm_short emel.cpp 2375.508 ns/op, llama.cpp 3560.692 ns/op, ratio=0.667x +tokenizer/preprocessor_wpm_long emel.cpp 3043.238 ns/op, llama.cpp 4503.621 ns/op, ratio=0.676x +tokenizer/preprocessor_wpm_short emel.cpp 2599.613 ns/op, llama.cpp 3530.233 ns/op, ratio=0.736x diff --git a/snapshots/quality_gates/timing.txt b/snapshots/quality_gates/timing.txt index 08dc627c..fd5ad1c6 100644 --- a/snapshots/quality_gates/timing.txt +++ b/snapshots/quality_gates/timing.txt @@ -1,8 +1,8 @@ # quality_gates timing (seconds) -build_with_zig 0 -test_with_coverage 71 -paritychecker 5 -fuzz_smoke 28 -bench_snapshot 87 -generate_docs 30 -total 222 +build_with_zig 1 +test_with_coverage 72 +paritychecker 6 +fuzz_smoke 29 +bench_snapshot 95 +generate_docs 33 +total 236 diff --git a/tools/bench/CMakeLists.txt b/tools/bench/CMakeLists.txt index 6f1dfadf..d1f5b91b 100644 --- a/tools/bench/CMakeLists.txt +++ b/tools/bench/CMakeLists.txt @@ -109,9 +109,11 @@ find_path(NLOHMANN_JSON_INCLUDE_DIR nlohmann/json.hpp) set(BENCH_RUNNER_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/batch/planner_bench.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/gbnf/parser_bench.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/kernel_bench.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/logits_bench.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/gbnf/rule_parser_bench.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/kernel/aarch64_bench.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/kernel/x86_64_bench.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/logits/sampler_bench.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/logits/validator_bench.cpp ${CMAKE_CURRENT_SOURCE_DIR}/memory/kv_bench.cpp ${CMAKE_CURRENT_SOURCE_DIR}/memory/recurrent_bench.cpp ${CMAKE_CURRENT_SOURCE_DIR}/memory/hybrid_bench.cpp @@ -140,6 +142,8 @@ set(BENCH_RUNNER_SOURCES ${reference_impl_SOURCE_DIR}/common/jinja/caps.cpp ${CMAKE_CURRENT_SOURCE_DIR}/bench_cases.hpp ${CMAKE_CURRENT_SOURCE_DIR}/bench_common.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/kernel/bench_common.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/logits/bench_common.hpp ${CMAKE_CURRENT_SOURCE_DIR}/bench_main.cpp ) @@ -168,7 +172,7 @@ target_include_directories(bench_runner ) add_executable(gbnf_bench_runner - ${CMAKE_CURRENT_SOURCE_DIR}/gbnf/parser_bench.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/gbnf/rule_parser_bench.cpp ${CMAKE_CURRENT_SOURCE_DIR}/gbnf/bench_main.cpp ${CMAKE_CURRENT_SOURCE_DIR}/bench_cases.hpp ${CMAKE_CURRENT_SOURCE_DIR}/bench_common.hpp diff --git a/tools/bench/batch/planner_bench.cpp b/tools/bench/batch/planner_bench.cpp index 5d5212c6..54511653 100644 --- a/tools/bench/batch/planner_bench.cpp +++ b/tools/bench/batch/planner_bench.cpp @@ -360,7 +360,7 @@ void append_emel_batch_planner_cases(std::vector & results, const config }; auto fn = [&]() { (void)machine.process_event(request); }; - results.push_back(measure_case("batch/splitter_simple", cfg, fn)); + results.push_back(measure_case("batch/planner_simple", cfg, fn)); } { @@ -392,7 +392,7 @@ void append_emel_batch_planner_cases(std::vector & results, const config }; auto fn = [&]() { (void)machine.process_event(request); }; - results.push_back(measure_case("batch/splitter_equal", cfg, fn)); + results.push_back(measure_case("batch/planner_equal", cfg, fn)); } { @@ -424,7 +424,7 @@ void append_emel_batch_planner_cases(std::vector & results, const config }; auto fn = [&]() { (void)machine.process_event(request); }; - results.push_back(measure_case("batch/splitter_seq", cfg, fn)); + results.push_back(measure_case("batch/planner_seq", cfg, fn)); } } @@ -440,7 +440,7 @@ void append_reference_batch_planner_cases(std::vector & results, const c } } }; - results.push_back(measure_case("batch/splitter_simple", cfg, fn)); + results.push_back(measure_case("batch/planner_simple", cfg, fn)); } { @@ -454,7 +454,7 @@ void append_reference_batch_planner_cases(std::vector & results, const c } } }; - results.push_back(measure_case("batch/splitter_equal", cfg, fn)); + results.push_back(measure_case("batch/planner_equal", cfg, fn)); } { @@ -468,7 +468,7 @@ void append_reference_batch_planner_cases(std::vector & results, const c } } }; - results.push_back(measure_case("batch/splitter_seq", cfg, fn)); + results.push_back(measure_case("batch/planner_seq", cfg, fn)); } } diff --git a/tools/bench/bench_cases.hpp b/tools/bench/bench_cases.hpp index 98cfc8d6..3f0beec1 100644 --- a/tools/bench/bench_cases.hpp +++ b/tools/bench/bench_cases.hpp @@ -6,6 +6,24 @@ namespace emel::bench { +using append_case_fn = void (*)(std::vector & results, const config & cfg); + +struct test_case { + append_case_fn append_emel = nullptr; + append_case_fn append_reference = nullptr; + bool tokenizer_case = false; +}; + +inline void append_test_case(std::vector & results, + const config & cfg, + const test_case & tc, + const bool reference) { + const append_case_fn fn = reference ? tc.append_reference : tc.append_emel; + if (fn != nullptr) { + fn(results, cfg); + } +} + void append_emel_batch_planner_cases(std::vector & results, const config & cfg); void append_reference_batch_planner_cases(std::vector & results, const config & cfg); void append_emel_memory_kv_cases(std::vector & results, const config & cfg); @@ -18,12 +36,16 @@ void append_emel_jinja_parser_cases(std::vector & results, const config void append_reference_jinja_parser_cases(std::vector & results, const config & cfg); void append_emel_jinja_formatter_cases(std::vector & results, const config & cfg); void append_reference_jinja_formatter_cases(std::vector & results, const config & cfg); -void append_emel_gbnf_parser_cases(std::vector & results, const config & cfg); -void append_reference_gbnf_parser_cases(std::vector & results, const config & cfg); -void append_emel_logits_cases(std::vector & results, const config & cfg); -void append_reference_logits_cases(std::vector & results, const config & cfg); -void append_emel_kernel_cases(std::vector & results, const config & cfg); -void append_reference_kernel_cases(std::vector & results, const config & cfg); +void append_emel_gbnf_rule_parser_cases(std::vector & results, const config & cfg); +void append_reference_gbnf_rule_parser_cases(std::vector & results, const config & cfg); +void append_emel_logits_validator_cases(std::vector & results, const config & cfg); +void append_reference_logits_validator_cases(std::vector & results, const config & cfg); +void append_emel_logits_sampler_cases(std::vector & results, const config & cfg); +void append_reference_logits_sampler_cases(std::vector & results, const config & cfg); +void append_emel_kernel_x86_64_cases(std::vector & results, const config & cfg); +void append_reference_kernel_x86_64_cases(std::vector & results, const config & cfg); +void append_emel_kernel_aarch64_cases(std::vector & results, const config & cfg); +void append_reference_kernel_aarch64_cases(std::vector & results, const config & cfg); void append_emel_sm_any_cases(std::vector & results, const config & cfg); void append_reference_sm_any_cases(std::vector & results, const config & cfg); void append_emel_tokenizer_preprocessor_bpe_cases(std::vector & results, diff --git a/tools/bench/bench_main.cpp b/tools/bench/bench_main.cpp index 0aefdc31..1bf0cc3e 100644 --- a/tools/bench/bench_main.cpp +++ b/tools/bench/bench_main.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -42,68 +43,97 @@ std::size_t read_env_size(const char * name, std::size_t fallback) { return static_cast(parsed); } -std::vector run_emel_benchmarks(const bench::config & cfg, - const bool include_tokenizer) { - std::vector results; - results.reserve(20); - bench::append_emel_batch_planner_cases(results, cfg); - bench::append_emel_memory_kv_cases(results, cfg); - bench::append_emel_memory_recurrent_cases(results, cfg); - bench::append_emel_memory_hybrid_cases(results, cfg); - bench::append_emel_jinja_parser_cases(results, cfg); - bench::append_emel_jinja_formatter_cases(results, cfg); - bench::append_emel_gbnf_parser_cases(results, cfg); - bench::append_emel_logits_cases(results, cfg); - bench::append_emel_kernel_cases(results, cfg); - bench::append_emel_sm_any_cases(results, cfg); - bench::append_emel_tokenizer_preprocessor_bpe_cases(results, cfg); - bench::append_emel_tokenizer_preprocessor_spm_cases(results, cfg); - bench::append_emel_tokenizer_preprocessor_ugm_cases(results, cfg); - bench::append_emel_tokenizer_preprocessor_wpm_cases(results, cfg); - bench::append_emel_tokenizer_preprocessor_rwkv_cases(results, cfg); - bench::append_emel_tokenizer_preprocessor_plamo2_cases(results, cfg); - bench::append_emel_encoder_bpe_cases(results, cfg); - bench::append_emel_encoder_spm_cases(results, cfg); - bench::append_emel_encoder_wpm_cases(results, cfg); - bench::append_emel_encoder_ugm_cases(results, cfg); - bench::append_emel_encoder_rwkv_cases(results, cfg); - bench::append_emel_encoder_plamo2_cases(results, cfg); - bench::append_emel_encoder_fallback_cases(results, cfg); - if (include_tokenizer) { - bench::append_emel_tokenizer_cases(results, cfg); - } - return results; +constexpr bench::test_case make_test_case(const bench::append_case_fn emel_fn, + const bench::append_case_fn reference_fn, + const bool tokenizer_case = false) { + return bench::test_case{ + .append_emel = emel_fn, + .append_reference = reference_fn, + .tokenizer_case = tokenizer_case, + }; +} + +const auto & default_test_cases() { + static const std::array cases = {{ + make_test_case(bench::append_emel_batch_planner_cases, + bench::append_reference_batch_planner_cases), + make_test_case(bench::append_emel_memory_kv_cases, bench::append_reference_memory_kv_cases), + make_test_case(bench::append_emel_memory_recurrent_cases, + bench::append_reference_memory_recurrent_cases), + make_test_case(bench::append_emel_memory_hybrid_cases, + bench::append_reference_memory_hybrid_cases), + make_test_case(bench::append_emel_jinja_parser_cases, + bench::append_reference_jinja_parser_cases), + make_test_case(bench::append_emel_jinja_formatter_cases, + bench::append_reference_jinja_formatter_cases), + make_test_case(bench::append_emel_gbnf_rule_parser_cases, + bench::append_reference_gbnf_rule_parser_cases), + make_test_case(bench::append_emel_logits_validator_cases, + bench::append_reference_logits_validator_cases), + make_test_case(bench::append_emel_logits_sampler_cases, + bench::append_reference_logits_sampler_cases), + make_test_case(bench::append_emel_kernel_x86_64_cases, + bench::append_reference_kernel_x86_64_cases), + make_test_case(bench::append_emel_kernel_aarch64_cases, + bench::append_reference_kernel_aarch64_cases), + make_test_case(bench::append_emel_sm_any_cases, bench::append_reference_sm_any_cases), + make_test_case(bench::append_emel_tokenizer_preprocessor_bpe_cases, + bench::append_reference_tokenizer_preprocessor_bpe_cases), + make_test_case(bench::append_emel_tokenizer_preprocessor_spm_cases, + bench::append_reference_tokenizer_preprocessor_spm_cases), + make_test_case(bench::append_emel_tokenizer_preprocessor_ugm_cases, + bench::append_reference_tokenizer_preprocessor_ugm_cases), + make_test_case(bench::append_emel_tokenizer_preprocessor_wpm_cases, + bench::append_reference_tokenizer_preprocessor_wpm_cases), + make_test_case(bench::append_emel_tokenizer_preprocessor_rwkv_cases, + bench::append_reference_tokenizer_preprocessor_rwkv_cases), + make_test_case(bench::append_emel_tokenizer_preprocessor_plamo2_cases, + bench::append_reference_tokenizer_preprocessor_plamo2_cases), + make_test_case(bench::append_emel_encoder_bpe_cases, bench::append_reference_encoder_bpe_cases), + make_test_case(bench::append_emel_encoder_spm_cases, bench::append_reference_encoder_spm_cases), + make_test_case(bench::append_emel_encoder_wpm_cases, bench::append_reference_encoder_wpm_cases), + make_test_case(bench::append_emel_encoder_ugm_cases, bench::append_reference_encoder_ugm_cases), + make_test_case(bench::append_emel_encoder_rwkv_cases, + bench::append_reference_encoder_rwkv_cases), + make_test_case(bench::append_emel_encoder_plamo2_cases, + bench::append_reference_encoder_plamo2_cases), + make_test_case(bench::append_emel_encoder_fallback_cases, + bench::append_reference_encoder_fallback_cases), + }}; + return cases; } -std::vector run_reference_benchmarks(const bench::config & cfg, - const bool include_tokenizer) { +const auto & kernel_test_cases() { + static const std::array cases = {{ + make_test_case(bench::append_emel_kernel_x86_64_cases, + bench::append_reference_kernel_x86_64_cases), + make_test_case(bench::append_emel_kernel_aarch64_cases, + bench::append_reference_kernel_aarch64_cases), + }}; + return cases; +} + +template +std::vector run_benchmarks(const bench::config & cfg, + const std::array & cases, + const bool reference, + const bool include_tokenizer) { std::vector results; - results.reserve(20); - bench::append_reference_batch_planner_cases(results, cfg); - bench::append_reference_memory_kv_cases(results, cfg); - bench::append_reference_memory_recurrent_cases(results, cfg); - bench::append_reference_memory_hybrid_cases(results, cfg); - bench::append_reference_jinja_parser_cases(results, cfg); - bench::append_reference_jinja_formatter_cases(results, cfg); - bench::append_reference_gbnf_parser_cases(results, cfg); - bench::append_reference_logits_cases(results, cfg); - bench::append_reference_kernel_cases(results, cfg); - bench::append_reference_sm_any_cases(results, cfg); - bench::append_reference_tokenizer_preprocessor_bpe_cases(results, cfg); - bench::append_reference_tokenizer_preprocessor_spm_cases(results, cfg); - bench::append_reference_tokenizer_preprocessor_ugm_cases(results, cfg); - bench::append_reference_tokenizer_preprocessor_wpm_cases(results, cfg); - bench::append_reference_tokenizer_preprocessor_rwkv_cases(results, cfg); - bench::append_reference_tokenizer_preprocessor_plamo2_cases(results, cfg); - bench::append_reference_encoder_bpe_cases(results, cfg); - bench::append_reference_encoder_spm_cases(results, cfg); - bench::append_reference_encoder_wpm_cases(results, cfg); - bench::append_reference_encoder_ugm_cases(results, cfg); - bench::append_reference_encoder_rwkv_cases(results, cfg); - bench::append_reference_encoder_plamo2_cases(results, cfg); - bench::append_reference_encoder_fallback_cases(results, cfg); + results.reserve(k_case_count + 1); + + for (const bench::test_case & tc : cases) { + if (tc.tokenizer_case && !include_tokenizer) { + continue; + } + bench::append_test_case(results, cfg, tc, reference); + } + if (include_tokenizer) { - bench::append_reference_tokenizer_cases(results, cfg); + const bench::test_case tokenizer_case = make_test_case( + bench::append_emel_tokenizer_cases, + bench::append_reference_tokenizer_cases, + true); + bench::append_test_case(results, cfg, tokenizer_case, reference); } return results; } @@ -208,42 +238,38 @@ int main(int argc, char ** argv) { const mode run_mode = parse_mode(argc, argv); if (run_mode == mode::k_kernel_emel) { - std::vector results; - bench::append_emel_kernel_cases(results, cfg); + const auto results = run_benchmarks(cfg, kernel_test_cases(), false, false); print_snapshot(results); return 0; } if (run_mode == mode::k_kernel_reference) { - std::vector results; - bench::append_reference_kernel_cases(results, cfg); + const auto results = run_benchmarks(cfg, kernel_test_cases(), true, false); print_snapshot(results); return 0; } if (run_mode == mode::k_kernel_compare) { - std::vector emel_results; - std::vector ref_results; - bench::append_emel_kernel_cases(emel_results, cfg); - bench::append_reference_kernel_cases(ref_results, cfg); + const auto emel_results = run_benchmarks(cfg, kernel_test_cases(), false, false); + const auto ref_results = run_benchmarks(cfg, kernel_test_cases(), true, false); print_compare(emel_results, ref_results); return 0; } if (run_mode == mode::k_emel) { - const auto results = run_emel_benchmarks(cfg, true); + const auto results = run_benchmarks(cfg, default_test_cases(), false, true); print_snapshot(results); return 0; } if (run_mode == mode::k_reference) { - const auto results = run_reference_benchmarks(cfg, true); + const auto results = run_benchmarks(cfg, default_test_cases(), true, true); print_snapshot(results); return 0; } - const auto emel_results = run_emel_benchmarks(cfg, true); - const auto ref_results = run_reference_benchmarks(cfg, true); + const auto emel_results = run_benchmarks(cfg, default_test_cases(), false, true); + const auto ref_results = run_benchmarks(cfg, default_test_cases(), true, true); print_compare(emel_results, ref_results); return 0; } diff --git a/tools/bench/gbnf/bench_main.cpp b/tools/bench/gbnf/bench_main.cpp index 234f7a9f..a1cee5b6 100644 --- a/tools/bench/gbnf/bench_main.cpp +++ b/tools/bench/gbnf/bench_main.cpp @@ -46,14 +46,14 @@ std::size_t read_env_size(const char * name, std::size_t fallback) { std::vector run_emel_benchmarks(const bench::config & cfg) { std::vector results; results.reserve(2); - bench::append_emel_gbnf_parser_cases(results, cfg); + bench::append_emel_gbnf_rule_parser_cases(results, cfg); return results; } std::vector run_reference_benchmarks(const bench::config & cfg) { std::vector results; results.reserve(2); - bench::append_reference_gbnf_parser_cases(results, cfg); + bench::append_reference_gbnf_rule_parser_cases(results, cfg); return results; } diff --git a/tools/bench/gbnf/parser_bench.cpp b/tools/bench/gbnf/rule_parser_bench.cpp similarity index 88% rename from tools/bench/gbnf/parser_bench.cpp rename to tools/bench/gbnf/rule_parser_bench.cpp index 42dc4e59..19c65f97 100644 --- a/tools/bench/gbnf/parser_bench.cpp +++ b/tools/bench/gbnf/rule_parser_bench.cpp @@ -152,23 +152,23 @@ void ensure_case_parity(const char * case_name, } } -void ensure_gbnf_parser_parity() { +void ensure_gbnf_rule_parser_parity() { static bool checked = false; if (checked) { return; } checked = true; - ensure_case_parity("gbnf/parser_basic", k_basic_grammar, k_basic_grammar_view); - ensure_case_parity("gbnf/parser_complex", k_complex_grammar, k_complex_grammar_view); + ensure_case_parity("gbnf/rule_parser_basic", k_basic_grammar, k_basic_grammar_view); + ensure_case_parity("gbnf/rule_parser_complex", k_complex_grammar, k_complex_grammar_view); } } // namespace namespace emel::bench { -void append_emel_gbnf_parser_cases(std::vector & results, const config & cfg) { - ensure_gbnf_parser_parity(); +void append_emel_gbnf_rule_parser_cases(std::vector & results, const config & cfg) { + ensure_gbnf_rule_parser_parity(); { emel_parse_state state{k_basic_grammar_view}; @@ -177,7 +177,7 @@ void append_emel_gbnf_parser_cases(std::vector & results, const config & std::abort(); } }; - results.push_back(measure_case("gbnf/parser_basic", cfg, basic_fn)); + results.push_back(measure_case("gbnf/rule_parser_basic", cfg, basic_fn)); } { @@ -187,12 +187,12 @@ void append_emel_gbnf_parser_cases(std::vector & results, const config & std::abort(); } }; - results.push_back(measure_case("gbnf/parser_complex", cfg, complex_fn)); + results.push_back(measure_case("gbnf/rule_parser_complex", cfg, complex_fn)); } } -void append_reference_gbnf_parser_cases(std::vector & results, const config & cfg) { - ensure_gbnf_parser_parity(); +void append_reference_gbnf_rule_parser_cases(std::vector & results, const config & cfg) { + ensure_gbnf_rule_parser_parity(); { reference_parse_state state{k_basic_grammar}; @@ -201,7 +201,7 @@ void append_reference_gbnf_parser_cases(std::vector & results, const con std::abort(); } }; - results.push_back(measure_case("gbnf/parser_basic", cfg, basic_fn)); + results.push_back(measure_case("gbnf/rule_parser_basic", cfg, basic_fn)); } { @@ -211,7 +211,7 @@ void append_reference_gbnf_parser_cases(std::vector & results, const con std::abort(); } }; - results.push_back(measure_case("gbnf/parser_complex", cfg, complex_fn)); + results.push_back(measure_case("gbnf/rule_parser_complex", cfg, complex_fn)); } } diff --git a/tools/bench/kernel/aarch64_bench.cpp b/tools/bench/kernel/aarch64_bench.cpp new file mode 100644 index 00000000..1d67d1c0 --- /dev/null +++ b/tools/bench/kernel/aarch64_bench.cpp @@ -0,0 +1,22 @@ +#include "bench_cases.hpp" + +#include "emel/kernel/aarch64/context.hpp" +#include "emel/kernel/aarch64/detail.hpp" + +#include "kernel/bench_common.hpp" + +namespace emel::bench { + +void append_emel_kernel_aarch64_cases(std::vector & results, const config & cfg) { + const emel::kernel::aarch64::action::context aarch_ctx{}; + auto exec = [&](const auto & ev) { + return emel::kernel::aarch64::detail::execute_request(ev, aarch_ctx); + }; + append_emel_backend_cases(results, cfg, "aarch64", exec); +} + +void append_reference_kernel_aarch64_cases(std::vector & results, const config & cfg) { + append_reference_backend_cases(results, cfg, "aarch64"); +} + +} // namespace emel::bench diff --git a/tools/bench/kernel_bench.cpp b/tools/bench/kernel/bench_common.hpp similarity index 96% rename from tools/bench/kernel_bench.cpp rename to tools/bench/kernel/bench_common.hpp index 4e9d81ac..120674c6 100644 --- a/tools/bench/kernel_bench.cpp +++ b/tools/bench/kernel/bench_common.hpp @@ -1,3 +1,4 @@ +#pragma once #include "bench_cases.hpp" #include @@ -651,26 +652,3 @@ void append_reference_backend_cases(std::vector & results, } // namespace -namespace emel::bench { - -void append_emel_kernel_cases(std::vector & results, const config & cfg) { - const emel::kernel::x86_64::action::context x86_ctx{}; - const emel::kernel::aarch64::action::context aarch_ctx{}; - - auto x86_exec = [&](const auto & ev) { - return emel::kernel::x86_64::detail::execute_request(ev, x86_ctx); - }; - auto aarch_exec = [&](const auto & ev) { - return emel::kernel::aarch64::detail::execute_request(ev, aarch_ctx); - }; - - append_emel_backend_cases(results, cfg, "x86_64", x86_exec); - append_emel_backend_cases(results, cfg, "aarch64", aarch_exec); -} - -void append_reference_kernel_cases(std::vector & results, const config & cfg) { - append_reference_backend_cases(results, cfg, "x86_64"); - append_reference_backend_cases(results, cfg, "aarch64"); -} - -} // namespace emel::bench diff --git a/tools/bench/kernel/x86_64_bench.cpp b/tools/bench/kernel/x86_64_bench.cpp new file mode 100644 index 00000000..e5e8fd13 --- /dev/null +++ b/tools/bench/kernel/x86_64_bench.cpp @@ -0,0 +1,22 @@ +#include "bench_cases.hpp" + +#include "emel/kernel/x86_64/context.hpp" +#include "emel/kernel/x86_64/detail.hpp" + +#include "kernel/bench_common.hpp" + +namespace emel::bench { + +void append_emel_kernel_x86_64_cases(std::vector & results, const config & cfg) { + const emel::kernel::x86_64::action::context x86_ctx{}; + auto exec = [&](const auto & ev) { + return emel::kernel::x86_64::detail::execute_request(ev, x86_ctx); + }; + append_emel_backend_cases(results, cfg, "x86_64", exec); +} + +void append_reference_kernel_x86_64_cases(std::vector & results, const config & cfg) { + append_reference_backend_cases(results, cfg, "x86_64"); +} + +} // namespace emel::bench diff --git a/tools/bench/logits_bench.cpp b/tools/bench/logits/bench_common.hpp similarity index 63% rename from tools/bench/logits_bench.cpp rename to tools/bench/logits/bench_common.hpp index 29974db2..83565303 100644 --- a/tools/bench/logits_bench.cpp +++ b/tools/bench/logits/bench_common.hpp @@ -1,3 +1,4 @@ +#pragma once #include "bench_cases.hpp" #include @@ -167,85 +168,5 @@ emel::error::type run_sampler_raw(const emel::logits::sampler::event::sample_log return ev.error_out; } -void append_component_cases(std::vector & results, - const emel::bench::config & cfg) { - volatile std::int64_t sink = 0; - - for (const int32_t vocab_size : k_vocab_sizes) { - logits_case_data validator_data{vocab_size}; - int32_t candidate_count_out = 0; - emel::error::type validator_error_out = emel::error::cast(emel::logits::validator::error::none); - emel::logits::validator::event::build build_event{ - validator_data.logits[0], - vocab_size, - validator_data.candidate_ids[0], - validator_data.candidate_scores[0], - vocab_size, - candidate_count_out, - validator_error_out}; - - emel::logits::validator::sm validator_machine{}; - const std::string validator_sml_case = make_case_name("validator", "sml", vocab_size); - auto validator_sml_fn = [&]() { - (void)validator_machine.process_event(build_event); - sink ^= static_cast(candidate_count_out); - }; - results.push_back(emel::bench::measure_case(validator_sml_case.c_str(), cfg, validator_sml_fn)); - - const std::string validator_raw_case = make_case_name("validator", "raw", vocab_size); - auto validator_raw_fn = [&]() { - (void)run_validator_raw(build_event); - sink ^= static_cast(candidate_count_out); - }; - results.push_back(emel::bench::measure_case(validator_raw_case.c_str(), cfg, validator_raw_fn)); - - logits_case_data sampler_data{vocab_size}; - int32_t selected_token_out = -1; - emel::error::type sampler_error_out = emel::error::cast(emel::logits::sampler::error::none); - emel::logits::sampler::event::sample_logits sample_event{ - sampler_data.logits[0], - vocab_size, - sampler_data.candidate_ids[0], - sampler_data.candidate_scores[0], - vocab_size, - selected_token_out, - sampler_error_out}; - - emel::logits::sampler::fn sampler_chain[] = { - emel::logits::sampler::fn::from(), - emel::logits::sampler::fn::from(), - }; - constexpr int32_t sampler_count = static_cast(std::size(sampler_chain)); - - emel::logits::sampler::sm sampler_machine{sampler_chain, sampler_count}; - const std::string sampler_sml_case = make_case_name("sampler", "sml", vocab_size); - auto sampler_sml_fn = [&]() { - (void)sampler_machine.process_event(sample_event); - sink ^= static_cast(selected_token_out); - }; - results.push_back(emel::bench::measure_case(sampler_sml_case.c_str(), cfg, sampler_sml_fn)); - - const std::string sampler_raw_case = make_case_name("sampler", "raw", vocab_size); - auto sampler_raw_fn = [&]() { - (void)run_sampler_raw(sample_event, sampler_chain, sampler_count); - sink ^= static_cast(selected_token_out); - }; - results.push_back(emel::bench::measure_case(sampler_raw_case.c_str(), cfg, sampler_raw_fn)); - } - - (void)sink; -} } // namespace - -namespace emel::bench { - -void append_emel_logits_cases(std::vector & results, const config & cfg) { - append_component_cases(results, cfg); -} - -void append_reference_logits_cases(std::vector & results, const config & cfg) { - append_component_cases(results, cfg); -} - -} // namespace emel::bench diff --git a/tools/bench/logits/sampler_bench.cpp b/tools/bench/logits/sampler_bench.cpp new file mode 100644 index 00000000..38feb2c2 --- /dev/null +++ b/tools/bench/logits/sampler_bench.cpp @@ -0,0 +1,52 @@ +#include "bench_cases.hpp" + +#include "logits/bench_common.hpp" + +namespace emel::bench { + +void append_emel_logits_sampler_cases(std::vector & results, const config & cfg) { + volatile std::int64_t sink = 0; + + for (const int32_t vocab_size : k_vocab_sizes) { + logits_case_data sampler_data{vocab_size}; + int32_t selected_token_out = -1; + emel::error::type sampler_error_out = emel::error::cast(emel::logits::sampler::error::none); + emel::logits::sampler::event::sample_logits sample_event{ + sampler_data.logits[0], + vocab_size, + sampler_data.candidate_ids[0], + sampler_data.candidate_scores[0], + vocab_size, + selected_token_out, + sampler_error_out}; + + emel::logits::sampler::fn sampler_chain[] = { + emel::logits::sampler::fn::from(), + emel::logits::sampler::fn::from(), + }; + constexpr int32_t sampler_count = static_cast(std::size(sampler_chain)); + + emel::logits::sampler::sm sampler_machine{sampler_chain, sampler_count}; + const std::string sampler_sml_case = make_case_name("sampler", "sml", vocab_size); + auto sampler_sml_fn = [&]() { + (void)sampler_machine.process_event(sample_event); + sink ^= static_cast(selected_token_out); + }; + results.push_back(emel::bench::measure_case(sampler_sml_case.c_str(), cfg, sampler_sml_fn)); + + const std::string sampler_raw_case = make_case_name("sampler", "raw", vocab_size); + auto sampler_raw_fn = [&]() { + (void)run_sampler_raw(sample_event, sampler_chain, sampler_count); + sink ^= static_cast(selected_token_out); + }; + results.push_back(emel::bench::measure_case(sampler_raw_case.c_str(), cfg, sampler_raw_fn)); + } + + (void)sink; +} + +void append_reference_logits_sampler_cases(std::vector & results, const config & cfg) { + append_emel_logits_sampler_cases(results, cfg); +} + +} // namespace emel::bench diff --git a/tools/bench/logits/validator_bench.cpp b/tools/bench/logits/validator_bench.cpp new file mode 100644 index 00000000..a816d5a2 --- /dev/null +++ b/tools/bench/logits/validator_bench.cpp @@ -0,0 +1,46 @@ +#include "bench_cases.hpp" + +#include "logits/bench_common.hpp" + +namespace emel::bench { + +void append_emel_logits_validator_cases(std::vector & results, const config & cfg) { + volatile std::int64_t sink = 0; + + for (const int32_t vocab_size : k_vocab_sizes) { + logits_case_data validator_data{vocab_size}; + int32_t candidate_count_out = 0; + emel::error::type validator_error_out = emel::error::cast(emel::logits::validator::error::none); + emel::logits::validator::event::build build_event{ + validator_data.logits[0], + vocab_size, + validator_data.candidate_ids[0], + validator_data.candidate_scores[0], + vocab_size, + candidate_count_out, + validator_error_out}; + + emel::logits::validator::sm validator_machine{}; + const std::string validator_sml_case = make_case_name("validator", "sml", vocab_size); + auto validator_sml_fn = [&]() { + (void)validator_machine.process_event(build_event); + sink ^= static_cast(candidate_count_out); + }; + results.push_back(emel::bench::measure_case(validator_sml_case.c_str(), cfg, validator_sml_fn)); + + const std::string validator_raw_case = make_case_name("validator", "raw", vocab_size); + auto validator_raw_fn = [&]() { + (void)run_validator_raw(build_event); + sink ^= static_cast(candidate_count_out); + }; + results.push_back(emel::bench::measure_case(validator_raw_case.c_str(), cfg, validator_raw_fn)); + } + + (void)sink; +} + +void append_reference_logits_validator_cases(std::vector & results, const config & cfg) { + append_emel_logits_validator_cases(results, cfg); +} + +} // namespace emel::bench diff --git a/tools/bench/text/jinja/formatter_bench.cpp b/tools/bench/text/jinja/formatter_bench.cpp index 29a236a3..50ce0504 100644 --- a/tools/bench/text/jinja/formatter_bench.cpp +++ b/tools/bench/text/jinja/formatter_bench.cpp @@ -11,9 +11,6 @@ #include "jinja/parser.h" #include "jinja/runtime.h" -// TODO(rearchitecture-cleanup): Keep legacy "renderer" benchmark IDs until -// downstream references and snapshot consumers are migrated. - namespace { std::string make_long_template() { @@ -124,7 +121,7 @@ void append_emel_jinja_formatter_cases(std::vector & results, const conf }; (void)machine.process_event(ev); }; - results.push_back(measure_case("jinja/renderer_short", cfg, short_fn)); + results.push_back(measure_case("text/jinja/formatter_short", cfg, short_fn)); auto long_fn = [&]() { std::array buffer = {}; @@ -140,7 +137,7 @@ void append_emel_jinja_formatter_cases(std::vector & results, const conf }; (void)machine.process_event(ev); }; - results.push_back(measure_case("jinja/renderer_long", cfg, long_fn)); + results.push_back(measure_case("text/jinja/formatter_long", cfg, long_fn)); } void append_reference_jinja_formatter_cases(std::vector & results, const config & cfg) { @@ -158,7 +155,7 @@ void append_reference_jinja_formatter_cases(std::vector & results, const auto parts = ::jinja::runtime::gather_string_parts(result); (void)::jinja::render_string_parts(parts); }; - results.push_back(measure_case("jinja/renderer_short", cfg, short_fn)); + results.push_back(measure_case("text/jinja/formatter_short", cfg, short_fn)); auto long_fn = [&]() { ::jinja::context ctx; @@ -174,7 +171,7 @@ void append_reference_jinja_formatter_cases(std::vector & results, const auto parts = ::jinja::runtime::gather_string_parts(result); (void)::jinja::render_string_parts(parts); }; - results.push_back(measure_case("jinja/renderer_long", cfg, long_fn)); + results.push_back(measure_case("text/jinja/formatter_long", cfg, long_fn)); } } // namespace emel::bench diff --git a/tools/bench/text/jinja/parser_bench.cpp b/tools/bench/text/jinja/parser_bench.cpp index aea3df22..e5433e2a 100644 --- a/tools/bench/text/jinja/parser_bench.cpp +++ b/tools/bench/text/jinja/parser_bench.cpp @@ -65,7 +65,7 @@ void append_emel_jinja_parser_cases(std::vector & results, const config emel::text::jinja::parser::detail::recursive_descent_parser parser{program}; (void)parser.parse(lex_res); }; - results.push_back(measure_case("jinja/parser_short", cfg, short_fn)); + results.push_back(measure_case("text/jinja/parser_short", cfg, short_fn)); auto long_fn = [&]() { emel::text::jinja::lexer_result lex_res = lex.tokenize(long_template); @@ -73,7 +73,7 @@ void append_emel_jinja_parser_cases(std::vector & results, const config emel::text::jinja::parser::detail::recursive_descent_parser parser{program}; (void)parser.parse(lex_res); }; - results.push_back(measure_case("jinja/parser_long", cfg, long_fn)); + results.push_back(measure_case("text/jinja/parser_long", cfg, long_fn)); } void append_reference_jinja_parser_cases(std::vector & results, const config & cfg) { @@ -88,13 +88,13 @@ void append_reference_jinja_parser_cases(std::vector & results, const co ::jinja::lexer_result lex_res = lex.tokenize(short_template); (void)::jinja::parse_from_tokens(lex_res); }; - results.push_back(measure_case("jinja/parser_short", cfg, short_fn)); + results.push_back(measure_case("text/jinja/parser_short", cfg, short_fn)); auto long_fn = [&]() { ::jinja::lexer_result lex_res = lex.tokenize(long_template); (void)::jinja::parse_from_tokens(lex_res); }; - results.push_back(measure_case("jinja/parser_long", cfg, long_fn)); + results.push_back(measure_case("text/jinja/parser_long", cfg, long_fn)); } } // namespace emel::bench From 442c1979a36a177829e9dafe04327199e7109c5e Mon Sep 17 00:00:00 2001 From: gabewillen Date: Sun, 1 Mar 2026 23:30:33 -0600 Subject: [PATCH 3/3] bench: fix bpe encoder cases and skip unsupported kernel arch --- tools/bench/bench_main.cpp | 30 ++++++++++++++++ tools/bench/text/encoders/bench_common.hpp | 40 ++++++++++++++++------ tools/bench/text/encoders/bpe_bench.cpp | 14 +++++--- 3 files changed, 70 insertions(+), 14 deletions(-) diff --git a/tools/bench/bench_main.cpp b/tools/bench/bench_main.cpp index 1bf0cc3e..2ce6f919 100644 --- a/tools/bench/bench_main.cpp +++ b/tools/bench/bench_main.cpp @@ -19,6 +19,33 @@ constexpr std::size_t k_default_runs = 5; constexpr std::uint64_t k_default_warmup_iterations = 1000; constexpr std::size_t k_default_warmup_runs = 1; constexpr std::size_t k_max_runs = 25; + +constexpr bool k_host_is_x86_64 = +#if defined(__x86_64__) || defined(_M_X64) + true; +#else + false; +#endif + +constexpr bool k_host_is_aarch64 = +#if defined(__aarch64__) || defined(_M_ARM64) + true; +#else + false; +#endif + +bool case_supported_on_host(const bench::test_case & tc) { + if (tc.append_emel == bench::append_emel_kernel_x86_64_cases || + tc.append_reference == bench::append_reference_kernel_x86_64_cases) { + return k_host_is_x86_64; + } + if (tc.append_emel == bench::append_emel_kernel_aarch64_cases || + tc.append_reference == bench::append_reference_kernel_aarch64_cases) { + return k_host_is_aarch64; + } + return true; +} + std::uint64_t read_env_u64(const char * name, std::uint64_t fallback) { const char * value = std::getenv(name); if (value == nullptr || value[0] == '\0') { @@ -125,6 +152,9 @@ std::vector run_benchmarks(const bench::config & cfg, if (tc.tokenizer_case && !include_tokenizer) { continue; } + if (!case_supported_on_host(tc)) { + continue; + } bench::append_test_case(results, cfg, tc, reference); } diff --git a/tools/bench/text/encoders/bench_common.hpp b/tools/bench/text/encoders/bench_common.hpp index 1b6201a7..61851bf3 100644 --- a/tools/bench/text/encoders/bench_common.hpp +++ b/tools/bench/text/encoders/bench_common.hpp @@ -124,16 +124,14 @@ inline void ensure_encodes(machine_type & machine, } template -inline void append_emel_encoder_cases(std::vector & results, - const config & cfg, - const char * short_name, - const char * long_name, - build_vocab_fn build_vocab, - const bool preprocessed, - const int short_repeats = 1, - const int long_repeats = 64) { - const std::string short_text = make_repeated_text(short_repeats); - const std::string long_text = make_repeated_text(long_repeats); +inline void append_emel_encoder_cases_with_text(std::vector & results, + const config & cfg, + const char * short_name, + const char * long_name, + build_vocab_fn build_vocab, + const bool preprocessed, + const std::string_view short_text, + const std::string_view long_text) { auto vocab = build_vocab(); machine_type machine{}; @@ -168,4 +166,26 @@ inline void append_emel_encoder_cases(std::vector & results, results.push_back(measure_case(long_name, cfg, long_fn)); } +template +inline void append_emel_encoder_cases(std::vector & results, + const config & cfg, + const char * short_name, + const char * long_name, + build_vocab_fn build_vocab, + const bool preprocessed, + const int short_repeats = 1, + const int long_repeats = 64) { + const std::string short_text = make_repeated_text(short_repeats); + const std::string long_text = make_repeated_text(long_repeats); + append_emel_encoder_cases_with_text( + results, + cfg, + short_name, + long_name, + build_vocab, + preprocessed, + short_text, + long_text); +} + } // namespace emel::bench::encoder_bench diff --git a/tools/bench/text/encoders/bpe_bench.cpp b/tools/bench/text/encoders/bpe_bench.cpp index 3ae199fb..2ea78ebf 100644 --- a/tools/bench/text/encoders/bpe_bench.cpp +++ b/tools/bench/text/encoders/bpe_bench.cpp @@ -16,8 +16,7 @@ std::unique_ptr make_bpe_vocab() { vocab->ignore_merges = true; emel::bench::encoder_bench::add_all_byte_tokens(*vocab); (void)emel::bench::encoder_bench::add_token(*vocab, "hello", 0.5f, 1); - (void)emel::bench::encoder_bench::add_token(*vocab, "world", 0.5f, 1); - (void)emel::bench::encoder_bench::add_token(*vocab, " ", 0.1f, 1); + (void)emel::bench::encoder_bench::add_token(*vocab, " world", 0.5f, 1); return vocab; } @@ -26,8 +25,15 @@ std::unique_ptr make_bpe_vocab() { namespace emel::bench { void append_emel_encoder_bpe_cases(std::vector & results, const config & cfg) { - encoder_bench::append_emel_encoder_cases( - results, cfg, "text/encoders/bpe_short", "text/encoders/bpe_long", make_bpe_vocab, true); + encoder_bench::append_emel_encoder_cases_with_text( + results, + cfg, + "text/encoders/bpe_short", + "text/encoders/bpe_long", + make_bpe_vocab, + true, + "hello", + " world"); } void append_reference_encoder_bpe_cases(std::vector & results, const config & cfg) {