From f3577ebf0fa32f77c88a104fee0b8c24a63f0503 Mon Sep 17 00:00:00 2001 From: idevasena Date: Wed, 10 Dec 2025 18:34:23 -0800 Subject: [PATCH 01/43] vdb_benchmark commit with unit tests --- vdb_benchmark/.gitignore | 180 ++++ vdb_benchmark/LICENSE | 201 +++++ vdb_benchmark/README.md | 125 +++ vdb_benchmark/docker-compose.yml | 68 ++ vdb_benchmark/list_collections.py | 153 ++++ vdb_benchmark/pyproject.toml | 36 + vdb_benchmark/tests/Makefile | 165 ++++ vdb_benchmark/tests/README.md | 404 +++++++++ vdb_benchmark/tests/fixtures/test_config.yaml | 54 ++ vdb_benchmark/tests/requirements.txt | 66 ++ vdb_benchmark/tests/tests/__init__.py | 17 + vdb_benchmark/tests/tests/conftest.py | 180 ++++ vdb_benchmark/tests/tests/run_tests.py | 346 ++++++++ .../tests/tests/test_compact_and_watch.py | 701 +++++++++++++++ vdb_benchmark/tests/tests/test_config.py | 359 ++++++++ .../tests/tests/test_database_connection.py | 538 ++++++++++++ .../tests/tests/test_index_management.py | 825 ++++++++++++++++++ vdb_benchmark/tests/tests/test_load_vdb.py | 530 +++++++++++ .../tests/tests/test_simple_bench.py | 766 ++++++++++++++++ .../tests/tests/test_vector_generation.py | 369 ++++++++ vdb_benchmark/tests/tests/verify_fixes.py | 81 ++ vdb_benchmark/tests/utils/__init__.py | 47 + vdb_benchmark/tests/utils/mock_data.py | 415 +++++++++ vdb_benchmark/tests/utils/test_helpers.py | 458 ++++++++++ vdb_benchmark/vdbbench/__init__.py | 0 vdb_benchmark/vdbbench/compact_and_watch.py | 292 +++++++ vdb_benchmark/vdbbench/config_loader.py | 60 ++ .../vdbbench/configs/10m_diskann.yaml | 26 + vdb_benchmark/vdbbench/configs/10m_hnsw.yaml | 26 + .../vdbbench/configs/1m_diskann.yaml | 26 + vdb_benchmark/vdbbench/configs/1m_hnsw.yaml | 26 + vdb_benchmark/vdbbench/list_collections.py | 183 ++++ vdb_benchmark/vdbbench/load_vdb.py | 370 ++++++++ vdb_benchmark/vdbbench/simple_bench.py | 668 ++++++++++++++ 34 files changed, 8761 insertions(+) create mode 100644 vdb_benchmark/.gitignore create mode 100644 
vdb_benchmark/LICENSE create mode 100644 vdb_benchmark/README.md create mode 100644 vdb_benchmark/docker-compose.yml create mode 100644 vdb_benchmark/list_collections.py create mode 100644 vdb_benchmark/pyproject.toml create mode 100755 vdb_benchmark/tests/Makefile create mode 100755 vdb_benchmark/tests/README.md create mode 100755 vdb_benchmark/tests/fixtures/test_config.yaml create mode 100755 vdb_benchmark/tests/requirements.txt create mode 100755 vdb_benchmark/tests/tests/__init__.py create mode 100755 vdb_benchmark/tests/tests/conftest.py create mode 100755 vdb_benchmark/tests/tests/run_tests.py create mode 100755 vdb_benchmark/tests/tests/test_compact_and_watch.py create mode 100755 vdb_benchmark/tests/tests/test_config.py create mode 100755 vdb_benchmark/tests/tests/test_database_connection.py create mode 100755 vdb_benchmark/tests/tests/test_index_management.py create mode 100755 vdb_benchmark/tests/tests/test_load_vdb.py create mode 100755 vdb_benchmark/tests/tests/test_simple_bench.py create mode 100755 vdb_benchmark/tests/tests/test_vector_generation.py create mode 100755 vdb_benchmark/tests/tests/verify_fixes.py create mode 100755 vdb_benchmark/tests/utils/__init__.py create mode 100755 vdb_benchmark/tests/utils/mock_data.py create mode 100755 vdb_benchmark/tests/utils/test_helpers.py create mode 100644 vdb_benchmark/vdbbench/__init__.py create mode 100644 vdb_benchmark/vdbbench/compact_and_watch.py create mode 100644 vdb_benchmark/vdbbench/config_loader.py create mode 100644 vdb_benchmark/vdbbench/configs/10m_diskann.yaml create mode 100644 vdb_benchmark/vdbbench/configs/10m_hnsw.yaml create mode 100644 vdb_benchmark/vdbbench/configs/1m_diskann.yaml create mode 100644 vdb_benchmark/vdbbench/configs/1m_hnsw.yaml create mode 100644 vdb_benchmark/vdbbench/list_collections.py create mode 100644 vdb_benchmark/vdbbench/load_vdb.py create mode 100644 vdb_benchmark/vdbbench/simple_bench.py diff --git a/vdb_benchmark/.gitignore b/vdb_benchmark/.gitignore new 
file mode 100644 index 00000000..95b3f05e --- /dev/null +++ b/vdb_benchmark/.gitignore @@ -0,0 +1,180 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +tests/tests/__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ +tests/.benchmarks/ +tests/.coverage +tests/tests/coverage_html/ +tests/tests/test_results.* +tests/tests/test_report.* + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# UV +# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. 
+# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +#uv.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/latest/usage/project/#working-with-version-control +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
+#.idea/ + +# Ruff stuff: +.ruff_cache/ + +# PyPI configuration file +.pypirc diff --git a/vdb_benchmark/LICENSE b/vdb_benchmark/LICENSE new file mode 100644 index 00000000..261eeb9e --- /dev/null +++ b/vdb_benchmark/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. 
Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative 
Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/vdb_benchmark/README.md b/vdb_benchmark/README.md new file mode 100644 index 00000000..e8ea20e4 --- /dev/null +++ b/vdb_benchmark/README.md @@ -0,0 +1,125 @@ +# Vector Database Benchmark Tool +This tool allows you to benchmark and compare the performance of vector databases with current support for Milvus and others planned. + +## Installation + +### Using Docker (recommended) +1. Clone the repository: +``` bash +git clone -b TF_VDBBench https://github.com/mlcommons/storage.git +cd storage/vdb_benchmark +``` +2. Build and run the Docker container: +```bash +docker compose up -d # with docker-compose-v2. v1 uses docker-compose up +``` + +### Manual Installation +1. Clone the repository: +```bash +git clone -b TF_VDBBench https://github.com/mlcommons/storage.git +cd storage/vdb_benchmark +``` + +2. 
Install the package: +```bash +pip3 install ./ +``` + +## Deploying a Standalone Milvus Instance +The docker-compose.yml file will configure a 3-container instance of Milvus database. + - Milvus Database + - Minio Object Storage + - etcd + +The docker-compose.yml file uses ```/mnt/vdb``` as the root directory for the required docker volumes. You can modify the compose file for your environment or ensure that your target storage is mounted at this location. + +For testing more than one storage solution, there are two methods: +1. Create a set of containers for each storage solution with modified docker-compose.yml files pointing to different root directories. Each set of containers will also need a different port to listen on. You may need to limit how many instances you can run depending on the available memory in your system +2. Bring down the containers, copy the /mnt/vdb data to another location, change the mount point to point to the new location. Bring the containers back up. This is simpler as the database connection isn't changing but you need to manually reconfigure the storage to change the system under test. + +### Deployment +```bash +cd storage/vdb_benchmark +docker compose up -d # with docker-compose-v2. v1 uses docker-compose up +``` + +```-d``` option is required to detach from the containers after starting them. Without this option you will be attached to the log output of the set of containers and ```ctrl+c``` will stop the containers. + +*If you have connection problems with a proxy I recommend this link: https://medium.com/@SrvZ/docker-proxy-and-my-struggles-a4fd6de21861* + +## Running the Benchmark +The benchmark process consists of three main steps: +1. Loading vectors into the database +2. Monitoring and compacting the database +3. 
Running the benchmark queries + +### Step 1: Load Vectors into the Database +Use the load_vdb.py script to generate and load 10 million vectors into your vector database: (this process can take up to 8 hours) +```bash +python vdbbench/load_vdb.py --config vdbbench/configs/10m_diskann.yaml +``` + + +For testing, I recommend using a smaller data by passing the num_vectors option: +```bash +python vdbbench/load_vdb.py --config vdbbench/configs/10m_diskann.yaml --collection-name mlps_500k_10shards_1536dim_uniform_diskann --num-vectors 500000 +``` + +Key parameters: +* --collection-name: Name of the collection to create +* --dimension: Vector dimension +* --num-vectors: Number of vectors to generate +* --chunk-size: Number of vectors to generate in each chunk (for memory management) +* --distribution: Distribution for vector generation (uniform, normal) +* --batch-size: Batch size for insertion + +Example configuration file (vdbbench/configs/10m_diskann.yaml): +```yaml +database: + host: 127.0.0.1 + port: 19530 + database: milvus + max_receive_message_length: 514_983_574 + max_send_message_length: 514_983_574 + +dataset: + collection_name: mlps_10m_10shards_1536dim_uniform_diskann + num_vectors: 10_000_000 + dimension: 1536 + distribution: uniform + batch_size: 1000 + num_shards: 10 + vector_dtype: FLOAT_VECTOR + +index: + index_type: DISKANN + metric_type: COSINE + #index_params + max_degree: 64 + search_list_size: 200 + +workflow: + compact: True +``` + +### Step 2: Monitor and Compact the Database +The compact_and_watch.py script monitors the database and performs compaction. You should only need this if the load process exits out while waiting. The load script will do compaction and will wait for it to complete. +```bash +python vdbbench/compact_and_watch.py --config vdbbench/configs/10m_diskann.yaml --interval 5 +``` +This step is automatically performed at the end of the loading process if you set compact: true in your configuration. 
+ +### Step 3: Run the Benchmark +Finally, run the benchmark using the simple_bench.py script: +```bash +python vdbbench/simple_bench.py --host 127.0.0.1 --collection --processes --batch-size --runtime +``` + +For comparison with HNSW indexing, use ```vdbbench/configs/10m_hnsw.yaml``` and update collection_name accordingly. + +## Supported Databases +Milvus with DiskANN & HNSW indexing (currently implemented) + +# Contributing +Contributions are welcome! Please feel free to submit a Pull Request. diff --git a/vdb_benchmark/docker-compose.yml b/vdb_benchmark/docker-compose.yml new file mode 100644 index 00000000..4c69af29 --- /dev/null +++ b/vdb_benchmark/docker-compose.yml @@ -0,0 +1,68 @@ +version: '3.5' + +services: + etcd: + container_name: milvus-etcd + image: quay.io/coreos/etcd:v3.5.18 + environment: + - ETCD_AUTO_COMPACTION_MODE=revision + - ETCD_AUTO_COMPACTION_RETENTION=1000 + - ETCD_QUOTA_BACKEND_BYTES=4294967296 + - ETCD_SNAPSHOT_COUNT=50000 + volumes: + - /mnt/vdb/etcd:/etcd + command: etcd -advertise-client-urls=http://etcd:2379 -listen-client-urls http://0.0.0.0:2379 --data-dir /etcd + ports: + - "2379:2379" + healthcheck: + test: ["CMD", "etcdctl", "endpoint", "health"] + interval: 30s + timeout: 20s + retries: 3 + + minio: + container_name: milvus-minio + image: minio/minio:RELEASE.2023-03-20T20-16-18Z + environment: + MINIO_ACCESS_KEY: minioadmin + MINIO_SECRET_KEY: minioadmin + ports: + - "9001:9001" + - "9000:9000" + volumes: + - /mnt/vdb/minio:/minio_data + command: minio server /minio_data --console-address ":9001" + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"] + interval: 30s + timeout: 20s + retries: 3 + + standalone: + container_name: milvus-standalone + image: milvusdb/milvus:v2.5.10 + command: ["milvus", "run", "standalone"] + security_opt: + - seccomp:unconfined + environment: + MINIO_REGION: us-east-1 + ETCD_ENDPOINTS: etcd:2379 + MINIO_ADDRESS: minio:9000 + volumes: + - 
/mnt/vdb/milvus:/var/lib/milvus + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:9091/healthz"] + interval: 30s + start_period: 90s + timeout: 20s + retries: 3 + ports: + - "19530:19530" + - "9091:9091" + depends_on: + - "etcd" + - "minio" + +networks: + default: + name: milvus diff --git a/vdb_benchmark/list_collections.py b/vdb_benchmark/list_collections.py new file mode 100644 index 00000000..a83b2f8a --- /dev/null +++ b/vdb_benchmark/list_collections.py @@ -0,0 +1,153 @@ +#!/usr/bin/env python3 +""" +Milvus Collection Lister + +This script connects to a local Milvus database and lists all collections +along with the number of vectors in each collection. +""" + +import argparse +import sys +from typing import Dict, List, Tuple + +try: + from pymilvus import connections, utility + from pymilvus.exceptions import MilvusException +except ImportError: + print("Error: pymilvus package not found. Please install it with 'pip install pymilvus'") + sys.exit(1) + + +def parse_args() -> argparse.Namespace: + """Parse command line arguments""" + parser = argparse.ArgumentParser(description="List Milvus collections and their vector counts") + parser.add_argument("--host", type=str, default="127.0.0.1", + help="Milvus server host (default: 127.0.0.1)") + parser.add_argument("--port", type=str, default="19530", + help="Milvus server port (default: 19530)") + parser.add_argument("--verbose", "-v", action="store_true", + help="Show detailed collection information") + return parser.parse_args() + + +def connect_to_milvus(host: str, port: str) -> bool: + """Establish connection to Milvus server""" + try: + connections.connect( + alias="default", + host=host, + port=port, + max_receive_message_length=514983574, + max_send_message_length=514983574 + ) + return True + except Exception as e: + print(f"Failed to connect to Milvus: {e}") + return False + + +def get_collections_info() -> List[Dict]: + """Get information about all collections""" + try: + collection_names = 
utility.list_collections() + collections_info = [] + + for name in collection_names: + from pymilvus import Collection + collection = Collection(name) + + # Get collection statistics - using num_entities instead of get_stats() + row_count = collection.num_entities + + # Get collection schema + schema = collection.schema + description = schema.description if schema.description else "No description" + + # Get vector field dimension + vector_field = None + vector_dim = None + for field in schema.fields: + if field.dtype == 100: # DataType.FLOAT_VECTOR + vector_field = field.name + vector_dim = field.params.get("dim") + break + + # Get index information + index_info = [] + try: + for field_name in collection.schema.fields: + if collection.has_index(field_name.name): + index = collection.index(field_name.name) + index_info.append({ + "field": field_name.name, + "index_type": index.params.get("index_type"), + "metric_type": index.params.get("metric_type"), + "params": index.params.get("params", {}) + }) + except Exception as e: + index_info = [{"error": str(e)}] + + collections_info.append({ + "name": name, + "row_count": row_count, + "description": description, + "vector_field": vector_field, + "vector_dim": vector_dim, + "index_info": index_info + }) + + return collections_info + except MilvusException as e: + print(f"Error retrieving collection information: {e}") + return [] + + +def main() -> int: + """Main function""" + args = parse_args() + + # Connect to Milvus + if not connect_to_milvus(args.host, args.port): + return 1 + + print(f"Connected to Milvus server at {args.host}:{args.port}") + + # Get collections information + collections_info = get_collections_info() + + if not collections_info: + print("No collections found.") + return 0 + + # Display collections information + print(f"\nFound {len(collections_info)} collections:") + print("-" * 80) + + for info in collections_info: + print(f"Collection: {info['name']}") + print(f" Vectors: {info['row_count']:,}") + 
print(f" Vector Field: {info['vector_field']} (dim: {info['vector_dim']})") + + if args.verbose: + print(f" Description: {info['description']}") + + if info['index_info']: + print(" Indexes:") + for idx in info['index_info']: + if "error" in idx: + print(f" Error retrieving index info: {idx['error']}") + else: + print(f" Field: {idx['field']}") + print(f" Type: {idx['index_type']}") + print(f" Metric: {idx['metric_type']}") + print(f" Params: {idx['params']}") + else: + print(" Indexes: None") + + print("-" * 80) + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) \ No newline at end of file diff --git a/vdb_benchmark/pyproject.toml b/vdb_benchmark/pyproject.toml new file mode 100644 index 00000000..f4d56d8f --- /dev/null +++ b/vdb_benchmark/pyproject.toml @@ -0,0 +1,36 @@ +[build-system] +requires = ["setuptools>=42", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "vdbbench" +version = "0.1.0" +description = "Vector Database Benchmarking Tool" +readme = "README.md" +authors = [ + {name = "Vector DB Storage WG TF"} +] +license = {text = "MIT"} +requires-python = ">=3.8" +dependencies = [ + "numpy", + "pandas", + "pymilvus", + "pyyaml", + "tabulate" +] + +[project.urls] +"Homepage" = "https://github.com/mlcommons/storage/tree/TF_VDBBench/vdb_benchmark" +"Bug Tracker" = "https://github.com/mlcommons/storage/issues" + +[project.scripts] +compact-and-watch = "vdbbench.compact_and_watch:main" +load-vdb = "vdbbench.load_vdb:main" +vdbbench = "vdbbench.simple_bench:main" + +[tool.setuptools] +packages = {find = {}} + +[tool.setuptools.package-data] +vdbbench = ["*.py"] diff --git a/vdb_benchmark/tests/Makefile b/vdb_benchmark/tests/Makefile new file mode 100755 index 00000000..742886c7 --- /dev/null +++ b/vdb_benchmark/tests/Makefile @@ -0,0 +1,165 @@ +# Makefile for VDB-Bench Test Suite + +.PHONY: help install test test-all test-config test-connection test-loading \ + test-benchmark test-index test-monitoring test-performance \ + 
test-integration coverage coverage-html clean lint format \ + test-verbose test-failed test-parallel + +# Default target +help: + @echo "VDB-Bench Test Suite Makefile" + @echo "==============================" + @echo "" + @echo "Available targets:" + @echo " make install - Install test dependencies" + @echo " make test - Run all tests" + @echo " make test-verbose - Run tests with verbose output" + @echo " make test-parallel - Run tests in parallel" + @echo " make test-failed - Re-run only failed tests" + @echo "" + @echo "Test categories:" + @echo " make test-config - Run configuration tests" + @echo " make test-connection - Run connection tests" + @echo " make test-loading - Run loading tests" + @echo " make test-benchmark - Run benchmark tests" + @echo " make test-index - Run index management tests" + @echo " make test-monitoring - Run monitoring tests" + @echo "" + @echo "Special test suites:" + @echo " make test-performance - Run performance tests" + @echo " make test-integration - Run integration tests" + @echo "" + @echo "Coverage and reports:" + @echo " make coverage - Run tests with coverage" + @echo " make coverage-html - Generate HTML coverage report" + @echo "" + @echo "Code quality:" + @echo " make lint - Run code linting" + @echo " make format - Format code with black" + @echo "" + @echo "Maintenance:" + @echo " make clean - Clean test artifacts" + +# Installation +install: + pip install -r tests/requirements-test.txt + pip install -e . 
+ +# Basic test execution +test: + python tests/run_tests.py + +test-all: test + +test-verbose: + python tests/run_tests.py --verbose + +test-parallel: + pytest tests/ -n auto --dist loadscope + +test-failed: + pytest tests/ --lf + +# Test categories +test-config: + python tests/run_tests.py --category config + +test-connection: + python tests/run_tests.py --category connection + +test-loading: + python tests/run_tests.py --category loading + +test-benchmark: + python tests/run_tests.py --category benchmark + +test-index: + python tests/run_tests.py --category index + +test-monitoring: + python tests/run_tests.py --category monitoring + +# Special test suites +test-performance: + python tests/run_tests.py --performance + +test-integration: + python tests/run_tests.py --integration + +# Coverage +coverage: + pytest tests/ --cov=vdbbench --cov-report=term --cov-report=html + +coverage-html: coverage + @echo "Opening coverage report in browser..." + @python -m webbrowser tests/htmlcov/index.html + +# Code quality +lint: + @echo "Running flake8..." + flake8 tests/ --max-line-length=100 --ignore=E203,W503 + @echo "Running pylint..." + pylint tests/ --max-line-length=100 --disable=C0111,R0903,R0913 + @echo "Running mypy..." + mypy tests/ --ignore-missing-imports + +format: + black tests/ --line-length=100 + isort tests/ --profile black --line-length=100 + +# Clean up +clean: + @echo "Cleaning test artifacts..." + rm -rf tests/__pycache__ + rm -rf tests/utils/__pycache__ + rm -rf tests/.pytest_cache + rm -rf tests/htmlcov + rm -rf tests/coverage_html + rm -f tests/.coverage + rm -f tests/test_results.xml + rm -f tests/test_results.json + rm -f tests/test_report.html + rm -f tests/*.pyc + rm -rf tests/**/*.pyc + find tests/ -type d -name "__pycache__" -exec rm -rf {} + 2>/dev/null || true + @echo "Clean complete!" 
+ +# Watch mode (requires pytest-watch) +watch: + ptw tests/ -- --verbose + +# Run specific test file +test-file: + @read -p "Enter test file name (without .py): " file; \ + pytest tests/$$file.py -v + +# Run tests matching pattern +test-match: + @read -p "Enter test pattern: " pattern; \ + pytest tests/ -k "$$pattern" -v + +# Generate test report +report: + pytest tests/ --html=tests/test_report.html --self-contained-html + @echo "Test report generated at tests/test_report.html" + +# Check test coverage for specific module +coverage-module: + @read -p "Enter module name: " module; \ + pytest tests/ --cov=vdbbench.$$module --cov-report=term + +# Quick test (fast subset of tests) +test-quick: + pytest tests/ -m "not slow" --maxfail=1 -x + +# Full test suite with all checks +test-full: clean lint test-parallel coverage report + @echo "Full test suite complete!" + +# Continuous Integration target +ci: install lint test-parallel coverage + @echo "CI test suite complete!" + +# Development target (format, lint, and test) +dev: format lint test-verbose + @echo "Development test cycle complete!" diff --git a/vdb_benchmark/tests/README.md b/vdb_benchmark/tests/README.md new file mode 100755 index 00000000..f40c101b --- /dev/null +++ b/vdb_benchmark/tests/README.md @@ -0,0 +1,404 @@ +# VDB-Bench Test Suite + +Comprehensive unit test suite for the vdb-bench vector database benchmarking tool. 
+ +## Overview + +This test suite provides extensive coverage for all components of vdb-bench, including: + +- Configuration management +- Database connections +- Vector generation and loading +- Index management +- Benchmarking operations +- Compaction and monitoring +- Performance metrics + +## Directory Structure + +``` +tests/ +├── __init__.py # Test suite package initialization +├── conftest.py # Pytest configuration and shared fixtures +├── run_tests.py # Main test runner script +├── requirements-test.txt # Testing dependencies +│ +├── test_config.py # Configuration management tests +├── test_database_connection.py # Database connection tests +├── test_load_vdb.py # Vector loading tests +├── test_vector_generation.py # Vector generation tests +├── test_index_management.py # Index management tests +├── test_simple_bench.py # Benchmarking functionality tests +├── test_compact_and_watch.py # Compaction and monitoring tests +│ +├── utils/ # Test utilities +│ ├── __init__.py +│ ├── test_helpers.py # Helper functions and utilities +│ └── mock_data.py # Mock data generators +│ +└── fixtures/ # Test fixtures + └── test_config.yaml # Sample configuration file +``` + +## Installation + +1. Install test dependencies: + +```bash +pip install -r tests/requirements-test.txt +``` + +2. Install vdb-bench in development mode: + +```bash +pip install -e . 
+``` + +## Running Tests + +### Run All Tests + +```bash +# Using pytest directly +pytest tests/ + +# Using the test runner +python tests/run_tests.py + +# With coverage +python tests/run_tests.py --verbose +``` + +### Run Specific Test Categories + +```bash +# Configuration tests +python tests/run_tests.py --category config + +# Connection tests +python tests/run_tests.py --category connection + +# Loading tests +python tests/run_tests.py --category loading + +# Benchmark tests +python tests/run_tests.py --category benchmark + +# Index management tests +python tests/run_tests.py --category index + +# Monitoring tests +python tests/run_tests.py --category monitoring +``` + +### Run Specific Test Modules + +```bash +# Run specific test files +python tests/run_tests.py --modules test_config test_load_vdb + +# Or using pytest +pytest tests/test_config.py tests/test_load_vdb.py +``` + +### Run Performance Tests + +```bash +# Run only performance-related tests +python tests/run_tests.py --performance + +# Or using pytest markers +pytest tests/ -k "performance or benchmark" +``` + +### Run with Verbose Output + +```bash +python tests/run_tests.py --verbose + +# Or with pytest +pytest tests/ -v +``` + +## Test Coverage + +### Generate Coverage Report + +```bash +# Run tests with coverage +pytest tests/ --cov=vdbbench --cov-report=html + +# Or using the test runner +python tests/run_tests.py # Coverage is enabled by default +``` + +### View Coverage Report + +After running tests with coverage, open the HTML report: + +```bash +# Open coverage report in browser +open tests/coverage_html/index.html +``` + +## Test Configuration + +### Environment Variables + +Set these environment variables to configure test behavior: + +```bash +# Database connection +export VDB_BENCH_TEST_HOST=localhost +export VDB_BENCH_TEST_PORT=19530 + +# Test data size +export VDB_BENCH_TEST_VECTORS=1000 +export VDB_BENCH_TEST_DIMENSION=128 + +# Performance test settings +export 
VDB_BENCH_TEST_TIMEOUT=60 +``` + +### Custom Test Configuration + +Create a custom test configuration file: + +```yaml +# tests/custom_config.yaml +test_settings: + use_mock_database: true + vector_count: 5000 + dimension: 256 + test_timeout: 30 +``` + +## Writing New Tests + +### Test Structure + +Follow this template for new test files: + +```python +""" +Unit tests for [component name] +""" +import pytest +from unittest.mock import Mock, patch +import numpy as np + +class TestComponentName: + """Test [component] functionality.""" + + def test_basic_operation(self): + """Test basic [operation].""" + # Test implementation + assert result == expected + + @pytest.mark.parametrize("input,expected", [ + (1, 2), + (2, 4), + (3, 6), + ]) + def test_parametrized(self, input, expected): + """Test with multiple inputs.""" + result = function_under_test(input) + assert result == expected + + @pytest.mark.skipif(condition, reason="Reason for skipping") + def test_conditional(self): + """Test that runs conditionally.""" + pass +``` + +### Using Fixtures + +Common fixtures are available in `conftest.py`: + +```python +def test_with_fixtures(mock_collection, sample_vectors, temp_config_file): + """Test using provided fixtures.""" + # mock_collection: Mock Milvus collection + # sample_vectors: Pre-generated test vectors + # temp_config_file: Temporary config file path + + result = process_vectors(mock_collection, sample_vectors) + assert result is not None +``` + +### Adding Mock Data + +Use mock data generators from `utils/mock_data.py`: + +```python +from tests.utils.mock_data import MockDataGenerator + +def test_with_mock_data(): + """Test using mock data generators.""" + generator = MockDataGenerator(seed=42) + + # Generate SIFT-like vectors + vectors = generator.generate_sift_like_vectors(1000, 128) + + # Generate deep learning embeddings + embeddings = generator.generate_deep_learning_embeddings( + 500, 768, model_type="bert" + ) +``` + +## Test Reports + +### HTML Report 
+ +Tests automatically generate an HTML report: + +```bash +# View test report +open tests/test_report.html +``` + +### JUnit XML Report + +JUnit XML format for CI/CD integration: + +```bash +# Located at +tests/test_results.xml +``` + +### JSON Results + +Detailed test results in JSON format: + +```bash +# Located at +tests/test_results.json +``` + +## Continuous Integration + +### GitHub Actions Example + +```yaml +name: Tests + +on: [push, pull_request] + +jobs: + test: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v2 + + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: '3.9' + + - name: Install dependencies + run: | + pip install -r tests/requirements-test.txt + pip install -e . + + - name: Run tests + run: python tests/run_tests.py --verbose + + - name: Upload coverage + uses: codecov/codecov-action@v2 +``` + +## Debugging Tests + +### Run Tests in Debug Mode + +```bash +# Run with pytest debugging +pytest tests/ --pdb + +# Run specific test with debugging +pytest tests/test_config.py::TestConfigurationLoader::test_load_valid_config --pdb +``` + +### Increase Verbosity + +```bash +# Maximum verbosity +pytest tests/ -vvv + +# Show print statements +pytest tests/ -s +``` + +### Run Failed Tests Only + +```bash +# Re-run only failed tests from last run +pytest tests/ --lf + +# Run failed tests first, then others +pytest tests/ --ff +``` + +## Performance Testing + +### Run Benchmark Tests + +```bash +# Run with benchmark plugin +pytest tests/ --benchmark-only + +# Save benchmark results +pytest tests/ --benchmark-save=results + +# Compare benchmark results +pytest tests/ --benchmark-compare=results +``` + +### Memory Profiling + +```bash +# Profile memory usage +python -m memory_profiler tests/test_load_vdb.py +``` + +## Best Practices + +1. **Isolation**: Each test should be independent +2. **Mocking**: Mock external dependencies (database, file I/O) +3. **Fixtures**: Use fixtures for common setup +4. 
**Parametrization**: Test multiple inputs with parametrize +5. **Assertions**: Use clear, specific assertions +6. **Documentation**: Document complex test logic +7. **Performance**: Keep tests fast (< 1 second each) +8. **Coverage**: Aim for >80% code coverage + +## Troubleshooting + +### Common Issues + +1. **Import Errors**: Ensure vdb-bench is installed in development mode +2. **Mock Failures**: Check that pymilvus mocks are properly configured +3. **Timeout Issues**: Increase timeout for slow tests +4. **Resource Issues**: Some tests may require more memory/CPU + +### Getting Help + +For issues or questions: +1. Check test logs in `tests/test_results.json` +2. Review HTML report at `tests/test_report.html` +3. Enable verbose mode for detailed output +4. Check fixture definitions in `conftest.py` + +## Contributing + +When contributing new features, please: +1. Add corresponding unit tests +2. Ensure all tests pass +3. Maintain or improve code coverage +4. Follow the existing test structure +5. Update this README if needed + +## License + +Same as vdb-bench main project. 
diff --git a/vdb_benchmark/tests/fixtures/test_config.yaml b/vdb_benchmark/tests/fixtures/test_config.yaml new file mode 100755 index 00000000..360f34f1 --- /dev/null +++ b/vdb_benchmark/tests/fixtures/test_config.yaml @@ -0,0 +1,54 @@ +# Test configuration for vdb-bench unit tests +database: + host: 127.0.0.1 + port: 19530 + database: test_milvus + timeout: 30 + max_receive_message_length: 514983574 + max_send_message_length: 514983574 + +dataset: + collection_name: test_collection_sample + num_vectors: 10000 + dimension: 128 + distribution: uniform + batch_size: 500 + chunk_size: 1000 + num_shards: 2 + vector_dtype: FLOAT_VECTOR + +index: + index_type: HNSW + metric_type: L2 + params: + M: 16 + efConstruction: 200 + ef: 64 + +benchmark: + num_queries: 1000 + top_k: 10 + batch_size: 100 + num_processes: 4 + runtime: 60 + warmup_queries: 100 + +monitoring: + enabled: true + interval: 5 + metrics: + - qps + - latency + - recall + - memory_usage + +workflow: + compact: true + compact_threshold: 0.2 + flush_interval: 10000 + auto_index: true + +logging: + level: INFO + file: test_benchmark.log + format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s" diff --git a/vdb_benchmark/tests/requirements.txt b/vdb_benchmark/tests/requirements.txt new file mode 100755 index 00000000..32f8b91a --- /dev/null +++ b/vdb_benchmark/tests/requirements.txt @@ -0,0 +1,66 @@ +# Testing Dependencies for vdb-bench + +# Core testing frameworks +pytest>=7.4.0 +pytest-cov>=4.1.0 +pytest-html>=3.2.0 +pytest-xdist>=3.3.1 # For parallel test execution +pytest-timeout>=2.1.0 +pytest-mock>=3.11.1 + +# Coverage tools +coverage>=7.2.7 +coverage-badge>=1.1.0 + +# Mocking and fixtures +mock>=5.1.0 +faker>=19.2.0 +factory-boy>=3.3.0 + +# Data generation and manipulation +numpy>=1.24.3 +pandas>=2.0.3 +scipy>=1.11.1 + +# File handling +pyyaml>=6.0 +h5py>=3.9.0 + +# System monitoring (for testing monitoring features) +psutil>=5.9.5 + +# HTTP mocking (if needed for API tests) +responses>=0.23.1 
+requests-mock>=1.11.0 + +# Async testing support +pytest-asyncio>=0.21.1 +aiofiles>=23.1.0 + +# Performance testing +pytest-benchmark>=4.0.0 +memory-profiler>=0.61.0 + +# Code quality +black>=23.7.0 +flake8>=6.0.0 +mypy>=1.4.1 +pylint>=2.17.4 + +# Documentation +sphinx>=7.0.1 +sphinx-rtd-theme>=1.2.2 + +# Milvus client (for integration tests) +pymilvus>=2.3.0 + +# Additional utilities +python-dotenv>=1.0.0 +click>=8.1.6 +colorama>=0.4.6 +tabulate>=0.9.0 +tqdm>=4.65.0 + +# Optional: for generating test reports +junitparser>=3.1.0 +allure-pytest>=2.13.2 diff --git a/vdb_benchmark/tests/tests/__init__.py b/vdb_benchmark/tests/tests/__init__.py new file mode 100755 index 00000000..241de820 --- /dev/null +++ b/vdb_benchmark/tests/tests/__init__.py @@ -0,0 +1,17 @@ +""" +VDB-Bench Test Suite + +Comprehensive unit tests for the vdb-bench vector database benchmarking tool. +""" + +__version__ = "1.0.0" + +# Test categories +TEST_CATEGORIES = [ + "configuration", + "database_connection", + "vector_loading", + "benchmarking", + "compaction", + "monitoring" +] diff --git a/vdb_benchmark/tests/tests/conftest.py b/vdb_benchmark/tests/tests/conftest.py new file mode 100755 index 00000000..48a0354f --- /dev/null +++ b/vdb_benchmark/tests/tests/conftest.py @@ -0,0 +1,180 @@ +""" +Pytest configuration and fixtures for vdb-bench tests +""" +import pytest +import yaml +import tempfile +import shutil +from pathlib import Path +from unittest.mock import Mock, MagicMock, patch +import numpy as np +from typing import Dict, Any, Generator +import os + +# Mock pymilvus if not installed +try: + from pymilvus import connections, Collection, utility +except ImportError: + connections = MagicMock() + Collection = MagicMock() + utility = MagicMock() + + +@pytest.fixture(scope="session") +def test_data_dir() -> Path: + """Create a temporary directory for test data that persists for the session.""" + temp_dir = Path(tempfile.mkdtemp(prefix="vdb_bench_test_")) + yield temp_dir + 
shutil.rmtree(temp_dir) + + +@pytest.fixture(scope="function") +def temp_config_file(test_data_dir) -> Generator[Path, None, None]: + """Create a temporary configuration file for testing.""" + config_path = test_data_dir / "test_config.yaml" + config_data = { + "database": { + "host": "127.0.0.1", + "port": 19530, + "database": "milvus_test", + "max_receive_message_length": 514983574, + "max_send_message_length": 514983574 + }, + "dataset": { + "collection_name": "test_collection", + "num_vectors": 1000, + "dimension": 128, + "distribution": "uniform", + "batch_size": 100, + "num_shards": 2, + "vector_dtype": "FLOAT_VECTOR" + }, + "index": { + "index_type": "DISKANN", + "metric_type": "COSINE", + "max_degree": 64, + "search_list_size": 200 + }, + "workflow": { + "compact": True + } + } + + with open(config_path, 'w') as f: + yaml.dump(config_data, f) + + yield config_path + + if config_path.exists(): + config_path.unlink() + + +@pytest.fixture +def mock_milvus_connection(): + """Mock Milvus connection for testing.""" + with patch('pymilvus.connections.connect') as mock_connect: + mock_connect.return_value = Mock() + yield mock_connect + + +@pytest.fixture +def mock_collection(): + """Mock Milvus collection for testing.""" + mock_coll = Mock(spec=Collection) + mock_coll.name = "test_collection" + mock_coll.schema = Mock() + mock_coll.num_entities = 1000 + mock_coll.insert = Mock(return_value=Mock(primary_keys=[1, 2, 3])) + mock_coll.create_index = Mock() + mock_coll.load = Mock() + mock_coll.release = Mock() + mock_coll.flush = Mock() + mock_coll.compact = Mock() + return mock_coll + + +@pytest.fixture +def sample_vectors() -> np.ndarray: + """Generate sample vectors for testing.""" + np.random.seed(42) + return np.random.randn(100, 128).astype(np.float32) + + +@pytest.fixture +def sample_config() -> Dict[str, Any]: + """Provide a sample configuration dictionary.""" + return { + "database": { + "host": "localhost", + "port": 19530, + "database": "default" + }, + 
"dataset": { + "collection_name": "test_vectors", + "num_vectors": 10000, + "dimension": 1536, + "distribution": "uniform", + "batch_size": 1000 + }, + "index": { + "index_type": "DISKANN", + "metric_type": "COSINE" + } + } + + +@pytest.fixture +def mock_time(): + """Mock time module for testing time-based operations.""" + with patch('time.time') as mock_time_func: + mock_time_func.side_effect = [0, 1, 2, 3, 4, 5] # Incremental time + yield mock_time_func + + +@pytest.fixture +def mock_multiprocessing(): + """Mock multiprocessing for testing parallel operations.""" + with patch('multiprocessing.Pool') as mock_pool: + mock_pool_instance = Mock() + mock_pool_instance.map = Mock(side_effect=lambda func, args: [func(arg) for arg in args]) + mock_pool_instance.close = Mock() + mock_pool_instance.join = Mock() + mock_pool.return_value.__enter__ = Mock(return_value=mock_pool_instance) + mock_pool.return_value.__exit__ = Mock(return_value=None) + yield mock_pool + + +@pytest.fixture +def benchmark_results(): + """Sample benchmark results for testing.""" + return { + "qps": 1250.5, + "latency_p50": 0.8, + "latency_p95": 1.2, + "latency_p99": 1.5, + "total_queries": 10000, + "runtime": 8.0, + "errors": 0 + } + + +@pytest.fixture(autouse=True) +def reset_milvus_connections(): + """Reset Milvus connections before each test.""" + connections.disconnect("default") + yield + connections.disconnect("default") + + +@pytest.fixture +def env_vars(): + """Set up environment variables for testing.""" + original_env = os.environ.copy() + + os.environ['VDB_BENCH_HOST'] = 'test_host' + os.environ['VDB_BENCH_PORT'] = '19530' + + yield os.environ + + os.environ.clear() + os.environ.update(original_env) diff --git a/vdb_benchmark/tests/tests/run_tests.py b/vdb_benchmark/tests/tests/run_tests.py new file mode 100755 index 00000000..a09766b8 --- /dev/null +++ b/vdb_benchmark/tests/tests/run_tests.py @@ -0,0 +1,346 @@ +#!/usr/bin/env python3 +""" +Comprehensive test runner for vdb-bench test 
suite +""" +import sys +import os +import argparse +import pytest +import coverage +from pathlib import Path +from typing import List, Optional +import json +import time +from datetime import datetime + + +class TestRunner: + """Main test runner for vdb-bench test suite.""" + + def __init__(self, test_dir: Path = None): + """Initialize test runner.""" + self.test_dir = test_dir or Path(__file__).parent + self.results = { + "start_time": None, + "end_time": None, + "duration": 0, + "total_tests": 0, + "passed": 0, + "failed": 0, + "skipped": 0, + "errors": 0, + "coverage": None + } + + def run_all_tests(self, verbose: bool = False, + coverage_enabled: bool = True) -> int: + """Run all tests with optional coverage.""" + print("=" * 60) + print("VDB-Bench Test Suite Runner") + print("=" * 60) + + self.results["start_time"] = datetime.now().isoformat() + start = time.time() + + # Setup coverage if enabled + cov = None + if coverage_enabled: + cov = coverage.Coverage() + cov.start() + print("Coverage tracking enabled") + + # Prepare pytest arguments + pytest_args = [ + str(self.test_dir), + "-v" if verbose else "-q", + "--tb=short", + "--color=yes", + f"--junitxml={self.test_dir}/test_results.xml", + f"--html={self.test_dir}/test_report.html", + "--self-contained-html" + ] + + # Run pytest + print(f"\nRunning tests from: {self.test_dir}") + print("-" * 60) + + exit_code = pytest.main(pytest_args) + + # Stop coverage and generate report + if cov: + cov.stop() + cov.save() + + # Generate coverage report + print("\n" + "=" * 60) + print("Coverage Report") + print("-" * 60) + + cov.report() + + # Save HTML coverage report + html_dir = self.test_dir / "coverage_html" + cov.html_report(directory=str(html_dir)) + print(f"\nHTML coverage report saved to: {html_dir}") + + # Get coverage percentage + self.results["coverage"] = cov.report(show_missing=False) + + # Update results + self.results["end_time"] = datetime.now().isoformat() + self.results["duration"] = time.time() - 
start + + # Parse test results + self._parse_test_results(exit_code) + + # Save results to JSON + self._save_results() + + # Print summary + self._print_summary() + + return exit_code + + def run_specific_tests(self, test_modules: List[str], + verbose: bool = False) -> int: + """Run specific test modules.""" + print("=" * 60) + print(f"Running specific tests: {', '.join(test_modules)}") + print("=" * 60) + + pytest_args = [] + for module in test_modules: + test_path = self.test_dir / f"{module}.py" + if test_path.exists(): + pytest_args.append(str(test_path)) + else: + print(f"Warning: Test module not found: {test_path}") + + if not pytest_args: + print("No valid test modules found!") + return 1 + + if verbose: + pytest_args.append("-v") + else: + pytest_args.append("-q") + + pytest_args.extend(["--tb=short", "--color=yes"]) + + return pytest.main(pytest_args) + + def run_by_category(self, category: str, verbose: bool = False) -> int: + """Run tests by category.""" + category_map = { + "config": ["test_config"], + "connection": ["test_database_connection"], + "loading": ["test_load_vdb", "test_vector_generation"], + "benchmark": ["test_simple_bench"], + "index": ["test_index_management"], + "monitoring": ["test_compact_and_watch"], + "all": None # Run all tests + } + + if category not in category_map: + print(f"Unknown category: {category}") + print(f"Available categories: {', '.join(category_map.keys())}") + return 1 + + if category == "all": + return self.run_all_tests(verbose=verbose) + + test_modules = category_map[category] + return self.run_specific_tests(test_modules, verbose=verbose) + + def run_performance_tests(self, verbose: bool = False) -> int: + """Run performance-related tests.""" + print("=" * 60) + print("Running Performance Tests") + print("=" * 60) + + pytest_args = [ + str(self.test_dir), + "-v" if verbose else "-q", + "-k", "performance or benchmark or throughput", + "--tb=short", + "--color=yes" + ] + + return pytest.main(pytest_args) + + def 
run_integration_tests(self, verbose: bool = False) -> int: + """Run integration tests.""" + print("=" * 60) + print("Running Integration Tests") + print("=" * 60) + + pytest_args = [ + str(self.test_dir), + "-v" if verbose else "-q", + "-m", "integration", + "--tb=short", + "--color=yes" + ] + + return pytest.main(pytest_args) + + def _parse_test_results(self, exit_code: int) -> None: + """Parse test results from pytest exit code.""" + # Basic result parsing based on exit code + if exit_code == 0: + self.results["status"] = "SUCCESS" + elif exit_code == 1: + self.results["status"] = "TESTS_FAILED" + elif exit_code == 2: + self.results["status"] = "INTERRUPTED" + elif exit_code == 3: + self.results["status"] = "INTERNAL_ERROR" + elif exit_code == 4: + self.results["status"] = "USAGE_ERROR" + elif exit_code == 5: + self.results["status"] = "NO_TESTS" + else: + self.results["status"] = "UNKNOWN_ERROR" + + # Try to parse XML results if available + xml_path = self.test_dir / "test_results.xml" + if xml_path.exists(): + try: + import xml.etree.ElementTree as ET + tree = ET.parse(xml_path) + root = tree.getroot() + + testsuite = root.find("testsuite") or root + self.results["total_tests"] = int(testsuite.get("tests", 0)) + self.results["failed"] = int(testsuite.get("failures", 0)) + self.results["errors"] = int(testsuite.get("errors", 0)) + self.results["skipped"] = int(testsuite.get("skipped", 0)) + self.results["passed"] = ( + self.results["total_tests"] - + self.results["failed"] - + self.results["errors"] - + self.results["skipped"] + ) + except Exception as e: + print(f"Warning: Could not parse XML results: {e}") + + def _save_results(self) -> None: + """Save test results to JSON file.""" + results_path = self.test_dir / "test_results.json" + + with open(results_path, 'w') as f: + json.dump(self.results, f, indent=2) + + print(f"\nTest results saved to: {results_path}") + + def _print_summary(self) -> None: + """Print test execution summary.""" + print("\n" + "=" * 
60) + print("Test Execution Summary") + print("=" * 60) + + print(f"Status: {self.results.get('status', 'UNKNOWN')}") + print(f"Duration: {self.results['duration']:.2f} seconds") + print(f"Total Tests: {self.results['total_tests']}") + print(f"Passed: {self.results['passed']}") + print(f"Failed: {self.results['failed']}") + print(f"Errors: {self.results['errors']}") + print(f"Skipped: {self.results['skipped']}") + + if self.results.get("coverage"): + print(f"Code Coverage: {self.results['coverage']:.1f}%") + + print("=" * 60) + + # Print pass rate + if self.results['total_tests'] > 0: + pass_rate = (self.results['passed'] / self.results['total_tests']) * 100 + print(f"Pass Rate: {pass_rate:.1f}%") + + if pass_rate == 100: + print("✅ All tests passed!") + elif pass_rate >= 90: + print("⚠️ Most tests passed, but some failures detected.") + else: + print("❌ Significant test failures detected.") + + print("=" * 60) + + +def main(): + """Main entry point for test runner.""" + parser = argparse.ArgumentParser( + description="VDB-Bench Test Suite Runner", + formatter_class=argparse.RawDescriptionHelpFormatter + ) + + parser.add_argument( + "--category", "-c", + choices=["all", "config", "connection", "loading", + "benchmark", "index", "monitoring"], + default="all", + help="Test category to run" + ) + + parser.add_argument( + "--modules", "-m", + nargs="+", + help="Specific test modules to run" + ) + + parser.add_argument( + "--performance", "-p", + action="store_true", + help="Run performance tests only" + ) + + parser.add_argument( + "--integration", "-i", + action="store_true", + help="Run integration tests only" + ) + + parser.add_argument( + "--verbose", "-v", + action="store_true", + help="Verbose output" + ) + + parser.add_argument( + "--no-coverage", + action="store_true", + help="Disable coverage tracking" + ) + + parser.add_argument( + "--test-dir", + type=Path, + default=Path(__file__).parent, + help="Test directory path" + ) + + args = parser.parse_args() + + 
# Create test runner + runner = TestRunner(test_dir=args.test_dir) + + # Determine which tests to run + if args.modules: + exit_code = runner.run_specific_tests(args.modules, verbose=args.verbose) + elif args.performance: + exit_code = runner.run_performance_tests(verbose=args.verbose) + elif args.integration: + exit_code = runner.run_integration_tests(verbose=args.verbose) + elif args.category != "all": + exit_code = runner.run_by_category(args.category, verbose=args.verbose) + else: + exit_code = runner.run_all_tests( + verbose=args.verbose, + coverage_enabled=not args.no_coverage + ) + + sys.exit(exit_code) + + +if __name__ == "__main__": + main() diff --git a/vdb_benchmark/tests/tests/test_compact_and_watch.py b/vdb_benchmark/tests/tests/test_compact_and_watch.py new file mode 100755 index 00000000..fbc886f3 --- /dev/null +++ b/vdb_benchmark/tests/tests/test_compact_and_watch.py @@ -0,0 +1,701 @@ +""" +Unit tests for compaction and monitoring functionality in vdb-bench +""" +import pytest +import time +from unittest.mock import Mock, MagicMock, patch, call +import threading +from typing import Dict, Any, List +import json +from datetime import datetime, timedelta + + +class TestCompactionOperations: + """Test database compaction operations.""" + + def test_manual_compaction_trigger(self, mock_collection): + """Test manually triggering compaction.""" + mock_collection.compact.return_value = 1234 # Compaction ID + + def trigger_compaction(collection): + """Trigger manual compaction.""" + try: + compaction_id = collection.compact() + return { + "success": True, + "compaction_id": compaction_id, + "timestamp": time.time() + } + except Exception as e: + return { + "success": False, + "error": str(e) + } + + result = trigger_compaction(mock_collection) + + assert result["success"] is True + assert result["compaction_id"] == 1234 + assert "timestamp" in result + mock_collection.compact.assert_called_once() + + def test_compaction_state_monitoring(self, 
mock_collection): + """Test monitoring compaction state.""" + # Mock compaction state progression + states = ["Executing", "Executing", "Completed"] + state_iter = iter(states) + + def get_compaction_state(compaction_id): + try: + return next(state_iter) + except StopIteration: + return "Completed" + + mock_collection.get_compaction_state = Mock(side_effect=get_compaction_state) + + def monitor_compaction(collection, compaction_id, timeout=60): + """Monitor compaction until completion.""" + start_time = time.time() + states = [] + + while time.time() - start_time < timeout: + state = collection.get_compaction_state(compaction_id) + states.append({ + "state": state, + "timestamp": time.time() - start_time + }) + + if state == "Completed": + return { + "success": True, + "duration": time.time() - start_time, + "states": states + } + elif state == "Failed": + return { + "success": False, + "error": "Compaction failed", + "states": states + } + + time.sleep(0.1) # Check interval + + return { + "success": False, + "error": "Compaction timeout", + "states": states + } + + with patch('time.sleep'): # Speed up test + result = monitor_compaction(mock_collection, 1234) + + assert result["success"] is True + assert len(result["states"]) == 3 + assert result["states"][-1]["state"] == "Completed" + + def test_automatic_compaction_scheduling(self): + """Test automatic compaction scheduling based on conditions.""" + class CompactionScheduler: + def __init__(self, collection): + self.collection = collection + self.last_compaction = None + self.compaction_history = [] + + def should_compact(self, num_segments, deleted_ratio, time_since_last): + """Determine if compaction should be triggered.""" + # Compact if: + # - More than 10 segments + # - Deleted ratio > 20% + # - More than 1 hour since last compaction + + if num_segments > 10: + return True, "Too many segments" + + if deleted_ratio > 0.2: + return True, "High deletion ratio" + + if self.last_compaction and time_since_last > 
3600: + return True, "Time-based compaction" + + return False, None + + def check_and_compact(self): + """Check conditions and trigger compaction if needed.""" + # Get collection stats (mocked here) + stats = { + "num_segments": 12, + "deleted_ratio": 0.15, + "last_compaction": self.last_compaction + } + + time_since_last = ( + time.time() - self.last_compaction + if self.last_compaction else float('inf') + ) + + should_compact, reason = self.should_compact( + stats["num_segments"], + stats["deleted_ratio"], + time_since_last + ) + + if should_compact: + compaction_id = self.collection.compact() + self.last_compaction = time.time() + self.compaction_history.append({ + "id": compaction_id, + "reason": reason, + "timestamp": self.last_compaction + }) + return True, reason + + return False, None + + mock_collection = Mock() + mock_collection.compact.return_value = 5678 + + scheduler = CompactionScheduler(mock_collection) + + # Should trigger compaction (too many segments) + compacted, reason = scheduler.check_and_compact() + + assert compacted is True + assert reason == "Too many segments" + assert len(scheduler.compaction_history) == 1 + mock_collection.compact.assert_called_once() + + def test_compaction_with_resource_monitoring(self): + """Test compaction with system resource monitoring.""" + import psutil + + class ResourceAwareCompaction: + def __init__(self, collection): + self.collection = collection + self.resource_thresholds = { + "cpu_percent": 80, + "memory_percent": 85, + "disk_io_rate": 100 # MB/s + } + + def check_resources(self): + """Check if system resources allow compaction.""" + cpu_percent = psutil.cpu_percent(interval=1) + memory_percent = psutil.virtual_memory().percent + + # Mock disk I/O rate + disk_io_rate = 50 # MB/s + + return { + "cpu_ok": cpu_percent < self.resource_thresholds["cpu_percent"], + "memory_ok": memory_percent < self.resource_thresholds["memory_percent"], + "disk_ok": disk_io_rate < self.resource_thresholds["disk_io_rate"], + 
"cpu_percent": cpu_percent, + "memory_percent": memory_percent, + "disk_io_rate": disk_io_rate + } + + def compact_with_resource_check(self): + """Perform compaction only if resources are available.""" + resource_status = self.check_resources() + + if all([resource_status["cpu_ok"], + resource_status["memory_ok"], + resource_status["disk_ok"]]): + + compaction_id = self.collection.compact() + return { + "success": True, + "compaction_id": compaction_id, + "resource_status": resource_status + } + else: + return { + "success": False, + "reason": "Resource constraints", + "resource_status": resource_status + } + + with patch('psutil.cpu_percent', return_value=50): + with patch('psutil.virtual_memory') as mock_memory: + mock_memory.return_value = Mock(percent=60) + + mock_collection = Mock() + mock_collection.compact.return_value = 9999 + + compactor = ResourceAwareCompaction(mock_collection) + result = compactor.compact_with_resource_check() + + assert result["success"] is True + assert result["compaction_id"] == 9999 + assert result["resource_status"]["cpu_ok"] is True + + +class TestMonitoring: + """Test monitoring functionality.""" + + def test_collection_stats_monitoring(self, mock_collection): + """Test monitoring collection statistics.""" + mock_collection.num_entities = 1000000 + + # Mock getting collection stats + def get_stats(): + return { + "num_entities": mock_collection.num_entities, + "num_segments": 10, + "index_building_progress": 95 + } + + mock_collection.get_stats = get_stats + + class StatsMonitor: + def __init__(self, collection): + self.collection = collection + self.stats_history = [] + + def collect_stats(self): + """Collect current statistics.""" + stats = self.collection.get_stats() + stats["timestamp"] = time.time() + self.stats_history.append(stats) + return stats + + def get_trends(self, window_size=10): + """Calculate trends from recent stats.""" + if len(self.stats_history) < 2: + return None + + recent = 
self.stats_history[-window_size:] + + # Calculate entity growth rate + if len(recent) >= 2: + time_diff = recent[-1]["timestamp"] - recent[0]["timestamp"] + entity_diff = recent[-1]["num_entities"] - recent[0]["num_entities"] + + growth_rate = entity_diff / time_diff if time_diff > 0 else 0 + + return { + "entity_growth_rate": growth_rate, + "avg_segments": sum(s["num_segments"] for s in recent) / len(recent), + "current_entities": recent[-1]["num_entities"] + } + + return None + + monitor = StatsMonitor(mock_collection) + + # Collect stats over time + for i in range(5): + mock_collection.num_entities += 10000 + stats = monitor.collect_stats() + time.sleep(0.01) # Small delay + + trends = monitor.get_trends() + + assert trends is not None + assert trends["current_entities"] == 1050000 # 1000000 + (5 * 10000) + assert len(monitor.stats_history) == 5 + + def test_periodic_monitoring(self): + """Test periodic monitoring with configurable intervals.""" + class PeriodicMonitor: + def __init__(self, collection, interval=5): + self.collection = collection + self.interval = interval + self.running = False + self.thread = None + self.data = [] + + def monitor_function(self): + """Function to run periodically.""" + stats = { + "timestamp": time.time(), + "num_entities": self.collection.num_entities, + "status": "healthy" + } + self.data.append(stats) + return stats + + def start(self): + """Start periodic monitoring.""" + self.running = True + + def run(): + while self.running: + self.monitor_function() + time.sleep(self.interval) + + self.thread = threading.Thread(target=run) + self.thread.daemon = True + self.thread.start() + + def stop(self): + """Stop periodic monitoring.""" + self.running = False + if self.thread: + self.thread.join(timeout=1) + + def get_latest(self, n=5): + """Get latest n monitoring results.""" + return self.data[-n:] if self.data else [] + + mock_collection = Mock() + mock_collection.num_entities = 1000000 + + monitor = 
PeriodicMonitor(mock_collection, interval=0.01) # Fast interval for testing + + monitor.start() + time.sleep(0.05) # Let it collect some data + monitor.stop() + + latest = monitor.get_latest() + + assert len(latest) > 0 + assert all("timestamp" in item for item in latest) + + def test_alert_system(self): + """Test alert system for monitoring thresholds.""" + class AlertSystem: + def __init__(self): + self.alerts = [] + self.thresholds = { + "high_latency": 100, # ms + "low_qps": 50, + "high_error_rate": 0.05, + "segment_count": 20 + } + self.alert_callbacks = [] + + def check_metric(self, metric_name, value): + """Check if metric exceeds threshold.""" + if metric_name == "latency" and value > self.thresholds["high_latency"]: + self.trigger_alert("HIGH_LATENCY", f"Latency {value}ms exceeds threshold") + + elif metric_name == "qps" and value < self.thresholds["low_qps"]: + self.trigger_alert("LOW_QPS", f"QPS {value} below threshold") + + elif metric_name == "error_rate" and value > self.thresholds["high_error_rate"]: + self.trigger_alert("HIGH_ERROR_RATE", f"Error rate {value:.2%} exceeds threshold") + + elif metric_name == "segments" and value > self.thresholds["segment_count"]: + self.trigger_alert("TOO_MANY_SEGMENTS", f"Segment count {value} exceeds threshold") + + def trigger_alert(self, alert_type, message): + """Trigger an alert.""" + alert = { + "type": alert_type, + "message": message, + "timestamp": time.time(), + "resolved": False + } + + self.alerts.append(alert) + + # Call registered callbacks + for callback in self.alert_callbacks: + callback(alert) + + return alert + + def resolve_alert(self, alert_type): + """Mark alerts of given type as resolved.""" + for alert in self.alerts: + if alert["type"] == alert_type and not alert["resolved"]: + alert["resolved"] = True + alert["resolved_time"] = time.time() + + def register_callback(self, callback): + """Register callback for alerts.""" + self.alert_callbacks.append(callback) + + def get_active_alerts(self): 
+ """Get list of active (unresolved) alerts.""" + return [a for a in self.alerts if not a["resolved"]] + + alert_system = AlertSystem() + + # Register a callback + received_alerts = [] + alert_system.register_callback(lambda alert: received_alerts.append(alert)) + + # Test various metrics + alert_system.check_metric("latency", 150) # Should trigger + alert_system.check_metric("qps", 100) # Should not trigger + alert_system.check_metric("error_rate", 0.1) # Should trigger + alert_system.check_metric("segments", 25) # Should trigger + + active = alert_system.get_active_alerts() + + assert len(active) == 3 + assert len(received_alerts) == 3 + assert any(a["type"] == "HIGH_LATENCY" for a in active) + + # Resolve an alert + alert_system.resolve_alert("HIGH_LATENCY") + active = alert_system.get_active_alerts() + + assert len(active) == 2 + + def test_monitoring_data_aggregation(self): + """Test aggregating monitoring data over time windows.""" + class DataAggregator: + def __init__(self): + self.raw_data = [] + + def add_data_point(self, timestamp, metrics): + """Add a data point.""" + self.raw_data.append({ + "timestamp": timestamp, + **metrics + }) + + def aggregate_window(self, start_time, end_time, aggregation="avg"): + """Aggregate data within a time window.""" + window_data = [ + d for d in self.raw_data + if start_time <= d["timestamp"] <= end_time + ] + + if not window_data: + return None + + if aggregation == "avg": + return self._average_aggregation(window_data) + elif aggregation == "max": + return self._max_aggregation(window_data) + elif aggregation == "min": + return self._min_aggregation(window_data) + else: + return window_data + + def _average_aggregation(self, data): + """Calculate average of metrics.""" + result = {"count": len(data)} + + # Get all metric keys (excluding timestamp) + metric_keys = [k for k in data[0].keys() if k != "timestamp"] + + for key in metric_keys: + values = [d[key] for d in data if key in d] + result[f"{key}_avg"] = 
sum(values) / len(values) if values else 0 + + return result + + def _max_aggregation(self, data): + """Get maximum values of metrics.""" + result = {"count": len(data)} + + metric_keys = [k for k in data[0].keys() if k != "timestamp"] + + for key in metric_keys: + values = [d[key] for d in data if key in d] + result[f"{key}_max"] = max(values) if values else 0 + + return result + + def _min_aggregation(self, data): + """Get minimum values of metrics.""" + result = {"count": len(data)} + + metric_keys = [k for k in data[0].keys() if k != "timestamp"] + + for key in metric_keys: + values = [d[key] for d in data if key in d] + result[f"{key}_min"] = min(values) if values else 0 + + return result + + def create_time_series(self, metric_name, interval=60): + """Create time series data for a specific metric.""" + if not self.raw_data: + return [] + + min_time = min(d["timestamp"] for d in self.raw_data) + max_time = max(d["timestamp"] for d in self.raw_data) + + time_series = [] + current_time = min_time + + while current_time <= max_time: + window_end = current_time + interval + window_data = [ + d for d in self.raw_data + if current_time <= d["timestamp"] < window_end + and metric_name in d + ] + + if window_data: + avg_value = sum(d[metric_name] for d in window_data) / len(window_data) + time_series.append({ + "timestamp": current_time, + "value": avg_value + }) + + current_time = window_end + + return time_series + + aggregator = DataAggregator() + + # Add sample data points + base_time = time.time() + for i in range(100): + aggregator.add_data_point( + base_time + i, + { + "qps": 100 + i % 20, + "latency": 10 + i % 5, + "error_count": i % 3 + } + ) + + # Test aggregation + avg_metrics = aggregator.aggregate_window(base_time, base_time + 50, "avg") + assert avg_metrics is not None + assert "qps_avg" in avg_metrics + assert avg_metrics["count"] == 51 + + # Test time series creation + time_series = aggregator.create_time_series("qps", interval=10) + assert 
len(time_series) > 0 + assert all("timestamp" in point and "value" in point for point in time_series) + + +class TestWatchOperations: + """Test watch operations for monitoring database state.""" + + def test_index_building_watch(self, mock_collection): + """Test watching index building progress.""" + progress_values = [0, 25, 50, 75, 100] + progress_iter = iter(progress_values) + + def get_index_progress(): + try: + return next(progress_iter) + except StopIteration: + return 100 + + mock_collection.index.get_build_progress = Mock(side_effect=get_index_progress) + + class IndexWatcher: + def __init__(self, collection): + self.collection = collection + self.progress_history = [] + + def watch_build(self, check_interval=1): + """Watch index building until completion.""" + while True: + progress = self.collection.index.get_build_progress() + self.progress_history.append({ + "progress": progress, + "timestamp": time.time() + }) + + if progress >= 100: + return { + "completed": True, + "final_progress": progress, + "history": self.progress_history + } + + time.sleep(check_interval) + + mock_collection.index = Mock() + mock_collection.index.get_build_progress = Mock(side_effect=get_index_progress) + + watcher = IndexWatcher(mock_collection) + + with patch('time.sleep'): # Speed up test + result = watcher.watch_build() + + assert result["completed"] is True + assert result["final_progress"] == 100 + assert len(result["history"]) == 5 + + def test_segment_merge_watch(self): + """Test watching segment merge operations.""" + class SegmentMergeWatcher: + def __init__(self): + self.merge_operations = [] + self.active_merges = {} + + def start_merge(self, segments): + """Start watching a segment merge.""" + merge_id = f"merge_{len(self.merge_operations)}" + + merge_op = { + "id": merge_id, + "segments": segments, + "start_time": time.time(), + "status": "running", + "progress": 0 + } + + self.merge_operations.append(merge_op) + self.active_merges[merge_id] = merge_op + + return 
merge_id + + def update_progress(self, merge_id, progress): + """Update merge progress.""" + if merge_id in self.active_merges: + self.active_merges[merge_id]["progress"] = progress + + if progress >= 100: + self.complete_merge(merge_id) + + def complete_merge(self, merge_id): + """Mark merge as completed.""" + if merge_id in self.active_merges: + merge_op = self.active_merges[merge_id] + merge_op["status"] = "completed" + merge_op["end_time"] = time.time() + merge_op["duration"] = merge_op["end_time"] - merge_op["start_time"] + + del self.active_merges[merge_id] + + return merge_op + + return None + + def get_active_merges(self): + """Get list of active merge operations.""" + return list(self.active_merges.values()) + + def get_merge_stats(self): + """Get statistics about merge operations.""" + completed = [m for m in self.merge_operations if m["status"] == "completed"] + + if not completed: + return None + + durations = [m["duration"] for m in completed] + + return { + "total_merges": len(self.merge_operations), + "completed_merges": len(completed), + "active_merges": len(self.active_merges), + "avg_duration": sum(durations) / len(durations) if durations else 0, + "min_duration": min(durations) if durations else 0, + "max_duration": max(durations) if durations else 0 + } + + watcher = SegmentMergeWatcher() + + # Start multiple merges + merge1 = watcher.start_merge(["seg1", "seg2"]) + merge2 = watcher.start_merge(["seg3", "seg4"]) + + assert len(watcher.get_active_merges()) == 2 + + # Update progress + watcher.update_progress(merge1, 50) + watcher.update_progress(merge2, 100) # Complete this one + + assert len(watcher.get_active_merges()) == 1 + + # Complete remaining merge + watcher.update_progress(merge1, 100) + + stats = watcher.get_merge_stats() + assert stats["completed_merges"] == 2 + assert stats["active_merges"] == 0 diff --git a/vdb_benchmark/tests/tests/test_config.py b/vdb_benchmark/tests/tests/test_config.py new file mode 100755 index 
00000000..725976ae --- /dev/null +++ b/vdb_benchmark/tests/tests/test_config.py @@ -0,0 +1,359 @@ +""" +Unit tests for configuration management in vdb-bench +""" +import pytest +import yaml +from pathlib import Path +from typing import Dict, Any +import os +from unittest.mock import patch, mock_open, MagicMock + + +class TestConfigurationLoader: + """Test configuration loading and validation.""" + + def test_load_valid_config(self, temp_config_file): + """Test loading a valid configuration file.""" + # Mock the config loading function + with open(temp_config_file, 'r') as f: + config = yaml.safe_load(f) + + assert config is not None + assert 'database' in config + assert 'dataset' in config + assert 'index' in config + assert config['database']['host'] == '127.0.0.1' + assert config['dataset']['num_vectors'] == 1000 + + def test_load_missing_config_file(self): + """Test handling of missing configuration file.""" + non_existent_file = Path("/tmp/non_existent_config.yaml") + + with pytest.raises(FileNotFoundError): + with open(non_existent_file, 'r') as f: + yaml.safe_load(f) + + def test_load_invalid_yaml(self, test_data_dir): + """Test handling of invalid YAML syntax.""" + invalid_yaml_path = test_data_dir / "invalid.yaml" + + with open(invalid_yaml_path, 'w') as f: + f.write("invalid: yaml: content: [") + + with pytest.raises(yaml.YAMLError): + with open(invalid_yaml_path, 'r') as f: + yaml.safe_load(f) + + def test_config_validation_missing_required_fields(self): + """Test validation when required configuration fields are missing.""" + incomplete_config = { + "database": { + "host": "localhost" + # Missing port and other required fields + } + } + + # Mock validation function + def validate_config(config): + required_fields = ['port', 'database'] + for field in required_fields: + if field not in config.get('database', {}): + raise ValueError(f"Missing required field: database.{field}") + + with pytest.raises(ValueError, match="Missing required field"): + 
validate_config(incomplete_config) + + def test_config_validation_invalid_values(self): + """Test validation of configuration values.""" + invalid_config = { + "database": { + "host": "localhost", + "port": -1, # Invalid port + "database": "milvus" + }, + "dataset": { + "num_vectors": -100, # Invalid negative value + "dimension": 0, # Invalid dimension + "batch_size": 0 # Invalid batch size + } + } + + def validate_config_values(config): + if config['database']['port'] < 1 or config['database']['port'] > 65535: + raise ValueError("Invalid port number") + if config['dataset']['num_vectors'] <= 0: + raise ValueError("Number of vectors must be positive") + if config['dataset']['dimension'] <= 0: + raise ValueError("Vector dimension must be positive") + if config['dataset']['batch_size'] <= 0: + raise ValueError("Batch size must be positive") + + with pytest.raises(ValueError): + validate_config_values(invalid_config) + + def test_config_merge_with_defaults(self): + """Test merging user configuration with defaults.""" + default_config = { + "database": { + "host": "localhost", + "port": 19530, + "timeout": 30 + }, + "dataset": { + "batch_size": 1000, + "distribution": "uniform" + } + } + + user_config = { + "database": { + "host": "remote-host", + "port": 8080 + }, + "dataset": { + "batch_size": 500 + } + } + + def merge_configs(default, user): + """Deep merge user config into default config.""" + merged = default.copy() + for key, value in user.items(): + if key in merged and isinstance(merged[key], dict) and isinstance(value, dict): + merged[key] = merge_configs(merged[key], value) + else: + merged[key] = value + return merged + + merged = merge_configs(default_config, user_config) + + assert merged['database']['host'] == 'remote-host' + assert merged['database']['port'] == 8080 + assert merged['database']['timeout'] == 30 # From default + assert merged['dataset']['batch_size'] == 500 + assert merged['dataset']['distribution'] == 'uniform' # From default + + def 
test_config_environment_variable_override(self, sample_config): + """Test overriding configuration with environment variables.""" + import copy + + os.environ['VDB_BENCH_DATABASE_HOST'] = 'env-host' + os.environ['VDB_BENCH_DATABASE_PORT'] = '9999' + os.environ['VDB_BENCH_DATASET_NUM_VECTORS'] = '5000' + + def apply_env_overrides(config): + """Apply environment variable overrides to configuration.""" + # Make a deep copy to avoid modifying original + result = copy.deepcopy(config) + env_prefix = 'VDB_BENCH_' + + for key, value in os.environ.items(): + if key.startswith(env_prefix): + # Parse the environment variable name + parts = key[len(env_prefix):].lower().split('_') + + # Special handling for num_vectors (DATASET_NUM_VECTORS) + if len(parts) >= 2 and parts[0] == 'dataset' and parts[1] == 'num' and len(parts) == 3 and parts[2] == 'vectors': + if 'dataset' not in result: + result['dataset'] = {} + result['dataset']['num_vectors'] = int(value) + else: + # Navigate to the config section for other keys + current = result + for part in parts[:-1]: + if part not in current: + current[part] = {} + current = current[part] + + # Set the value (with type conversion) + final_key = parts[-1] + if value.isdigit(): + current[final_key] = int(value) + else: + current[final_key] = value + + return result + + config = apply_env_overrides(sample_config) + + assert config['database']['host'] == 'env-host' + assert config['database']['port'] == 9999 + assert config['dataset']['num_vectors'] == 5000 + + # Clean up environment variables + del os.environ['VDB_BENCH_DATABASE_HOST'] + del os.environ['VDB_BENCH_DATABASE_PORT'] + del os.environ['VDB_BENCH_DATASET_NUM_VECTORS'] + + def test_config_save(self, test_data_dir): + """Test saving configuration to file.""" + config = { + "database": {"host": "localhost", "port": 19530}, + "dataset": {"collection_name": "test", "dimension": 128} + } + + save_path = test_data_dir / "saved_config.yaml" + + with open(save_path, 'w') as f: + 
yaml.dump(config, f) + + # Verify saved file + with open(save_path, 'r') as f: + loaded_config = yaml.safe_load(f) + + assert loaded_config == config + + def test_config_schema_validation(self): + """Test configuration schema validation.""" + schema = { + "database": { + "type": "dict", + "required": ["host", "port"], + "properties": { + "host": {"type": "string"}, + "port": {"type": "integer", "min": 1, "max": 65535} + } + }, + "dataset": { + "type": "dict", + "required": ["dimension"], + "properties": { + "dimension": {"type": "integer", "min": 1} + } + } + } + + def validate_against_schema(config, schema): + """Basic schema validation.""" + for key, rules in schema.items(): + if rules.get("type") == "dict": + if key not in config: + if "required" in rules: + raise ValueError(f"Missing required section: {key}") + continue + + if "required" in rules: + for req_field in rules["required"]: + if req_field not in config[key]: + raise ValueError(f"Missing required field: {key}.{req_field}") + + if "properties" in rules: + for prop, prop_rules in rules["properties"].items(): + if prop in config[key]: + value = config[key][prop] + if "type" in prop_rules: + if prop_rules["type"] == "integer" and not isinstance(value, int): + raise TypeError(f"{key}.{prop} must be an integer") + if prop_rules["type"] == "string" and not isinstance(value, str): + raise TypeError(f"{key}.{prop} must be a string") + + if "min" in prop_rules and value < prop_rules["min"]: + raise ValueError(f"{key}.{prop} must be >= {prop_rules['min']}") + if "max" in prop_rules and value > prop_rules["max"]: + raise ValueError(f"{key}.{prop} must be <= {prop_rules['max']}") + + # Valid config + valid_config = { + "database": {"host": "localhost", "port": 19530}, + "dataset": {"dimension": 128} + } + + validate_against_schema(valid_config, schema) # Should not raise + + # Invalid config (missing required field) + invalid_config = { + "database": {"host": "localhost"}, # Missing port + "dataset": {"dimension": 
128} + } + + with pytest.raises(ValueError, match="Missing required field"): + validate_against_schema(invalid_config, schema) + + +class TestIndexConfiguration: + """Test index-specific configuration handling.""" + + def test_diskann_config_validation(self): + """Test DiskANN index configuration validation.""" + valid_diskann_config = { + "index_type": "DISKANN", + "metric_type": "COSINE", + "max_degree": 64, + "search_list_size": 200, + "pq_code_budget_gb": 0.1, + "build_algo": "IVF_PQ" + } + + def validate_diskann_config(config): + assert config["index_type"] == "DISKANN" + assert config["metric_type"] in ["L2", "IP", "COSINE"] + assert 1 <= config["max_degree"] <= 128 + assert 100 <= config["search_list_size"] <= 1000 + if "pq_code_budget_gb" in config: + assert config["pq_code_budget_gb"] > 0 + + validate_diskann_config(valid_diskann_config) + + # Invalid max_degree + invalid_config = valid_diskann_config.copy() + invalid_config["max_degree"] = 200 + + with pytest.raises(AssertionError): + validate_diskann_config(invalid_config) + + def test_hnsw_config_validation(self): + """Test HNSW index configuration validation.""" + valid_hnsw_config = { + "index_type": "HNSW", + "metric_type": "L2", + "M": 16, + "efConstruction": 200 + } + + def validate_hnsw_config(config): + assert config["index_type"] == "HNSW" + assert config["metric_type"] in ["L2", "IP", "COSINE"] + assert 4 <= config["M"] <= 64 + assert 8 <= config["efConstruction"] <= 512 + + validate_hnsw_config(valid_hnsw_config) + + # Invalid M value + invalid_config = valid_hnsw_config.copy() + invalid_config["M"] = 100 + + with pytest.raises(AssertionError): + validate_hnsw_config(invalid_config) + + def test_auto_index_config_selection(self): + """Test automatic index configuration based on dataset size.""" + def select_index_config(num_vectors, dimension): + if num_vectors < 100000: + return { + "index_type": "IVF_FLAT", + "nlist": 128 + } + elif num_vectors < 1000000: + return { + "index_type": "HNSW", + 
"M": 16, + "efConstruction": 200 + } + else: + return { + "index_type": "DISKANN", + "max_degree": 64, + "search_list_size": 200 + } + + # Small dataset + config = select_index_config(50000, 128) + assert config["index_type"] == "IVF_FLAT" + + # Medium dataset + config = select_index_config(500000, 256) + assert config["index_type"] == "HNSW" + + # Large dataset + config = select_index_config(10000000, 1536) + assert config["index_type"] == "DISKANN" diff --git a/vdb_benchmark/tests/tests/test_database_connection.py b/vdb_benchmark/tests/tests/test_database_connection.py new file mode 100755 index 00000000..538c5886 --- /dev/null +++ b/vdb_benchmark/tests/tests/test_database_connection.py @@ -0,0 +1,538 @@ +""" +Unit tests for Milvus database connection management +""" +import pytest +from unittest.mock import Mock, MagicMock, patch, call +import time +from typing import Dict, Any + + +class TestDatabaseConnection: + """Test database connection management.""" + + @patch('pymilvus.connections.connect') + def test_successful_connection(self, mock_connect): + """Test successful connection to Milvus.""" + mock_connect.return_value = True + + def connect_to_milvus(host="localhost", port=19530, **kwargs): + from pymilvus import connections + return connections.connect( + alias="default", + host=host, + port=port, + **kwargs + ) + + result = connect_to_milvus("localhost", 19530) + assert result is True + mock_connect.assert_called_once_with( + alias="default", + host="localhost", + port=19530 + ) + + @patch('pymilvus.connections.connect') + def test_connection_with_timeout(self, mock_connect): + """Test connection with custom timeout.""" + mock_connect.return_value = True + + def connect_with_timeout(host, port, timeout=30): + from pymilvus import connections + return connections.connect( + alias="default", + host=host, + port=port, + timeout=timeout + ) + + connect_with_timeout("localhost", 19530, timeout=60) + mock_connect.assert_called_with( + alias="default", + 
host="localhost", + port=19530, + timeout=60 + ) + + @patch('pymilvus.connections.connect') + def test_connection_failure(self, mock_connect): + """Test handling of connection failures.""" + mock_connect.side_effect = Exception("Connection refused") + + def connect_to_milvus(host, port): + from pymilvus import connections + try: + return connections.connect(alias="default", host=host, port=port) + except Exception as e: + return f"Failed to connect: {e}" + + result = connect_to_milvus("localhost", 19530) + assert "Failed to connect" in result + assert "Connection refused" in result + + @patch('pymilvus.connections.connect') + def test_connection_retry_logic(self, mock_connect): + """Test connection retry mechanism.""" + # Fail twice, then succeed + mock_connect.side_effect = [ + Exception("Connection failed"), + Exception("Connection failed"), + True + ] + + def connect_with_retry(host, port, max_retries=3, retry_delay=1): + from pymilvus import connections + + for attempt in range(max_retries): + try: + return connections.connect( + alias="default", + host=host, + port=port + ) + except Exception as e: + if attempt == max_retries - 1: + raise + time.sleep(retry_delay) + + return False + + with patch('time.sleep'): # Mock sleep to speed up test + result = connect_with_retry("localhost", 19530) + assert result is True + assert mock_connect.call_count == 3 + + @patch('pymilvus.connections.list_connections') + def test_list_connections(self, mock_list): + """Test listing active connections.""" + mock_list.return_value = [ + ("default", {"host": "localhost", "port": 19530}), + ("secondary", {"host": "remote", "port": 8080}) + ] + + def get_active_connections(): + from pymilvus import connections + return connections.list_connections() + + connections_list = get_active_connections() + assert len(connections_list) == 2 + assert connections_list[0][0] == "default" + assert connections_list[1][1]["host"] == "remote" + + @patch('pymilvus.connections.disconnect') + def 
test_disconnect(self, mock_disconnect): + """Test disconnecting from Milvus.""" + mock_disconnect.return_value = None + + def disconnect_from_milvus(alias="default"): + from pymilvus import connections + connections.disconnect(alias) + return True + + result = disconnect_from_milvus() + assert result is True + mock_disconnect.assert_called_once_with("default") + + @patch('pymilvus.connections.connect') + def test_connection_pool(self, mock_connect): + """Test connection pooling behavior.""" + mock_connect.return_value = True + + class ConnectionPool: + def __init__(self, max_connections=5): + self.max_connections = max_connections + self.connections = [] + self.available = [] + + def get_connection(self): + if self.available: + return self.available.pop() + elif len(self.connections) < self.max_connections: + from pymilvus import connections + conn = connections.connect( + alias=f"conn_{len(self.connections)}", + host="localhost", + port=19530 + ) + self.connections.append(conn) + return conn + else: + raise Exception("Connection pool exhausted") + + def return_connection(self, conn): + self.available.append(conn) + + def close_all(self): + for conn in self.connections: + # In real code, would disconnect each connection + pass + self.connections.clear() + self.available.clear() + + pool = ConnectionPool(max_connections=3) + + # Get connections + conn1 = pool.get_connection() + conn2 = pool.get_connection() + conn3 = pool.get_connection() + + # Pool should be exhausted + with pytest.raises(Exception, match="Connection pool exhausted"): + pool.get_connection() + + # Return a connection + pool.return_connection(conn1) + + # Should be able to get a connection now + conn4 = pool.get_connection() + assert conn4 == conn1 # Should reuse the returned connection + + @patch('pymilvus.connections.connect') + def test_connection_with_authentication(self, mock_connect): + """Test connection with authentication credentials.""" + mock_connect.return_value = True + + def 
connect_with_auth(host, port, user, password): + from pymilvus import connections + return connections.connect( + alias="default", + host=host, + port=port, + user=user, + password=password + ) + + connect_with_auth("localhost", 19530, "admin", "password123") + + mock_connect.assert_called_with( + alias="default", + host="localhost", + port=19530, + user="admin", + password="password123" + ) + + @patch('pymilvus.connections.connect') + def test_connection_health_check(self, mock_connect): + """Test connection health check mechanism.""" + mock_connect.return_value = True + + class MilvusConnection: + def __init__(self, host, port): + self.host = host + self.port = port + self.connected = False + self.last_health_check = 0 + + def connect(self): + from pymilvus import connections + try: + connections.connect( + alias="health_check", + host=self.host, + port=self.port + ) + self.connected = True + return True + except: + self.connected = False + return False + + def health_check(self): + """Perform a health check on the connection.""" + current_time = time.time() + + # Only check every 30 seconds + if current_time - self.last_health_check < 30: + return self.connected + + self.last_health_check = current_time + + # Try a simple operation to verify connection + try: + # In real code, would perform a lightweight operation + # like checking server status + return self.connected + except: + self.connected = False + return False + + def ensure_connected(self): + """Ensure connection is active, reconnect if needed.""" + if not self.health_check(): + return self.connect() + return True + + conn = MilvusConnection("localhost", 19530) + assert conn.connect() is True + assert conn.health_check() is True + assert conn.ensure_connected() is True + + +class TestCollectionManagement: + """Test Milvus collection management operations.""" + + @patch('pymilvus.Collection') + def test_create_collection(self, mock_collection_class): + """Test creating a new collection.""" + 
mock_collection = Mock() + mock_collection_class.return_value = mock_collection + + def create_collection(name, dimension, metric_type="L2"): + from pymilvus import Collection, FieldSchema, CollectionSchema, DataType + + # Define schema + fields = [ + FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True), + FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=dimension) + ] + schema = CollectionSchema(fields, description=f"Collection {name}") + + # Create collection + collection = Collection(name=name, schema=schema) + return collection + + coll = create_collection("test_collection", 128) + assert coll is not None + mock_collection_class.assert_called_once() + + @patch('pymilvus.utility.has_collection') + def test_check_collection_exists(self, mock_has_collection): + """Test checking if a collection exists.""" + mock_has_collection.return_value = True + + def collection_exists(collection_name): + from pymilvus import utility + return utility.has_collection(collection_name) + + exists = collection_exists("test_collection") + assert exists is True + mock_has_collection.assert_called_once_with("test_collection") + + @patch('pymilvus.Collection') + def test_drop_collection(self, mock_collection_class): + """Test dropping a collection.""" + mock_collection = Mock() + mock_collection.drop = Mock() + mock_collection_class.return_value = mock_collection + + def drop_collection(collection_name): + from pymilvus import Collection + collection = Collection(collection_name) + collection.drop() + return True + + result = drop_collection("test_collection") + assert result is True + mock_collection.drop.assert_called_once() + + @patch('pymilvus.utility.list_collections') + def test_list_collections(self, mock_list_collections): + """Test listing all collections.""" + mock_list_collections.return_value = [ + "collection1", + "collection2", + "collection3" + ] + + def get_all_collections(): + from pymilvus import utility + return 
utility.list_collections() + + collections = get_all_collections() + assert len(collections) == 3 + assert "collection1" in collections + + def test_collection_with_partitions(self, mock_collection): + """Test creating and managing collection partitions.""" + mock_collection.create_partition = Mock() + mock_collection.has_partition = Mock(return_value=False) + mock_collection.partitions = [] + + def create_partitions(collection, partition_names): + for name in partition_names: + if not collection.has_partition(name): + collection.create_partition(name) + collection.partitions.append(name) + return collection.partitions + + partitions = create_partitions(mock_collection, ["partition1", "partition2"]) + assert len(partitions) == 2 + assert mock_collection.create_partition.call_count == 2 + + def test_collection_properties(self, mock_collection): + """Test getting collection properties.""" + mock_collection.num_entities = 10000 + mock_collection.description = "Test collection" + mock_collection.name = "test_coll" + mock_collection.schema = Mock() + + def get_collection_info(collection): + return { + "name": collection.name, + "description": collection.description, + "num_entities": collection.num_entities, + "schema": collection.schema + } + + info = get_collection_info(mock_collection) + assert info["name"] == "test_coll" + assert info["num_entities"] == 10000 + assert info["description"] == "Test collection" + + +class TestConnectionResilience: + """Test connection resilience and error recovery.""" + + @patch('pymilvus.connections.connect') + def test_automatic_reconnection(self, mock_connect): + """Test automatic reconnection after connection loss.""" + # Simulate connection loss and recovery + mock_connect.side_effect = [ + True, # Initial connection + Exception("Connection lost"), # Connection drops + Exception("Still disconnected"), # First retry fails + True # Reconnection succeeds + ] + + class ResilientConnection: + def __init__(self): + self.connected = 
False + self.retry_count = 0 + self.max_retries = 3 + self.connection_attempts = 0 + + def execute_with_retry(self, operation): + """Execute operation with automatic retry on connection failure.""" + for attempt in range(self.max_retries): + try: + if not self.connected or attempt > 0: + self._connect() + + result = operation() + self.retry_count = 0 # Reset retry count on success + return result + + except Exception as e: + self.retry_count += 1 + self.connected = False + + if self.retry_count >= self.max_retries: + raise Exception(f"Max retries exceeded: {e}") + + time.sleep(2 ** attempt) # Exponential backoff + + def _connect(self): + from pymilvus import connections + self.connection_attempts += 1 + if self.connection_attempts <= 2: + # First two connection attempts fail + self.connected = False + if self.connection_attempts == 1: + raise Exception("Connection lost") + else: + raise Exception("Still disconnected") + else: + # Third attempt succeeds + connections.connect(alias="resilient", host="localhost", port=19530) + self.connected = True + + conn = ResilientConnection() + + # Mock operation that will fail initially + operation_calls = 0 + def test_operation(): + nonlocal operation_calls + operation_calls += 1 + if operation_calls < 3 and not conn.connected: + raise Exception("Operation failed") + return "Success" + + with patch('time.sleep'): # Mock sleep for faster testing + result = conn.execute_with_retry(test_operation) + + # Operation should eventually succeed + assert result == "Success" + + @patch('pymilvus.connections.connect') + def test_connection_timeout_handling(self, mock_connect): + """Test handling of connection timeouts.""" + import socket + mock_connect.side_effect = socket.timeout("Connection timed out") + + def connect_with_timeout_handling(host, port, timeout=10): + from pymilvus import connections + + try: + return connections.connect( + alias="timeout_test", + host=host, + port=port, + timeout=timeout + ) + except socket.timeout as e: 
+ return f"Connection timeout: {e}" + except Exception as e: + return f"Connection error: {e}" + + result = connect_with_timeout_handling("localhost", 19530, timeout=5) + assert "Connection timeout" in result + + def test_connection_state_management(self): + """Test managing connection state across operations.""" + class ConnectionManager: + def __init__(self): + self.connections = {} + self.active_alias = None + + def add_connection(self, alias, host, port): + """Add a connection configuration.""" + self.connections[alias] = { + "host": host, + "port": port, + "connected": False + } + + def switch_connection(self, alias): + """Switch to a different connection.""" + if alias not in self.connections: + raise ValueError(f"Unknown connection alias: {alias}") + + # Disconnect from current if connected + if self.active_alias and self.connections[self.active_alias]["connected"]: + self.connections[self.active_alias]["connected"] = False + + self.active_alias = alias + self.connections[alias]["connected"] = True + return True + + def get_active_connection(self): + """Get the currently active connection.""" + if not self.active_alias: + return None + return self.connections.get(self.active_alias) + + def close_all(self): + """Close all connections.""" + for alias in self.connections: + self.connections[alias]["connected"] = False + self.active_alias = None + + manager = ConnectionManager() + manager.add_connection("primary", "localhost", 19530) + manager.add_connection("secondary", "remote", 8080) + + # Switch to primary + assert manager.switch_connection("primary") is True + active = manager.get_active_connection() + assert active["host"] == "localhost" + assert active["connected"] is True + + # Switch to secondary + manager.switch_connection("secondary") + assert manager.connections["primary"]["connected"] is False + assert manager.connections["secondary"]["connected"] is True + + # Close all + manager.close_all() + assert all(not conn["connected"] for conn in 
manager.connections.values()) diff --git a/vdb_benchmark/tests/tests/test_index_management.py b/vdb_benchmark/tests/tests/test_index_management.py new file mode 100755 index 00000000..7cf87f79 --- /dev/null +++ b/vdb_benchmark/tests/tests/test_index_management.py @@ -0,0 +1,825 @@ +""" +Unit tests for index management functionality in vdb-bench +""" +import pytest +import numpy as np +from unittest.mock import Mock, MagicMock, patch, call +import time +import json +from typing import Dict, Any, List +from concurrent.futures import ThreadPoolExecutor + + +class TestIndexCreation: + """Test index creation operations.""" + + def test_create_diskann_index(self, mock_collection): + """Test creating DiskANN index.""" + mock_collection.create_index.return_value = True + + def create_diskann_index(collection, field_name="embedding", params=None): + """Create DiskANN index on collection.""" + if params is None: + params = { + "metric_type": "L2", + "index_type": "DISKANN", + "params": { + "max_degree": 64, + "search_list_size": 200, + "pq_code_budget_gb": 0.1, + "build_algo": "IVF_PQ" + } + } + + try: + result = collection.create_index( + field_name=field_name, + index_params=params + ) + return { + "success": True, + "index_type": params["index_type"], + "field": field_name, + "params": params + } + except Exception as e: + return { + "success": False, + "error": str(e) + } + + result = create_diskann_index(mock_collection) + + assert result["success"] is True + assert result["index_type"] == "DISKANN" + mock_collection.create_index.assert_called_once() + + def test_create_hnsw_index(self, mock_collection): + """Test creating HNSW index.""" + mock_collection.create_index.return_value = True + + def create_hnsw_index(collection, field_name="embedding", params=None): + """Create HNSW index on collection.""" + if params is None: + params = { + "metric_type": "L2", + "index_type": "HNSW", + "params": { + "M": 16, + "efConstruction": 200 + } + } + + try: + result = 
collection.create_index( + field_name=field_name, + index_params=params + ) + return { + "success": True, + "index_type": params["index_type"], + "field": field_name, + "params": params + } + except Exception as e: + return { + "success": False, + "error": str(e) + } + + result = create_hnsw_index(mock_collection) + + assert result["success"] is True + assert result["index_type"] == "HNSW" + assert result["params"]["params"]["M"] == 16 + + def test_create_ivf_index(self, mock_collection): + """Test creating IVF index variants.""" + class IVFIndexBuilder: + def __init__(self, collection): + self.collection = collection + + def create_ivf_flat(self, field_name, nlist=128): + """Create IVF_FLAT index.""" + params = { + "metric_type": "L2", + "index_type": "IVF_FLAT", + "params": {"nlist": nlist} + } + return self._create_index(field_name, params) + + def create_ivf_sq8(self, field_name, nlist=128): + """Create IVF_SQ8 index.""" + params = { + "metric_type": "L2", + "index_type": "IVF_SQ8", + "params": {"nlist": nlist} + } + return self._create_index(field_name, params) + + def create_ivf_pq(self, field_name, nlist=128, m=8, nbits=8): + """Create IVF_PQ index.""" + params = { + "metric_type": "L2", + "index_type": "IVF_PQ", + "params": { + "nlist": nlist, + "m": m, + "nbits": nbits + } + } + return self._create_index(field_name, params) + + def _create_index(self, field_name, params): + """Internal method to create index.""" + try: + self.collection.create_index( + field_name=field_name, + index_params=params + ) + return {"success": True, "params": params} + except Exception as e: + return {"success": False, "error": str(e)} + + mock_collection.create_index.return_value = True + builder = IVFIndexBuilder(mock_collection) + + # Test IVF_FLAT + result = builder.create_ivf_flat("embedding", nlist=256) + assert result["success"] is True + assert result["params"]["index_type"] == "IVF_FLAT" + + # Test IVF_SQ8 + result = builder.create_ivf_sq8("embedding", nlist=512) + 
assert result["success"] is True + assert result["params"]["index_type"] == "IVF_SQ8" + + # Test IVF_PQ + result = builder.create_ivf_pq("embedding", nlist=256, m=16) + assert result["success"] is True + assert result["params"]["index_type"] == "IVF_PQ" + assert result["params"]["params"]["m"] == 16 + + def test_index_creation_with_retry(self, mock_collection): + """Test index creation with retry logic.""" + # Simulate failures then success + mock_collection.create_index.side_effect = [ + Exception("Index creation failed"), + Exception("Still failing"), + True + ] + + def create_index_with_retry(collection, params, max_retries=3, backoff=2): + """Create index with exponential backoff retry.""" + for attempt in range(max_retries): + try: + collection.create_index( + field_name="embedding", + index_params=params + ) + return { + "success": True, + "attempts": attempt + 1 + } + except Exception as e: + if attempt == max_retries - 1: + return { + "success": False, + "attempts": attempt + 1, + "error": str(e) + } + time.sleep(backoff ** attempt) + + return {"success": False, "attempts": max_retries} + + params = { + "metric_type": "L2", + "index_type": "DISKANN", + "params": {"max_degree": 64} + } + + with patch('time.sleep'): # Speed up test + result = create_index_with_retry(mock_collection, params) + + assert result["success"] is True + assert result["attempts"] == 3 + assert mock_collection.create_index.call_count == 3 + + +class TestIndexManagement: + """Test index management operations.""" + + def test_index_status_check(self, mock_collection): + """Test checking index status.""" + # Create a proper mock index object + mock_index = Mock() + mock_index.params = {"index_type": "DISKANN"} + mock_index.progress = 100 + mock_index.state = "Finished" + + # Set the index attribute on collection + mock_collection.index = mock_index + + class IndexManager: + def __init__(self, collection): + self.collection = collection + + def get_index_status(self): + """Get current 
index status.""" + try: + index = self.collection.index + return { + "exists": True, + "type": index.params.get("index_type"), + "progress": index.progress, + "state": index.state, + "params": index.params + } + except: + return { + "exists": False, + "type": None, + "progress": 0, + "state": "Not Created" + } + + def is_index_ready(self): + """Check if index is ready for use.""" + status = self.get_index_status() + return ( + status["exists"] and + status["state"] == "Finished" and + status["progress"] == 100 + ) + + def wait_for_index(self, timeout=300, check_interval=5): + """Wait for index to be ready.""" + start_time = time.time() + + while time.time() - start_time < timeout: + if self.is_index_ready(): + return True + time.sleep(check_interval) + + return False + + manager = IndexManager(mock_collection) + + status = manager.get_index_status() + assert status["exists"] is True + assert status["type"] == "DISKANN" + assert status["progress"] == 100 + + assert manager.is_index_ready() is True + + def test_drop_index(self, mock_collection): + """Test dropping an index.""" + mock_collection.drop_index.return_value = None + + def drop_index(collection, field_name="embedding"): + """Drop index from collection.""" + try: + collection.drop_index(field_name=field_name) + return { + "success": True, + "field": field_name, + "message": f"Index dropped for field {field_name}" + } + except Exception as e: + return { + "success": False, + "error": str(e) + } + + result = drop_index(mock_collection) + + assert result["success"] is True + assert result["field"] == "embedding" + mock_collection.drop_index.assert_called_once_with(field_name="embedding") + + def test_rebuild_index(self, mock_collection): + """Test rebuilding an index.""" + mock_collection.drop_index.return_value = None + mock_collection.create_index.return_value = True + + class IndexRebuilder: + def __init__(self, collection): + self.collection = collection + + def rebuild_index(self, field_name, new_params): 
+ """Rebuild index with new parameters.""" + steps = [] + + try: + # Step 1: Drop existing index + self.collection.drop_index(field_name=field_name) + steps.append("Index dropped") + + # Step 2: Wait for drop to complete + time.sleep(1) + steps.append("Waited for drop completion") + + # Step 3: Create new index + self.collection.create_index( + field_name=field_name, + index_params=new_params + ) + steps.append("New index created") + + return { + "success": True, + "steps": steps, + "new_params": new_params + } + + except Exception as e: + return { + "success": False, + "steps": steps, + "error": str(e) + } + + rebuilder = IndexRebuilder(mock_collection) + + new_params = { + "metric_type": "COSINE", + "index_type": "HNSW", + "params": {"M": 32, "efConstruction": 400} + } + + with patch('time.sleep'): # Speed up test + result = rebuilder.rebuild_index("embedding", new_params) + + assert result["success"] is True + assert len(result["steps"]) == 3 + assert mock_collection.drop_index.called + assert mock_collection.create_index.called + + def test_index_comparison(self): + """Test comparing different index configurations.""" + class IndexComparator: + def __init__(self): + self.results = {} + + def add_result(self, index_type, metrics): + """Add benchmark result for an index type.""" + self.results[index_type] = metrics + + def compare(self): + """Compare all index results.""" + if len(self.results) < 2: + return None + + comparison = { + "indexes": [], + "best_qps": None, + "best_recall": None, + "best_build_time": None + } + + best_qps = 0 + best_recall = 0 + best_build_time = float('inf') + + for index_type, metrics in self.results.items(): + comparison["indexes"].append({ + "type": index_type, + "qps": metrics.get("qps", 0), + "recall": metrics.get("recall", 0), + "build_time": metrics.get("build_time", 0), + "memory_usage": metrics.get("memory_usage", 0) + }) + + if metrics.get("qps", 0) > best_qps: + best_qps = metrics["qps"] + comparison["best_qps"] = 
index_type + + if metrics.get("recall", 0) > best_recall: + best_recall = metrics["recall"] + comparison["best_recall"] = index_type + + if metrics.get("build_time", float('inf')) < best_build_time: + best_build_time = metrics["build_time"] + comparison["best_build_time"] = index_type + + return comparison + + def get_recommendation(self, requirements): + """Get index recommendation based on requirements.""" + if not self.results: + return None + + scores = {} + + for index_type, metrics in self.results.items(): + score = 0 + + # Weight different factors based on requirements + if requirements.get("prioritize_speed"): + score += metrics.get("qps", 0) * 2 + + if requirements.get("prioritize_accuracy"): + score += metrics.get("recall", 0) * 1000 + + if requirements.get("memory_constrained"): + # Penalize high memory usage + score -= metrics.get("memory_usage", 0) * 0.1 + + if requirements.get("fast_build"): + # Penalize slow build time + score -= metrics.get("build_time", 0) * 10 + + scores[index_type] = score + + best_index = max(scores, key=scores.get) + + return { + "recommended": best_index, + "score": scores[best_index], + "all_scores": scores + } + + comparator = IndexComparator() + + # Add sample results + comparator.add_result("DISKANN", { + "qps": 1500, + "recall": 0.95, + "build_time": 300, + "memory_usage": 2048 + }) + + comparator.add_result("HNSW", { + "qps": 1200, + "recall": 0.98, + "build_time": 150, + "memory_usage": 4096 + }) + + comparator.add_result("IVF_PQ", { + "qps": 2000, + "recall": 0.90, + "build_time": 100, + "memory_usage": 1024 + }) + + comparison = comparator.compare() + + assert comparison["best_qps"] == "IVF_PQ" + assert comparison["best_recall"] == "HNSW" + assert comparison["best_build_time"] == "IVF_PQ" + + # Test recommendation + requirements = { + "prioritize_accuracy": True, + "memory_constrained": False + } + + recommendation = comparator.get_recommendation(requirements) + assert recommendation["recommended"] == "HNSW" + + 
+class TestIndexOptimization: + """Test index optimization strategies.""" + + def test_parameter_tuning(self, mock_collection): + """Test automatic parameter tuning for indexes.""" + class ParameterTuner: + def __init__(self, collection): + self.collection = collection + self.test_results = [] + + def tune_diskann(self, test_vectors, ground_truth): + """Tune DiskANN parameters.""" + param_grid = [ + {"max_degree": 32, "search_list_size": 100}, + {"max_degree": 64, "search_list_size": 200}, + {"max_degree": 96, "search_list_size": 300} + ] + + best_params = None + best_score = 0 + + for params in param_grid: + score = self._test_params( + "DISKANN", + params, + test_vectors, + ground_truth + ) + + if score > best_score: + best_score = score + best_params = params + + self.test_results.append({ + "params": params, + "score": score + }) + + return best_params, best_score + + def tune_hnsw(self, test_vectors, ground_truth): + """Tune HNSW parameters.""" + param_grid = [ + {"M": 8, "efConstruction": 100}, + {"M": 16, "efConstruction": 200}, + {"M": 32, "efConstruction": 400} + ] + + best_params = None + best_score = 0 + + for params in param_grid: + score = self._test_params( + "HNSW", + params, + test_vectors, + ground_truth + ) + + if score > best_score: + best_score = score + best_params = params + + self.test_results.append({ + "params": params, + "score": score + }) + + return best_params, best_score + + def _test_params(self, index_type, params, test_vectors, ground_truth): + """Test specific parameters and return score.""" + # Simulated testing (in reality would rebuild index and test) + # Score based on parameter values (simplified) + + if index_type == "DISKANN": + score = params["max_degree"] * 0.5 + params["search_list_size"] * 0.2 + elif index_type == "HNSW": + score = params["M"] * 2 + params["efConstruction"] * 0.1 + else: + score = 0 + + # Add some randomness + score += np.random.random() * 10 + + return score + + tuner = ParameterTuner(mock_collection) + 
+ # Create test data + test_vectors = np.random.randn(100, 128).astype(np.float32) + ground_truth = np.random.randint(0, 1000, (100, 10)) + + # Tune DiskANN + best_diskann, diskann_score = tuner.tune_diskann(test_vectors, ground_truth) + assert best_diskann is not None + assert diskann_score > 0 + + # Tune HNSW + best_hnsw, hnsw_score = tuner.tune_hnsw(test_vectors, ground_truth) + assert best_hnsw is not None + assert hnsw_score > 0 + + # Check that results were recorded + assert len(tuner.test_results) == 6 # 3 for each index type + + def test_adaptive_index_selection(self): + """Test adaptive index selection based on workload.""" + class AdaptiveIndexSelector: + def __init__(self): + self.workload_history = [] + self.current_index = None + + def analyze_workload(self, queries): + """Analyze query workload characteristics.""" + characteristics = { + "query_count": len(queries), + "dimension": queries.shape[1] if len(queries) > 0 else 0, + "distribution": self._analyze_distribution(queries), + "sparsity": self._calculate_sparsity(queries), + "clustering": self._analyze_clustering(queries) + } + + self.workload_history.append({ + "timestamp": time.time(), + "characteristics": characteristics + }) + + return characteristics + + def select_index(self, characteristics, dataset_size): + """Select best index for workload characteristics.""" + # Simple rule-based selection + + if dataset_size < 100000: + # Small dataset - use simple index + return "IVF_FLAT" + + elif dataset_size < 1000000: + # Medium dataset + if characteristics["clustering"] > 0.7: + # Highly clustered - IVF works well + return "IVF_PQ" + else: + # More uniform - HNSW + return "HNSW" + + else: + # Large dataset + if characteristics["sparsity"] > 0.5: + # Sparse vectors - specialized index + return "SPARSE_IVF" + elif characteristics["dimension"] > 1000: + # High dimension - DiskANN with PQ + return "DISKANN" + else: + # Default to HNSW for good all-around performance + return "HNSW" + + def 
_analyze_distribution(self, queries): + """Analyze query distribution.""" + if len(queries) == 0: + return "unknown" + + # Simple variance check + variance = np.var(queries) + if variance < 0.5: + return "concentrated" + elif variance < 2.0: + return "normal" + else: + return "scattered" + + def _calculate_sparsity(self, queries): + """Calculate sparsity of queries.""" + if len(queries) == 0: + return 0 + + zero_count = np.sum(queries == 0) + total_elements = queries.size + + return zero_count / total_elements if total_elements > 0 else 0 + + def _analyze_clustering(self, queries): + """Analyze clustering tendency.""" + # Simplified clustering score + if len(queries) < 10: + return 0 + + # Calculate pairwise distances for small sample + sample = queries[:min(100, len(queries))] + distances = [] + + for i in range(len(sample)): + for j in range(i + 1, len(sample)): + dist = np.linalg.norm(sample[i] - sample[j]) + distances.append(dist) + + if not distances: + return 0 + + # High variance in distances indicates clustering + distance_var = np.var(distances) + return min(distance_var / 10, 1.0) # Normalize to [0, 1] + + selector = AdaptiveIndexSelector() + + # Test with different workloads + + # Sparse workload + sparse_queries = np.random.randn(100, 2000).astype(np.float32) + sparse_queries[sparse_queries < 1] = 0 # Make sparse + + characteristics = selector.analyze_workload(sparse_queries) + selected_index = selector.select_index(characteristics, 5000000) + + assert characteristics["sparsity"] > 0.3 + + # Dense clustered workload + clustered_queries = [] + for _ in range(5): + center = np.random.randn(128) * 10 + cluster = center + np.random.randn(20, 128) * 0.1 + clustered_queries.append(cluster) + clustered_queries = np.vstack(clustered_queries).astype(np.float32) + + characteristics = selector.analyze_workload(clustered_queries) + selected_index = selector.select_index(characteristics, 500000) + + assert selected_index in ["IVF_PQ", "HNSW"] + + def 
test_index_warm_up(self, mock_collection): + """Test index warm-up procedures.""" + class IndexWarmUp: + def __init__(self, collection): + self.collection = collection + self.warm_up_stats = [] + + def warm_up(self, num_queries=100, batch_size=10): + """Warm up index with sample queries.""" + total_time = 0 + queries_executed = 0 + + for batch in range(0, num_queries, batch_size): + # Generate random queries + batch_queries = np.random.randn( + min(batch_size, num_queries - batch), + 128 + ).astype(np.float32) + + start = time.time() + + # Execute warm-up queries + self.collection.search( + data=batch_queries.tolist(), + anns_field="embedding", + param={"metric_type": "L2"}, + limit=10 + ) + + elapsed = time.time() - start + total_time += elapsed + queries_executed += len(batch_queries) + + self.warm_up_stats.append({ + "batch": batch // batch_size, + "queries": len(batch_queries), + "time": elapsed, + "qps": len(batch_queries) / elapsed if elapsed > 0 else 0 + }) + + return { + "total_queries": queries_executed, + "total_time": total_time, + "avg_qps": queries_executed / total_time if total_time > 0 else 0, + "stats": self.warm_up_stats + } + + def adaptive_warm_up(self, target_qps=100, max_queries=1000): + """Adaptive warm-up that stops when performance stabilizes.""" + stable_threshold = 0.1 # 10% variation + window_size = 5 + recent_qps = [] + + batch_size = 10 + total_queries = 0 + + while total_queries < max_queries: + queries = np.random.randn(batch_size, 128).astype(np.float32) + + start = time.time() + self.collection.search( + data=queries.tolist(), + anns_field="embedding", + param={"metric_type": "L2"}, + limit=10 + ) + elapsed = time.time() - start + + qps = batch_size / elapsed if elapsed > 0 else 0 + recent_qps.append(qps) + total_queries += batch_size + + # Check if performance is stable + if len(recent_qps) >= window_size: + recent = recent_qps[-window_size:] + avg = sum(recent) / len(recent) + variance = sum((q - avg) ** 2 for q in recent) / 
len(recent) + cv = (variance ** 0.5) / avg if avg > 0 else 1 + + if cv < stable_threshold and avg >= target_qps: + return { + "warmed_up": True, + "queries_used": total_queries, + "final_qps": avg, + "stabilized": True + } + + return { + "warmed_up": True, + "queries_used": total_queries, + "final_qps": recent_qps[-1] if recent_qps else 0, + "stabilized": False + } + + mock_collection.search.return_value = [[Mock(id=i, distance=0.1*i) for i in range(10)]] + + warmer = IndexWarmUp(mock_collection) + + # Test basic warm-up + with patch('time.time', side_effect=[0, 0.1, 0.2, 0.3, 0.4, 0.5] * 20): + result = warmer.warm_up(num_queries=50, batch_size=10) + + assert result["total_queries"] == 50 + assert len(warmer.warm_up_stats) == 5 + + # Test adaptive warm-up + warmer2 = IndexWarmUp(mock_collection) + + with patch('time.time', side_effect=[i * 0.01 for i in range(200)]): + result = warmer2.adaptive_warm_up(target_qps=100, max_queries=100) + + assert result["warmed_up"] is True + assert result["queries_used"] <= 100 diff --git a/vdb_benchmark/tests/tests/test_load_vdb.py b/vdb_benchmark/tests/tests/test_load_vdb.py new file mode 100755 index 00000000..772f2f93 --- /dev/null +++ b/vdb_benchmark/tests/tests/test_load_vdb.py @@ -0,0 +1,530 @@ +""" +Unit tests for vector loading functionality in vdb-bench +""" +import pytest +import numpy as np +from unittest.mock import Mock, MagicMock, patch, call +import time +from typing import List, Generator +import json + + +class TestVectorGeneration: + """Test vector generation utilities.""" + + def test_uniform_vector_generation(self): + """Test generating vectors with uniform distribution.""" + def generate_uniform_vectors(num_vectors, dimension, seed=None): + if seed is not None: + np.random.seed(seed) + return np.random.uniform(-1, 1, size=(num_vectors, dimension)).astype(np.float32) + + vectors = generate_uniform_vectors(100, 128, seed=42) + + assert vectors.shape == (100, 128) + assert vectors.dtype == np.float32 + assert 
vectors.min() >= -1 + assert vectors.max() <= 1 + + # Test reproducibility with seed + vectors2 = generate_uniform_vectors(100, 128, seed=42) + np.testing.assert_array_equal(vectors, vectors2) + + def test_normal_vector_generation(self): + """Test generating vectors with normal distribution.""" + def generate_normal_vectors(num_vectors, dimension, mean=0, std=1, seed=None): + if seed is not None: + np.random.seed(seed) + return np.random.normal(mean, std, size=(num_vectors, dimension)).astype(np.float32) + + vectors = generate_normal_vectors(1000, 256, seed=42) + + assert vectors.shape == (1000, 256) + assert vectors.dtype == np.float32 + + # Check distribution properties (should be close to normal) + assert -0.1 < vectors.mean() < 0.1 # Mean should be close to 0 + assert 0.9 < vectors.std() < 1.1 # Std should be close to 1 + + def test_normalized_vector_generation(self): + """Test generating L2-normalized vectors.""" + def generate_normalized_vectors(num_vectors, dimension, seed=None): + if seed is not None: + np.random.seed(seed) + + vectors = np.random.randn(num_vectors, dimension).astype(np.float32) + # L2 normalize each vector + norms = np.linalg.norm(vectors, axis=1, keepdims=True) + return vectors / norms + + vectors = generate_normalized_vectors(50, 64, seed=42) + + assert vectors.shape == (50, 64) + + # Check that all vectors are normalized + norms = np.linalg.norm(vectors, axis=1) + np.testing.assert_array_almost_equal(norms, np.ones(50), decimal=5) + + def test_chunked_vector_generation(self): + """Test generating vectors in chunks for memory efficiency.""" + def generate_vectors_chunked(total_vectors, dimension, chunk_size=1000): + """Generate vectors in chunks to manage memory.""" + num_chunks = (total_vectors + chunk_size - 1) // chunk_size + + for i in range(num_chunks): + start_idx = i * chunk_size + end_idx = min(start_idx + chunk_size, total_vectors) + chunk_vectors = end_idx - start_idx + + yield np.random.randn(chunk_vectors, 
dimension).astype(np.float32) + + # Generate 10000 vectors in chunks of 1000 + all_vectors = [] + for chunk in generate_vectors_chunked(10000, 128, chunk_size=1000): + all_vectors.append(chunk) + + assert len(all_vectors) == 10 + assert all_vectors[0].shape == (1000, 128) + + # Concatenate and verify total + concatenated = np.vstack(all_vectors) + assert concatenated.shape == (10000, 128) + + def test_vector_generation_with_ids(self): + """Test generating vectors with associated IDs.""" + def generate_vectors_with_ids(num_vectors, dimension, start_id=0): + vectors = np.random.randn(num_vectors, dimension).astype(np.float32) + ids = np.arange(start_id, start_id + num_vectors, dtype=np.int64) + return ids, vectors + + ids, vectors = generate_vectors_with_ids(100, 256, start_id=1000) + + assert len(ids) == 100 + assert ids[0] == 1000 + assert ids[-1] == 1099 + assert vectors.shape == (100, 256) + + def test_vector_generation_progress_tracking(self): + """Test tracking progress during vector generation.""" + def generate_with_progress(num_vectors, dimension, chunk_size=100): + total_generated = 0 + progress_updates = [] + + for chunk_num in range(0, num_vectors, chunk_size): + chunk_end = min(chunk_num + chunk_size, num_vectors) + chunk_size_actual = chunk_end - chunk_num + + vectors = np.random.randn(chunk_size_actual, dimension).astype(np.float32) + + total_generated += chunk_size_actual + progress = (total_generated / num_vectors) * 100 + progress_updates.append(progress) + + yield vectors, progress + + progress_list = [] + vector_list = [] + + for vectors, progress in generate_with_progress(1000, 128, chunk_size=200): + vector_list.append(vectors) + progress_list.append(progress) + + assert len(progress_list) == 5 + assert progress_list[-1] == 100.0 + assert all(p > 0 for p in progress_list) + + +class TestVectorLoading: + """Test vector loading into database.""" + + def test_batch_insertion(self, mock_collection): + """Test inserting vectors in batches.""" + 
inserted_data = [] + mock_collection.insert.side_effect = lambda data: inserted_data.append(data) + + def insert_vectors_batch(collection, vectors, batch_size=1000): + """Insert vectors in batches.""" + num_vectors = len(vectors) + total_inserted = 0 + + for i in range(0, num_vectors, batch_size): + batch = vectors[i:i + batch_size] + collection.insert([batch]) + total_inserted += len(batch) + + return total_inserted + + vectors = np.random.randn(5000, 128).astype(np.float32) + total = insert_vectors_batch(mock_collection, vectors, batch_size=1000) + + assert total == 5000 + assert mock_collection.insert.call_count == 5 + + def test_insertion_with_error_handling(self, mock_collection): + """Test vector insertion with error handling.""" + # Simulate occasional insertion failures + call_count = 0 + def insert_side_effect(data): + nonlocal call_count + call_count += 1 + if call_count == 1: + raise Exception("Insert failed") + return Mock(primary_keys=list(range(len(data[0])))) + + mock_collection.insert.side_effect = insert_side_effect + + def insert_with_retry(collection, vectors, max_retries=3): + """Insert vectors with retry on failure.""" + for attempt in range(max_retries): + try: + result = collection.insert([vectors]) + return result + except Exception as e: + if attempt == max_retries - 1: + raise + time.sleep(1) + return None + + vectors = np.random.randn(100, 128).astype(np.float32) + + with patch('time.sleep'): + result = insert_with_retry(mock_collection, vectors) + + assert result is not None + assert mock_collection.insert.call_count == 2 # Failed once, succeeded on retry + + def test_parallel_insertion(self, mock_collection): + """Test parallel vector insertion using multiple threads/processes.""" + from concurrent.futures import ThreadPoolExecutor + + def insert_chunk(args): + collection, chunk_id, vectors = args + collection.insert([vectors]) + return chunk_id, len(vectors) + + def parallel_insert(collection, vectors, num_workers=4, chunk_size=1000): 
+ """Insert vectors in parallel.""" + chunks = [] + for i in range(0, len(vectors), chunk_size): + chunk = vectors[i:i + chunk_size] + chunks.append((collection, i // chunk_size, chunk)) + + with ThreadPoolExecutor(max_workers=num_workers) as executor: + results = list(executor.map(insert_chunk, chunks)) + + total_inserted = sum(count for _, count in results) + return total_inserted + + vectors = np.random.randn(4000, 128).astype(np.float32) + + # Mock the insert to track calls + inserted_chunks = [] + mock_collection.insert.side_effect = lambda data: inserted_chunks.append(len(data[0])) + + total = parallel_insert(mock_collection, vectors, num_workers=2, chunk_size=1000) + + assert total == 4000 + assert len(inserted_chunks) == 4 + + def test_insertion_with_metadata(self, mock_collection): + """Test inserting vectors with additional metadata.""" + def insert_vectors_with_metadata(collection, vectors, metadata): + """Insert vectors along with metadata.""" + data = [ + vectors, + metadata.get("ids", list(range(len(vectors)))), + metadata.get("tags", ["default"] * len(vectors)) + ] + + result = collection.insert(data) + return result + + vectors = np.random.randn(100, 128).astype(np.float32) + metadata = { + "ids": list(range(1000, 1100)), + "tags": [f"tag_{i % 10}" for i in range(100)] + } + + mock_collection.insert.return_value = Mock(primary_keys=metadata["ids"]) + + result = insert_vectors_with_metadata(mock_collection, vectors, metadata) + + assert result.primary_keys == metadata["ids"] + mock_collection.insert.assert_called_once() + + @patch('time.time') + def test_insertion_rate_monitoring(self, mock_time, mock_collection): + """Test monitoring insertion rate and throughput.""" + # Start at 1 instead of 0 to avoid issues with 0 being falsy + time_sequence = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0] + mock_time.side_effect = time_sequence + + class InsertionMonitor: + def __init__(self): + self.total_vectors = 0 + self.start_time = None + self.batch_times = 
[] + self.last_time = None + + def start(self): + self.start_time = time.time() + self.last_time = self.start_time + + def record_batch(self, batch_size): + current_time = time.time() + if self.start_time is not None: + # Calculate elapsed since last batch + elapsed = current_time - self.last_time + self.last_time = current_time + self.batch_times.append(current_time) + self.total_vectors += batch_size + + # Calculate throughput + total_elapsed = current_time - self.start_time + throughput = self.total_vectors / total_elapsed if total_elapsed > 0 else 0 + + return { + "batch_size": batch_size, + "batch_time": elapsed, + "total_vectors": self.total_vectors, + "throughput": throughput + } + return None + + def get_summary(self): + # Check if we have data to summarize + if self.start_time is None or len(self.batch_times) == 0: + return None + + # Calculate total time from start to last batch + total_time = self.batch_times[-1] - self.start_time + + # Return summary if we have valid data + if self.total_vectors > 0: + return { + "total_vectors": self.total_vectors, + "total_time": total_time, + "average_throughput": self.total_vectors / total_time if total_time > 0 else 0 + } + + return None + + monitor = InsertionMonitor() + monitor.start() # Uses time value 1.0 + + # Simulate inserting batches (uses time values 2.0-6.0) + stats = [] + for i in range(5): + stat = monitor.record_batch(1000) + if stat: + stats.append(stat) + + summary = monitor.get_summary() + + assert summary is not None + assert summary["total_vectors"] == 5000 + assert summary["total_time"] == 5.0 # From time 1.0 to time 6.0 + assert summary["average_throughput"] == 1000.0 # 5000 vectors / 5 seconds + + def test_load_checkpoint_resume(self, test_data_dir): + """Test checkpoint and resume functionality for large loads.""" + checkpoint_file = test_data_dir / "checkpoint.json" + + class LoadCheckpoint: + def __init__(self, checkpoint_path): + self.checkpoint_path = checkpoint_path + self.state = 
self.load_checkpoint() + + def load_checkpoint(self): + """Load checkpoint from file if exists.""" + if self.checkpoint_path.exists(): + with open(self.checkpoint_path, 'r') as f: + return json.load(f) + return {"last_batch": 0, "total_inserted": 0} + + def save_checkpoint(self, batch_num, total_inserted): + """Save current progress to checkpoint.""" + self.state = { + "last_batch": batch_num, + "total_inserted": total_inserted, + "timestamp": time.time() + } + with open(self.checkpoint_path, 'w') as f: + json.dump(self.state, f) + + def get_resume_point(self): + """Get the batch number to resume from.""" + return self.state["last_batch"] + + def clear(self): + """Clear checkpoint after successful completion.""" + if self.checkpoint_path.exists(): + self.checkpoint_path.unlink() + self.state = {"last_batch": 0, "total_inserted": 0} + + checkpoint = LoadCheckpoint(checkpoint_file) + + # Simulate partial load + checkpoint.save_checkpoint(5, 5000) + assert checkpoint.get_resume_point() == 5 + + # Simulate resume + checkpoint2 = LoadCheckpoint(checkpoint_file) + assert checkpoint2.get_resume_point() == 5 + assert checkpoint2.state["total_inserted"] == 5000 + + # Clear checkpoint + checkpoint2.clear() + assert not checkpoint_file.exists() + + +class TestLoadOptimization: + """Test load optimization strategies.""" + + def test_dynamic_batch_sizing(self): + """Test dynamic batch size adjustment based on performance.""" + class DynamicBatchSizer: + def __init__(self, initial_size=1000, min_size=100, max_size=10000): + self.current_size = initial_size + self.min_size = min_size + self.max_size = max_size + self.history = [] + + def adjust(self, insertion_time, batch_size): + """Adjust batch size based on insertion performance.""" + throughput = batch_size / insertion_time if insertion_time > 0 else 0 + self.history.append((batch_size, throughput)) + + if len(self.history) >= 3: + # Calculate trend + recent_throughputs = [tp for _, tp in self.history[-3:]] + avg_throughput = 
sum(recent_throughputs) / len(recent_throughputs) + + if throughput > avg_throughput * 1.1: + # Performance improving, increase batch size + self.current_size = min( + int(self.current_size * 1.2), + self.max_size + ) + elif throughput < avg_throughput * 0.9: + # Performance degrading, decrease batch size + self.current_size = max( + int(self.current_size * 0.8), + self.min_size + ) + + return self.current_size + + sizer = DynamicBatchSizer(initial_size=1000) + + # Simulate good performance - should increase batch size + new_size = sizer.adjust(1.0, 1000) # 1000 vectors/sec + new_size = sizer.adjust(0.9, 1000) # 1111 vectors/sec + new_size = sizer.adjust(0.8, 1000) # 1250 vectors/sec + new_size = sizer.adjust(0.7, new_size) # Improving performance + + assert new_size > 1000 # Should have increased + + # Simulate degrading performance - should decrease batch size + sizer2 = DynamicBatchSizer(initial_size=5000) + new_size = sizer2.adjust(1.0, 5000) # 5000 vectors/sec + new_size = sizer2.adjust(1.2, 5000) # 4166 vectors/sec + new_size = sizer2.adjust(1.5, 5000) # 3333 vectors/sec + new_size = sizer2.adjust(2.0, new_size) # Degrading performance + + assert new_size < 5000 # Should have decreased + + def test_memory_aware_loading(self): + """Test memory-aware vector loading.""" + import psutil + + class MemoryAwareLoader: + def __init__(self, memory_threshold=0.8): + self.memory_threshold = memory_threshold + self.base_batch_size = 1000 + + def get_memory_usage(self): + """Get current memory usage percentage.""" + return psutil.virtual_memory().percent / 100 + + def calculate_safe_batch_size(self, vector_dimension): + """Calculate safe batch size based on available memory.""" + memory_usage = self.get_memory_usage() + + if memory_usage > self.memory_threshold: + # Reduce batch size when memory is high + reduction_factor = 1.0 - (memory_usage - self.memory_threshold) + return max(100, int(self.base_batch_size * reduction_factor)) + + # Calculate based on vector size + 
bytes_per_vector = vector_dimension * 4 # float32 + available_memory = (1.0 - memory_usage) * psutil.virtual_memory().total + max_vectors = int(available_memory * 0.5 / bytes_per_vector) # Use 50% of available + + return min(max_vectors, self.base_batch_size) + + def should_gc(self): + """Determine if garbage collection should be triggered.""" + return self.get_memory_usage() > 0.7 + + with patch('psutil.virtual_memory') as mock_memory: + # Simulate different memory conditions + mock_memory.return_value = Mock(percent=60, total=16 * 1024**3) # 60% used, 16GB total + + loader = MemoryAwareLoader() + batch_size = loader.calculate_safe_batch_size(1536) + + assert batch_size > 0 + assert not loader.should_gc() + + # Simulate high memory usage + mock_memory.return_value = Mock(percent=85, total=16 * 1024**3) # 85% used + + batch_size = loader.calculate_safe_batch_size(1536) + assert batch_size < loader.base_batch_size # Should be reduced + assert loader.should_gc() + + def test_flush_optimization(self, mock_collection): + """Test optimizing flush operations during loading.""" + flush_count = 0 + + def mock_flush(): + nonlocal flush_count + flush_count += 1 + time.sleep(0.1) # Simulate flush time + + mock_collection.flush = mock_flush + + class FlushOptimizer: + def __init__(self, flush_interval=10000, time_interval=60): + self.flush_interval = flush_interval + self.time_interval = time_interval + self.vectors_since_flush = 0 + self.last_flush_time = time.time() + + def should_flush(self, vectors_inserted): + """Determine if flush should be triggered.""" + self.vectors_since_flush += vectors_inserted + current_time = time.time() + + # Flush based on vector count or time + if (self.vectors_since_flush >= self.flush_interval or + current_time - self.last_flush_time >= self.time_interval): + return True + return False + + def flush(self, collection): + """Perform flush and reset counters.""" + collection.flush() + self.vectors_since_flush = 0 + self.last_flush_time = 
time.time() + + optimizer = FlushOptimizer(flush_interval=5000) + + with patch('time.sleep'): # Speed up test + # Simulate loading vectors + for i in range(10): + if optimizer.should_flush(1000): + optimizer.flush(mock_collection) + + assert flush_count == 2 # Should have flushed twice (at 5000 and 10000) diff --git a/vdb_benchmark/tests/tests/test_simple_bench.py b/vdb_benchmark/tests/tests/test_simple_bench.py new file mode 100755 index 00000000..c322a3d8 --- /dev/null +++ b/vdb_benchmark/tests/tests/test_simple_bench.py @@ -0,0 +1,766 @@ +""" +Unit tests for benchmarking functionality in vdb-bench +""" +import pytest +import numpy as np +from unittest.mock import Mock, MagicMock, patch, call +import time +import multiprocessing as mp +from typing import List, Dict, Any +import statistics +import json +from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor + + +class TestBenchmarkExecution: + """Test benchmark execution and query operations.""" + + def test_single_query_execution(self, mock_collection): + """Test executing a single query.""" + # Mock search result + mock_collection.search.return_value = [[ + Mock(id=1, distance=0.1), + Mock(id=2, distance=0.2), + Mock(id=3, distance=0.3) + ]] + + def execute_single_query(collection, query_vector, top_k=10): + """Execute a single vector search query.""" + start_time = time.time() + + results = collection.search( + data=[query_vector], + anns_field="embedding", + param={"metric_type": "L2", "params": {"nprobe": 10}}, + limit=top_k + ) + + end_time = time.time() + latency = end_time - start_time + + return { + "latency": latency, + "num_results": len(results[0]) if results else 0, + "top_result": results[0][0].id if results and results[0] else None + } + + query = np.random.randn(128).astype(np.float32) + result = execute_single_query(mock_collection, query) + + assert result["latency"] >= 0 + assert result["num_results"] == 3 + assert result["top_result"] == 1 + 
mock_collection.search.assert_called_once() + + def test_batch_query_execution(self, mock_collection): + """Test executing batch queries.""" + # Mock batch search results + mock_results = [ + [Mock(id=i, distance=0.1*i) for i in range(1, 6)] + for _ in range(10) + ] + mock_collection.search.return_value = mock_results + + def execute_batch_queries(collection, query_vectors, top_k=10): + """Execute batch vector search queries.""" + start_time = time.time() + + results = collection.search( + data=query_vectors, + anns_field="embedding", + param={"metric_type": "L2"}, + limit=top_k + ) + + end_time = time.time() + total_latency = end_time - start_time + + return { + "total_latency": total_latency, + "queries_per_second": len(query_vectors) / total_latency if total_latency > 0 else 0, + "num_queries": len(query_vectors), + "results_per_query": [len(r) for r in results] + } + + queries = np.random.randn(10, 128).astype(np.float32) + result = execute_batch_queries(mock_collection, queries) + + assert result["num_queries"] == 10 + assert len(result["results_per_query"]) == 10 + assert all(r == 5 for r in result["results_per_query"]) + + @patch('time.time') + def test_throughput_measurement(self, mock_time, mock_collection): + """Test measuring query throughput.""" + # Simulate time progression + time_counter = [0] + def time_side_effect(): + time_counter[0] += 0.001 # 1ms per call + return time_counter[0] + + mock_time.side_effect = time_side_effect + mock_collection.search.return_value = [[Mock(id=1, distance=0.1)]] + + class ThroughputBenchmark: + def __init__(self): + self.results = [] + + def run(self, collection, queries, duration=10): + """Run throughput benchmark for specified duration.""" + start_time = time.time() + end_time = start_time + duration + query_count = 0 + latencies = [] + + query_idx = 0 + while time.time() < end_time: + query_start = time.time() + + # Execute query + collection.search( + data=[queries[query_idx % len(queries)]], + 
anns_field="embedding", + param={"metric_type": "L2"}, + limit=10 + ) + + query_end = time.time() + latencies.append(query_end - query_start) + query_count += 1 + query_idx += 1 + + # Break if we've done enough queries for the test + if query_count >= 100: # Limit for testing + break + + actual_duration = time.time() - start_time + + return { + "total_queries": query_count, + "duration": actual_duration, + "qps": query_count / actual_duration if actual_duration > 0 else 0, + "avg_latency": statistics.mean(latencies) if latencies else 0, + "p50_latency": statistics.median(latencies) if latencies else 0, + "p95_latency": self._percentile(latencies, 95) if latencies else 0, + "p99_latency": self._percentile(latencies, 99) if latencies else 0 + } + + def _percentile(self, data, percentile): + """Calculate percentile of data.""" + size = len(data) + if size == 0: + return 0 + sorted_data = sorted(data) + index = int(size * percentile / 100) + return sorted_data[min(index, size - 1)] + + benchmark = ThroughputBenchmark() + queries = np.random.randn(10, 128).astype(np.float32) + + result = benchmark.run(mock_collection, queries, duration=1) + + assert result["total_queries"] > 0 + assert result["qps"] > 0 + assert result["avg_latency"] > 0 + + def test_concurrent_query_execution(self, mock_collection): + """Test concurrent query execution with multiple threads.""" + query_counter = {'count': 0} + + def mock_search(data, **kwargs): + query_counter['count'] += 1 + time.sleep(0.01) # Simulate query time + return [[Mock(id=i, distance=0.1*i) for i in range(5)]] + + mock_collection.search = mock_search + + class ConcurrentBenchmark: + def __init__(self, num_threads=4): + self.num_threads = num_threads + + def worker(self, args): + """Worker function for concurrent execution.""" + collection, queries, worker_id = args + results = [] + + for i, query in enumerate(queries): + start = time.time() + result = collection.search( + data=[query], + anns_field="embedding", + 
param={"metric_type": "L2"}, + limit=10 + ) + latency = time.time() - start + results.append({ + "worker_id": worker_id, + "query_id": i, + "latency": latency + }) + + return results + + def run(self, collection, queries): + """Run concurrent benchmark.""" + # Split queries among workers + queries_per_worker = len(queries) // self.num_threads + worker_args = [] + + for i in range(self.num_threads): + start_idx = i * queries_per_worker + end_idx = start_idx + queries_per_worker if i < self.num_threads - 1 else len(queries) + worker_queries = queries[start_idx:end_idx] + worker_args.append((collection, worker_queries, i)) + + start_time = time.time() + + with ThreadPoolExecutor(max_workers=self.num_threads) as executor: + results = list(executor.map(self.worker, worker_args)) + + end_time = time.time() + + # Flatten results + all_results = [] + for worker_results in results: + all_results.extend(worker_results) + + total_duration = end_time - start_time + latencies = [r["latency"] for r in all_results] + + return { + "num_threads": self.num_threads, + "total_queries": len(all_results), + "duration": total_duration, + "qps": len(all_results) / total_duration if total_duration > 0 else 0, + "avg_latency": statistics.mean(latencies) if latencies else 0, + "min_latency": min(latencies) if latencies else 0, + "max_latency": max(latencies) if latencies else 0 + } + + benchmark = ConcurrentBenchmark(num_threads=4) + queries = np.random.randn(100, 128).astype(np.float32) + + result = benchmark.run(mock_collection, queries) + + assert result["total_queries"] == 100 + assert result["num_threads"] == 4 + assert result["qps"] > 0 + assert query_counter['count'] == 100 + + +class TestBenchmarkMetrics: + """Test benchmark metric collection and analysis.""" + + def test_latency_distribution(self): + """Test calculating latency distribution metrics.""" + class LatencyAnalyzer: + def __init__(self): + self.latencies = [] + + def add_latency(self, latency): + """Add a latency 
measurement.""" + self.latencies.append(latency) + + def get_distribution(self): + """Calculate latency distribution statistics.""" + if not self.latencies: + return {} + + sorted_latencies = sorted(self.latencies) + + return { + "count": len(self.latencies), + "mean": statistics.mean(self.latencies), + "median": statistics.median(self.latencies), + "stdev": statistics.stdev(self.latencies) if len(self.latencies) > 1 else 0, + "min": min(self.latencies), + "max": max(self.latencies), + "p50": self._percentile(sorted_latencies, 50), + "p90": self._percentile(sorted_latencies, 90), + "p95": self._percentile(sorted_latencies, 95), + "p99": self._percentile(sorted_latencies, 99), + "p999": self._percentile(sorted_latencies, 99.9) + } + + def _percentile(self, sorted_data, percentile): + """Calculate percentile from sorted data.""" + index = len(sorted_data) * percentile / 100 + lower = int(index) + upper = lower + 1 + + if upper >= len(sorted_data): + return sorted_data[-1] + + weight = index - lower + return sorted_data[lower] * (1 - weight) + sorted_data[upper] * weight + + analyzer = LatencyAnalyzer() + + # Add sample latencies (in milliseconds) + np.random.seed(42) + latencies = np.random.exponential(10, 1000) # Exponential distribution + for latency in latencies: + analyzer.add_latency(latency) + + dist = analyzer.get_distribution() + + assert dist["count"] == 1000 + assert dist["p50"] < dist["p90"] + assert dist["p90"] < dist["p95"] + assert dist["p95"] < dist["p99"] + assert dist["min"] < dist["mean"] < dist["max"] + + def test_recall_metric(self): + """Test calculating recall metrics for search results.""" + class RecallCalculator: + def __init__(self, ground_truth): + self.ground_truth = ground_truth + + def calculate_recall(self, query_id, retrieved_ids, k): + """Calculate recall@k for a query.""" + if query_id not in self.ground_truth: + return None + + true_ids = set(self.ground_truth[query_id][:k]) + retrieved_ids_set = set(retrieved_ids[:k]) + + 
intersection = true_ids.intersection(retrieved_ids_set) + recall = len(intersection) / len(true_ids) if true_ids else 0 + + return recall + + def calculate_average_recall(self, results, k): + """Calculate average recall@k across multiple queries.""" + recalls = [] + + for query_id, retrieved_ids in results.items(): + recall = self.calculate_recall(query_id, retrieved_ids, k) + if recall is not None: + recalls.append(recall) + + return statistics.mean(recalls) if recalls else 0 + + # Mock ground truth data + ground_truth = { + 0: [1, 2, 3, 4, 5], + 1: [6, 7, 8, 9, 10], + 2: [11, 12, 13, 14, 15] + } + + calculator = RecallCalculator(ground_truth) + + # Test perfect recall + perfect_results = { + 0: [1, 2, 3, 4, 5], + 1: [6, 7, 8, 9, 10], + 2: [11, 12, 13, 14, 15] + } + + avg_recall = calculator.calculate_average_recall(perfect_results, k=5) + assert avg_recall == 1.0 + + # Test partial recall + partial_results = { + 0: [1, 2, 3, 16, 17], # 3/5 correct + 1: [6, 7, 18, 19, 20], # 2/5 correct + 2: [11, 12, 13, 14, 21] # 4/5 correct + } + + avg_recall = calculator.calculate_average_recall(partial_results, k=5) + assert 0.5 < avg_recall < 0.7 # Should be (3+2+4)/15 = 0.6 + + def test_benchmark_summary_generation(self): + """Test generating comprehensive benchmark summary.""" + class BenchmarkSummary: + def __init__(self): + self.metrics = { + "latencies": [], + "throughputs": [], + "errors": 0, + "total_queries": 0 + } + self.start_time = None + self.end_time = None + + def start(self): + """Start benchmark timing.""" + self.start_time = time.time() + + def end(self): + """End benchmark timing.""" + self.end_time = time.time() + + def add_query_result(self, latency, success=True): + """Add a query result.""" + self.metrics["total_queries"] += 1 + + if success: + self.metrics["latencies"].append(latency) + else: + self.metrics["errors"] += 1 + + def add_throughput_sample(self, qps): + """Add a throughput sample.""" + self.metrics["throughputs"].append(qps) + + def 
generate_summary(self): + """Generate comprehensive benchmark summary.""" + if not self.start_time or not self.end_time: + return None + + duration = self.end_time - self.start_time + latencies = self.metrics["latencies"] + + summary = { + "duration": duration, + "total_queries": self.metrics["total_queries"], + "successful_queries": len(latencies), + "failed_queries": self.metrics["errors"], + "error_rate": self.metrics["errors"] / self.metrics["total_queries"] + if self.metrics["total_queries"] > 0 else 0 + } + + if latencies: + summary.update({ + "latency_mean": statistics.mean(latencies), + "latency_median": statistics.median(latencies), + "latency_min": min(latencies), + "latency_max": max(latencies), + "latency_p95": sorted(latencies)[int(len(latencies) * 0.95)], + "latency_p99": sorted(latencies)[int(len(latencies) * 0.99)] + }) + + if self.metrics["throughputs"]: + summary.update({ + "throughput_mean": statistics.mean(self.metrics["throughputs"]), + "throughput_max": max(self.metrics["throughputs"]), + "throughput_min": min(self.metrics["throughputs"]) + }) + + # Overall QPS + summary["overall_qps"] = self.metrics["total_queries"] / duration if duration > 0 else 0 + + return summary + + summary = BenchmarkSummary() + summary.start() + + # Simulate query results + np.random.seed(42) + for i in range(1000): + latency = np.random.exponential(10) # 10ms average + success = np.random.random() > 0.01 # 99% success rate + summary.add_query_result(latency, success) + + # Add throughput samples + for i in range(10): + summary.add_throughput_sample(100 + np.random.normal(0, 10)) + + time.sleep(0.1) # Simulate benchmark duration + summary.end() + + result = summary.generate_summary() + + assert result["total_queries"] == 1000 + assert result["error_rate"] < 0.02 # Should be around 1% + assert result["latency_p99"] > result["latency_p95"] + assert result["latency_p95"] > result["latency_median"] + + +class TestBenchmarkConfiguration: + """Test benchmark configuration 
and parameter tuning.""" + + def test_search_parameter_tuning(self): + """Test tuning search parameters for optimal performance.""" + class SearchParameterTuner: + def __init__(self, collection): + self.collection = collection + self.results = [] + + def test_parameters(self, params, test_queries): + """Test a set of search parameters.""" + latencies = [] + + for query in test_queries: + start = time.time() + self.collection.search( + data=[query], + anns_field="embedding", + param=params, + limit=10 + ) + latencies.append(time.time() - start) + + return { + "params": params, + "avg_latency": statistics.mean(latencies), + "p95_latency": sorted(latencies)[int(len(latencies) * 0.95)] + } + + def tune(self, parameter_sets, test_queries): + """Find optimal parameters.""" + for params in parameter_sets: + result = self.test_parameters(params, test_queries) + self.results.append(result) + + # Find best parameters based on latency + best = min(self.results, key=lambda x: x["avg_latency"]) + return best + + mock_collection = Mock() + mock_collection.search.return_value = [[Mock(id=1, distance=0.1)]] + + tuner = SearchParameterTuner(mock_collection) + + # Define parameter sets to test + parameter_sets = [ + {"metric_type": "L2", "params": {"nprobe": 10}}, + {"metric_type": "L2", "params": {"nprobe": 20}}, + {"metric_type": "L2", "params": {"nprobe": 50}}, + ] + + test_queries = np.random.randn(10, 128).astype(np.float32) + + best_params = tuner.tune(parameter_sets, test_queries) + + assert best_params is not None + assert "params" in best_params + assert "avg_latency" in best_params + + def test_workload_generation(self): + """Test generating different query workloads.""" + class WorkloadGenerator: + def __init__(self, dimension, seed=None): + self.dimension = dimension + if seed: + np.random.seed(seed) + + def generate_uniform(self, num_queries): + """Generate uniformly distributed queries.""" + return np.random.uniform(-1, 1, (num_queries, 
self.dimension)).astype(np.float32) + + def generate_gaussian(self, num_queries, centers=1): + """Generate queries from Gaussian distributions.""" + if centers == 1: + return np.random.randn(num_queries, self.dimension).astype(np.float32) + + # Multiple centers + queries_per_center = num_queries // centers + remainder = num_queries % centers + queries = [] + + for i in range(centers): + center = np.random.randn(self.dimension) * 10 + # Add extra query to first clusters if there's a remainder + extra = 1 if i < remainder else 0 + cluster = np.random.randn(queries_per_center + extra, self.dimension) + center + queries.append(cluster) + + return np.vstack(queries).astype(np.float32) + + def generate_skewed(self, num_queries, hot_ratio=0.2): + """Generate skewed workload with hot and cold queries.""" + num_hot = int(num_queries * hot_ratio) + num_cold = num_queries - num_hot + + # Hot queries - concentrated around a few points + hot_queries = np.random.randn(num_hot, self.dimension) * 0.1 + + # Cold queries - widely distributed + cold_queries = np.random.randn(num_cold, self.dimension) * 10 + + # Mix them + all_queries = np.vstack([hot_queries, cold_queries]) + np.random.shuffle(all_queries) + + return all_queries.astype(np.float32) + + def generate_temporal(self, num_queries, drift_rate=0.01): + """Generate queries with temporal drift.""" + queries = [] + current_center = np.zeros(self.dimension) + + for i in range(num_queries): + # Drift the center + current_center += np.random.randn(self.dimension) * drift_rate + + # Generate query around current center + query = current_center + np.random.randn(self.dimension) + queries.append(query) + + return np.array(queries).astype(np.float32) + + generator = WorkloadGenerator(dimension=128, seed=42) + + # Test uniform workload + uniform = generator.generate_uniform(100) + assert uniform.shape == (100, 128) + assert uniform.min() >= -1.1 # Small tolerance + assert uniform.max() <= 1.1 + + # Test Gaussian workload + gaussian = 
generator.generate_gaussian(100, centers=3) + assert gaussian.shape == (100, 128) + + # Test skewed workload + skewed = generator.generate_skewed(100, hot_ratio=0.2) + assert skewed.shape == (100, 128) + + # Test temporal workload + temporal = generator.generate_temporal(100, drift_rate=0.01) + assert temporal.shape == (100, 128) + + +class TestBenchmarkOutput: + """Test benchmark result output and reporting.""" + + def test_json_output_format(self, test_data_dir): + """Test outputting benchmark results in JSON format.""" + results = { + "timestamp": "2024-01-01T12:00:00", + "configuration": { + "collection": "test_collection", + "dimension": 1536, + "index_type": "DISKANN", + "num_processes": 4, + "batch_size": 100 + }, + "metrics": { + "total_queries": 10000, + "duration": 60.5, + "qps": 165.29, + "latency_p50": 5.2, + "latency_p95": 12.8, + "latency_p99": 18.3, + "error_rate": 0.001 + }, + "system_info": { + "cpu_count": 8, + "memory_gb": 32, + "platform": "Linux" + } + } + + output_file = test_data_dir / "benchmark_results.json" + + # Save results + with open(output_file, 'w') as f: + json.dump(results, f, indent=2) + + # Verify saved file + with open(output_file, 'r') as f: + loaded = json.load(f) + + assert loaded["metrics"]["qps"] == 165.29 + assert loaded["configuration"]["index_type"] == "DISKANN" + + def test_csv_output_format(self, test_data_dir): + """Test outputting benchmark results in CSV format.""" + import csv + + results = [ + {"timestamp": "2024-01-01T12:00:00", "qps": 150.5, "latency_p95": 12.3}, + {"timestamp": "2024-01-01T12:01:00", "qps": 155.2, "latency_p95": 11.8}, + {"timestamp": "2024-01-01T12:02:00", "qps": 148.9, "latency_p95": 12.7} + ] + + output_file = test_data_dir / "benchmark_results.csv" + + # Save results + with open(output_file, 'w', newline='') as f: + writer = csv.DictWriter(f, fieldnames=["timestamp", "qps", "latency_p95"]) + writer.writeheader() + writer.writerows(results) + + # Verify saved file + with open(output_file, 
'r') as f: + reader = csv.DictReader(f) + loaded = list(reader) + + assert len(loaded) == 3 + assert float(loaded[0]["qps"]) == 150.5 + + def test_comparison_report_generation(self): + """Test generating comparison reports between benchmarks.""" + class ComparisonReport: + def __init__(self): + self.benchmarks = {} + + def add_benchmark(self, name, results): + """Add benchmark results.""" + self.benchmarks[name] = results + + def generate_comparison(self): + """Generate comparison report.""" + if len(self.benchmarks) < 2: + return None + + comparison = { + "benchmarks": [], + "best_qps": None, + "best_latency": None + } + + best_qps = 0 + best_latency = float('inf') + + for name, results in self.benchmarks.items(): + benchmark_summary = { + "name": name, + "qps": results.get("qps", 0), + "latency_p95": results.get("latency_p95", 0), + "latency_p99": results.get("latency_p99", 0), + "error_rate": results.get("error_rate", 0) + } + + comparison["benchmarks"].append(benchmark_summary) + + if benchmark_summary["qps"] > best_qps: + best_qps = benchmark_summary["qps"] + comparison["best_qps"] = name + + if benchmark_summary["latency_p95"] < best_latency: + best_latency = benchmark_summary["latency_p95"] + comparison["best_latency"] = name + + # Calculate improvements + if len(self.benchmarks) == 2: + names = list(self.benchmarks.keys()) + baseline = self.benchmarks[names[0]] + comparison_bench = self.benchmarks[names[1]] + + comparison["qps_improvement"] = ( + (comparison_bench["qps"] - baseline["qps"]) / baseline["qps"] * 100 + if baseline.get("qps", 0) > 0 else 0 + ) + + comparison["latency_improvement"] = ( + (baseline["latency_p95"] - comparison_bench["latency_p95"]) / baseline["latency_p95"] * 100 + if baseline.get("latency_p95", 0) > 0 else 0 + ) + + return comparison + + report = ComparisonReport() + + # Add benchmark results + report.add_benchmark("DISKANN", { + "qps": 1500, + "latency_p95": 10.5, + "latency_p99": 15.2, + "error_rate": 0.001 + }) + + 
report.add_benchmark("HNSW", { + "qps": 1200, + "latency_p95": 8.3, + "latency_p99": 12.1, + "error_rate": 0.002 + }) + + comparison = report.generate_comparison() + + assert comparison["best_qps"] == "DISKANN" + assert comparison["best_latency"] == "HNSW" + assert len(comparison["benchmarks"]) == 2 + assert comparison["qps_improvement"] == -20.0 # HNSW is 20% slower diff --git a/vdb_benchmark/tests/tests/test_vector_generation.py b/vdb_benchmark/tests/tests/test_vector_generation.py new file mode 100755 index 00000000..22cf2be9 --- /dev/null +++ b/vdb_benchmark/tests/tests/test_vector_generation.py @@ -0,0 +1,369 @@ +""" +Unit tests for vector generation utilities +""" +import pytest +import numpy as np +from unittest.mock import Mock, patch +import h5py +import tempfile +from pathlib import Path + + +class TestVectorGenerationUtilities: + """Test vector generation utility functions.""" + + def test_vector_normalization(self): + """Test different vector normalization methods.""" + class VectorNormalizer: + @staticmethod + def l2_normalize(vectors): + """L2 normalization.""" + norms = np.linalg.norm(vectors, axis=1, keepdims=True) + return vectors / (norms + 1e-10) # Add epsilon to avoid division by zero + + @staticmethod + def l1_normalize(vectors): + """L1 normalization.""" + norms = np.sum(np.abs(vectors), axis=1, keepdims=True) + return vectors / (norms + 1e-10) + + @staticmethod + def max_normalize(vectors): + """Max normalization (scale by maximum absolute value).""" + max_vals = np.max(np.abs(vectors), axis=1, keepdims=True) + return vectors / (max_vals + 1e-10) + + @staticmethod + def standardize(vectors): + """Standardization (zero mean, unit variance).""" + mean = np.mean(vectors, axis=0, keepdims=True) + std = np.std(vectors, axis=0, keepdims=True) + return (vectors - mean) / (std + 1e-10) + + # Test data + vectors = np.random.randn(100, 128).astype(np.float32) + + # Test L2 normalization + l2_norm = VectorNormalizer.l2_normalize(vectors) + norms = 
np.linalg.norm(l2_norm, axis=1) + np.testing.assert_array_almost_equal(norms, np.ones(100), decimal=5) + + # Test L1 normalization + l1_norm = VectorNormalizer.l1_normalize(vectors) + l1_sums = np.sum(np.abs(l1_norm), axis=1) + np.testing.assert_array_almost_equal(l1_sums, np.ones(100), decimal=5) + + # Test max normalization + max_norm = VectorNormalizer.max_normalize(vectors) + max_vals = np.max(np.abs(max_norm), axis=1) + np.testing.assert_array_almost_equal(max_vals, np.ones(100), decimal=5) + + # Test standardization + standardized = VectorNormalizer.standardize(vectors) + assert abs(np.mean(standardized)) < 0.01 # Mean should be close to 0 + assert abs(np.std(standardized) - 1.0) < 0.1 # Std should be close to 1 + + def test_vector_quantization(self): + """Test vector quantization methods.""" + class VectorQuantizer: + @staticmethod + def scalar_quantize(vectors, bits=8): + """Scalar quantization to specified bit depth.""" + min_val = np.min(vectors) + max_val = np.max(vectors) + + # Scale to [0, 2^bits - 1] + scale = (2 ** bits - 1) / (max_val - min_val) + quantized = np.round((vectors - min_val) * scale).astype(np.uint8 if bits == 8 else np.uint16) + + return quantized, (min_val, max_val, scale) + + @staticmethod + def dequantize(quantized, params): + """Dequantize vectors.""" + min_val, max_val, scale = params + return quantized.astype(np.float32) / scale + min_val + + @staticmethod + def product_quantize(vectors, num_subvectors=8, codebook_size=256): + """Simple product quantization simulation.""" + dimension = vectors.shape[1] + subvector_dim = dimension // num_subvectors + + codes = [] + codebooks = [] + + for i in range(num_subvectors): + start = i * subvector_dim + end = start + subvector_dim + subvectors = vectors[:, start:end] + + # Simulate codebook (in reality would use k-means) + codebook = np.random.randn(codebook_size, subvector_dim).astype(np.float32) + codebooks.append(codebook) + + # Assign codes (find nearest codebook entry) + # Simplified 
- just random assignment for testing + subvector_codes = np.random.randint(0, codebook_size, len(vectors)) + codes.append(subvector_codes) + + return np.array(codes).T, codebooks + + vectors = np.random.randn(100, 128).astype(np.float32) + + # Test scalar quantization + quantizer = VectorQuantizer() + quantized, params = quantizer.scalar_quantize(vectors, bits=8) + + assert quantized.dtype == np.uint8 + assert quantized.shape == vectors.shape + + # Test reconstruction + reconstructed = quantizer.dequantize(quantized, params) + assert reconstructed.shape == vectors.shape + + # Test product quantization + pq_codes, codebooks = quantizer.product_quantize(vectors, num_subvectors=8) + + assert pq_codes.shape == (100, 8) # 100 vectors, 8 subvectors + assert len(codebooks) == 8 + + def test_synthetic_dataset_generation(self): + """Test generating synthetic datasets with specific properties.""" + class SyntheticDataGenerator: + @staticmethod + def generate_clustered(num_vectors, dimension, num_clusters=10, cluster_std=0.1): + """Generate clustered vectors.""" + vectors_per_cluster = num_vectors // num_clusters + vectors = [] + labels = [] + + # Generate cluster centers + centers = np.random.randn(num_clusters, dimension) * 10 + + for i in range(num_clusters): + # Generate vectors around center + cluster_vectors = centers[i] + np.random.randn(vectors_per_cluster, dimension) * cluster_std + vectors.append(cluster_vectors) + labels.extend([i] * vectors_per_cluster) + + # Handle remaining vectors + remaining = num_vectors - (vectors_per_cluster * num_clusters) + if remaining > 0: + cluster_idx = np.random.randint(0, num_clusters) + extra_vectors = centers[cluster_idx] + np.random.randn(remaining, dimension) * cluster_std + vectors.append(extra_vectors) + labels.extend([cluster_idx] * remaining) + + return np.vstack(vectors).astype(np.float32), np.array(labels) + + @staticmethod + def generate_sparse(num_vectors, dimension, sparsity=0.9): + """Generate sparse vectors.""" + 
def test_vector_io_operations(self, test_data_dir):
    """Test saving and loading vectors in different formats."""

    class VectorIO:
        @staticmethod
        def save_npy(vectors, filepath):
            """Save vectors as NPY file."""
            np.save(filepath, vectors)

        @staticmethod
        def load_npy(filepath):
            """Load vectors from NPY file."""
            return np.load(filepath)

        @staticmethod
        def save_hdf5(vectors, filepath, dataset_name="vectors"):
            """Save vectors as a gzip-compressed HDF5 dataset."""
            with h5py.File(filepath, 'w') as f:
                f.create_dataset(dataset_name, data=vectors, compression="gzip")

        @staticmethod
        def load_hdf5(filepath, dataset_name="vectors"):
            """Load vectors from HDF5 file."""
            with h5py.File(filepath, 'r') as f:
                return f[dataset_name][:]

        @staticmethod
        def save_binary(vectors, filepath):
            """Save vectors as a raw binary dump."""
            vectors.tofile(filepath)

        @staticmethod
        def load_binary(filepath, dtype=np.float32, shape=None):
            """Load a raw binary dump, optionally reshaping it."""
            data = np.fromfile(filepath, dtype=dtype)
            return data.reshape(shape) if shape else data

        @staticmethod
        def save_text(vectors, filepath):
            """Save vectors as text with six decimal places."""
            np.savetxt(filepath, vectors, fmt='%.6f')

        @staticmethod
        def load_text(filepath):
            """Load vectors from text file."""
            return np.loadtxt(filepath, dtype=np.float32)

    io_handler = VectorIO()
    vectors = np.random.randn(100, 128).astype(np.float32)

    # Round-trip each format and confirm the data survives unchanged.
    npy_path = test_data_dir / "vectors.npy"
    io_handler.save_npy(vectors, npy_path)
    np.testing.assert_array_almost_equal(vectors, io_handler.load_npy(npy_path))

    hdf5_path = test_data_dir / "vectors.h5"
    io_handler.save_hdf5(vectors, hdf5_path)
    np.testing.assert_array_almost_equal(vectors, io_handler.load_hdf5(hdf5_path))

    bin_path = test_data_dir / "vectors.bin"
    io_handler.save_binary(vectors, bin_path)
    np.testing.assert_array_almost_equal(
        vectors, io_handler.load_binary(bin_path, shape=(100, 128)))

    # Text round-trips lose precision, so use a small slice and a tolerance.
    small_vectors = vectors[:10]
    txt_path = test_data_dir / "vectors.txt"
    io_handler.save_text(small_vectors, txt_path)
    np.testing.assert_array_almost_equal(
        small_vectors, io_handler.load_text(txt_path), decimal=5)
"""Test DiskANN index parameter validation.""" + class DiskANNConfig: + VALID_METRICS = ["L2", "IP", "COSINE"] + + @staticmethod + def validate_params(params): + """Validate DiskANN parameters.""" + errors = [] + + # Check metric type + if params.get("metric_type") not in DiskANNConfig.VALID_METRICS: + errors.append(f"Invalid metric_type: {params.get('metric_type')}") + + # Check max_degree + max_degree = params.get("max_degree", 64) + if not (1 <= max_degree <= 128): + errors.append(f"max_degree must be between 1 and 128, got {max_degree}") + + # Check search_list_size + search_list = params.get("search_list_size", 200) + if not (100 <= search_list <= 1000): + errors.append(f"search_list_size must be between 100 and 1000, got {search_list}") + + # Check PQ parameters if present + if "pq_code_budget_gb" in params: + budget = params["pq_code_budget_gb"] + if budget <= 0: + errors.append(f"pq_code_budget_gb must be positive, got {budget}") + + return len(errors) == 0, errors + + @staticmethod + def get_default_params(num_vectors, dimension): + """Get default parameters based on dataset size.""" + if num_vectors < 1000000: + return { + "metric_type": "L2", + "max_degree": 32, + "search_list_size": 100 + } + elif num_vectors < 10000000: + return { + "metric_type": "L2", + "max_degree": 64, + "search_list_size": 200 + } + else: + return { + "metric_type": "L2", + "max_degree": 64, + "search_list_size": 300, + "pq_code_budget_gb": 0.2 + } + + # Test valid parameters + valid_params = { + "metric_type": "L2", + "max_degree": 64, + "search_list_size": 200 + } + + is_valid, errors = DiskANNConfig.validate_params(valid_params) + assert is_valid is True + assert len(errors) == 0 + + # Test invalid parameters + invalid_params = { + "metric_type": "INVALID", + "max_degree": 200, + "search_list_size": 50 + } + + is_valid, errors = DiskANNConfig.validate_params(invalid_params) + assert is_valid is False + assert len(errors) == 3 + + # Test default parameter generation + 
#!/usr/bin/env python3
"""
Test Suite Verification Script
Verifies that all test fixes have been applied correctly
"""
import subprocess
import sys
import json
from pathlib import Path


def run_single_test(test_path):
    """Run one pytest node in a subprocess; return (passed, stdout, stderr)."""
    proc = subprocess.run(
        [sys.executable, "-m", "pytest", test_path, "-v", "--tb=short"],
        capture_output=True,
        text=True,
    )
    return proc.returncode == 0, proc.stdout, proc.stderr


def main():
    """Re-run every previously failing test and summarize the outcome.

    Returns 0 when all of them pass, 1 otherwise (suitable for sys.exit).
    """
    # List of previously failing tests
    failing_tests = [
        "tests/test_compact_and_watch.py::TestMonitoring::test_collection_stats_monitoring",
        "tests/test_config.py::TestConfigurationLoader::test_config_environment_variable_override",
        "tests/test_database_connection.py::TestConnectionResilience::test_automatic_reconnection",
        "tests/test_index_management.py::TestIndexManagement::test_index_status_check",
        "tests/test_load_vdb.py::TestVectorLoading::test_insertion_with_error_handling",
        "tests/test_load_vdb.py::TestVectorLoading::test_insertion_rate_monitoring",
        "tests/test_simple_bench.py::TestBenchmarkConfiguration::test_workload_generation"
    ]

    print("=" * 60)
    print("VDB-Bench Test Suite - Verification of Fixes")
    print("=" * 60)
    print()

    results = []
    for test in failing_tests:
        print(f"Testing: {test}")
        passed, stdout, stderr = run_single_test(test)

        # Keep full pytest output only for failures, to aid debugging.
        results.append({
            "test": test,
            "passed": passed,
            "output": stdout if not passed else ""
        })

        if passed:
            print("  ✅ PASSED")
        else:
            print("  ❌ FAILED")
            print(f"  Error: {stderr[:200]}")
        print()

    print("=" * 60)
    print("Summary")
    print("=" * 60)

    passed_count = sum(1 for r in results if r["passed"])
    failed_count = len(results) - passed_count

    print(f"Total Tests: {len(results)}")
    print(f"Passed: {passed_count}")
    print(f"Failed: {failed_count}")

    if failed_count == 0:
        print("\n✅ All previously failing tests now pass!")
        return 0

    print("\n❌ Some tests are still failing. Please review the fixes.")
    for result in results:
        if not result["passed"]:
            print(f"  - {result['test']}")
    return 1


if __name__ == "__main__":
    sys.exit(main())
class MockDataGenerator:
    """Generate various types of mock data for testing."""

    def __init__(self, seed: Optional[int] = None):
        """Initialize, optionally seeding both RNGs for reproducibility."""
        if seed is not None:
            random.seed(seed)
            np.random.seed(seed)

    @staticmethod
    def generate_sift_like_vectors(num_vectors: int, dimension: int = 128) -> np.ndarray:
        """Generate SIFT-like vectors (similar to common benchmark datasets)."""
        data = np.random.randn(num_vectors, dimension).astype(np.float32)

        # Give a random quarter of the dimensions extra weight so the
        # distribution is not perfectly isotropic.
        boosted_dims = random.sample(range(dimension), k=dimension // 4)
        data[:, boosted_dims] *= 3

        # L2-normalize (epsilon guards against a zero vector), then scale
        # into the value range typical of SIFT descriptors.
        norms = np.linalg.norm(data, axis=1, keepdims=True)
        data = data / (norms + 1e-10)
        data = data * 512

        return data.astype(np.float32)

    @staticmethod
    def generate_deep_learning_embeddings(num_vectors: int,
                                          dimension: int = 768,
                                          model_type: str = "bert") -> np.ndarray:
        """Generate embeddings with the rough statistics of common DL models."""
        draws = np.random.randn(num_vectors, dimension).astype(np.float32)

        if model_type == "bert":
            # BERT-style values roughly within [-2, 2].
            return np.clip(draws * 0.5, -2, 2)
        if model_type == "resnet":
            # ReLU-style sparsity followed by L2 normalization.
            draws[draws < 0] = 0
            norms = np.linalg.norm(draws, axis=1, keepdims=True)
            return draws / (norms + 1e-10)
        if model_type == "clip":
            # Unit-sphere normalized, as CLIP produces.
            norms = np.linalg.norm(draws, axis=1, keepdims=True)
            return draws / (norms + 1e-10)
        # Generic embeddings: plain Gaussian.
        return draws

    @staticmethod
    def generate_time_series_vectors(num_vectors: int,
                                     dimension: int = 100,
                                     num_series: int = 10) -> Tuple[np.ndarray, List[int]]:
        """Generate time series data as vectors with per-series labels."""
        vectors: List[np.ndarray] = []
        labels: List[int] = []

        for series_id in range(num_series):
            # Sinusoidal base pattern plus a little noise, unique per series.
            base = np.sin(np.linspace(0, 4 * np.pi, dimension))
            base += np.random.randn(dimension) * 0.1

            for _ in range(num_vectors // num_series):
                # Temporal jitter plus a small global shift.
                sample = base + np.random.randn(dimension) * 0.3
                sample += np.random.randn() * 0.1
                vectors.append(sample)
                labels.append(series_id)

        # Pad out any remainder by drifting from the last generated vector.
        while len(vectors) < num_vectors:
            vectors.append(vectors[-1] + np.random.randn(dimension) * 0.1)
            labels.append(labels[-1])

        return np.array(vectors).astype(np.float32), labels

    @staticmethod
    def generate_categorical_embeddings(num_vectors: int,
                                        num_categories: int = 100,
                                        dimension: int = 64) -> Tuple[np.ndarray, List[str]]:
        """Sample category prototypes, then emit noisy copies of them."""
        prototypes = np.random.randn(num_categories, dimension).astype(np.float32)
        norms = np.linalg.norm(prototypes, axis=1, keepdims=True)
        prototypes = prototypes / (norms + 1e-10)

        vectors = []
        categories = []
        for _ in range(num_vectors):
            idx = random.randint(0, num_categories - 1)
            # Small jitter keeps same-category vectors near each other.
            vectors.append(prototypes[idx] + np.random.randn(dimension) * 0.05)
            categories.append(f"category_{idx}")

        return np.array(vectors).astype(np.float32), categories

    @staticmethod
    def generate_multimodal_vectors(num_vectors: int,
                                    text_dim: int = 768,
                                    image_dim: int = 2048) -> Dict[str, np.ndarray]:
        """Generate paired text/image embeddings plus a fused projection."""
        # Text: BERT-like, clipped to [-2, 2].
        text_vectors = np.random.randn(num_vectors, text_dim).astype(np.float32)
        text_vectors = np.clip(text_vectors * 0.5, -2, 2)

        # Image: ReLU sparsity then L2 normalization, ResNet-like.
        image_vectors = np.random.randn(num_vectors, image_dim).astype(np.float32)
        image_vectors[image_vectors < 0] = 0
        norms = np.linalg.norm(image_vectors, axis=1, keepdims=True)
        image_vectors = image_vectors / (norms + 1e-10)

        # Fuse by concatenating and projecting down to 512 dimensions,
        # with Xavier-style scaling on the random projection.
        combined_dim = 512
        projection = np.random.randn(text_dim + image_dim, combined_dim).astype(np.float32)
        projection /= np.sqrt(text_dim + image_dim)

        combined = np.dot(np.hstack([text_vectors, image_vectors]), projection)
        norms = np.linalg.norm(combined, axis=1, keepdims=True)
        combined = combined / (norms + 1e-10)

        return {
            "text": text_vectors,
            "image": image_vectors,
            "combined": combined
        }
class BenchmarkDatasetGenerator:
    """Generate datasets similar to common benchmarks."""

    @staticmethod
    def generate_ann_benchmark_dataset(dataset_type: str = "random",
                                       num_train: int = 100000,
                                       num_test: int = 10000,
                                       dimension: int = 128,
                                       num_neighbors: int = 100) -> Dict[str, Any]:
        """Build a train/test/ground-truth bundle in ANN-Benchmarks layout.

        NOTE: neighbor ids and distances are random placeholders, not real
        nearest neighbors; distances are sorted only to look plausible.
        """
        if dataset_type == "random":
            train_vectors = np.random.randn(num_train, dimension).astype(np.float32)
            test_vectors = np.random.randn(num_test, dimension).astype(np.float32)

        elif dataset_type == "clustered":
            # 100 Gaussian blobs around well-separated random centers.
            num_clusters = 100
            per_cluster = num_train // num_clusters
            blobs = []
            for _ in range(num_clusters):
                center = np.random.randn(dimension) * 10
                blobs.append(center + np.random.randn(per_cluster, dimension))
            train_vectors = np.vstack(blobs).astype(np.float32)

            # Test vectors from the same kind of distribution.
            test_per_cluster = num_test // num_clusters
            blobs = []
            for _ in range(num_clusters):
                center = np.random.randn(dimension) * 10
                blobs.append(center + np.random.randn(test_per_cluster, dimension))
            test_vectors = np.vstack(blobs).astype(np.float32)

        else:
            raise ValueError(f"Unknown dataset type: {dataset_type}")

        # Placeholder ground truth with ascending distances.
        ground_truth = np.random.randint(0, num_train,
                                         (num_test, num_neighbors))
        distances = np.random.random((num_test, num_neighbors)).astype(np.float32)
        distances.sort(axis=1)

        return {
            "train": train_vectors,
            "test": test_vectors,
            "neighbors": ground_truth,
            "distances": distances,
            "dimension": dimension,
            "metric": "euclidean"
        }

    @staticmethod
    def generate_streaming_dataset(initial_size: int = 10000,
                                   dimension: int = 128,
                                   stream_rate: int = 100,
                                   drift_rate: float = 0.01) -> Dict[str, Any]:
        """Build an initial batch plus ten drifting stream batches."""
        initial_vectors = np.random.randn(initial_size, dimension).astype(np.float32)

        stream_batches = []
        current_center = np.zeros(dimension)
        for _ in range(10):  # 10 batches
            # Random-walk the distribution center to simulate concept drift.
            current_center = current_center + np.random.randn(dimension) * drift_rate
            batch = current_center + np.random.randn(stream_rate, dimension)
            stream_batches.append(batch.astype(np.float32))

        return {
            "initial": initial_vectors,
            "stream_batches": stream_batches,
            "dimension": dimension,
            "stream_rate": stream_rate,
            "drift_rate": drift_rate
        }
class QueryWorkloadGenerator:
    """Generate different types of query workloads."""

    @staticmethod
    def generate_uniform_workload(num_queries: int,
                                  dimension: int,
                                  seed: Optional[int] = None) -> np.ndarray:
        """Generate uniformly distributed queries in [-1, 1].

        Fix: the seed check was previously `if seed:`, which silently ignored
        the perfectly valid seed value 0 (falsy). Compare against None so
        every explicitly supplied seed is honored, matching the seed handling
        used elsewhere in this module.
        """
        if seed is not None:
            np.random.seed(seed)

        return np.random.uniform(-1, 1, (num_queries, dimension)).astype(np.float32)

    @staticmethod
    def generate_hotspot_workload(num_queries: int,
                                  dimension: int,
                                  num_hotspots: int = 5,
                                  hotspot_ratio: float = 0.8) -> np.ndarray:
        """Generate workload with hotspots (skewed distribution).

        *hotspot_ratio* of the queries cluster tightly around a few random
        hotspot centers; the remainder are spread broadly.
        """
        queries = []

        # Generate hotspot centers far apart from each other.
        hotspots = np.random.randn(num_hotspots, dimension) * 10

        num_hot_queries = int(num_queries * hotspot_ratio)
        num_cold_queries = num_queries - num_hot_queries

        # Hot queries - small jitter around a randomly chosen hotspot.
        for _ in range(num_hot_queries):
            hotspot_idx = random.randint(0, num_hotspots - 1)
            query = hotspots[hotspot_idx] + np.random.randn(dimension) * 0.1
            queries.append(query)

        # Cold queries - broad random distribution.
        cold_queries = np.random.randn(num_cold_queries, dimension) * 5
        queries.extend(cold_queries)

        # Shuffle to mix hot and cold queries.
        queries = np.array(queries)
        np.random.shuffle(queries)

        return queries.astype(np.float32)

    @staticmethod
    def generate_temporal_workload(num_queries: int,
                                   dimension: int,
                                   time_windows: int = 10) -> List[np.ndarray]:
        """Generate a workload whose distribution drifts across time windows."""
        queries_per_window = num_queries // time_windows
        workload_windows = []

        current_center = np.zeros(dimension)
        for _ in range(time_windows):
            # Random-walk the center between windows to simulate drift.
            drift = np.random.randn(dimension) * 0.5
            current_center += drift

            window_queries = current_center + np.random.randn(queries_per_window, dimension)
            workload_windows.append(window_queries.astype(np.float32))

        return workload_windows

    @staticmethod
    def generate_mixed_workload(num_queries: int,
                                dimension: int) -> Dict[str, np.ndarray]:
        """Generate a mixed workload: roughly a quarter each of point, range,
        KNN and metadata-filtered queries."""
        workload = {}

        # Point queries (exact vectors).
        num_point = num_queries // 4
        workload["point"] = np.random.randn(num_point, dimension).astype(np.float32)

        # Range queries (represented as center + radius).
        num_range = num_queries // 4
        range_centers = np.random.randn(num_range, dimension).astype(np.float32)
        range_radii = np.random.uniform(0.1, 2.0, num_range).astype(np.float32)
        workload["range"] = {"centers": range_centers, "radii": range_radii}

        # KNN queries (standard similarity search).
        num_knn = num_queries // 4
        workload["knn"] = np.random.randn(num_knn, dimension).astype(np.float32)

        # Filtered queries absorb any rounding remainder.
        num_filtered = num_queries - num_point - num_range - num_knn
        filtered_queries = np.random.randn(num_filtered, dimension).astype(np.float32)
        filters = [{"category": random.choice(["A", "B", "C"])} for _ in range(num_filtered)]
        workload["filtered"] = {"queries": filtered_queries, "filters": filters}

        return workload
distribution == "lognormal": + # Log-normal distribution (common for latencies) + log_mean = np.log(mean / np.sqrt(1 + (std / mean) ** 2)) + log_std = np.sqrt(np.log(1 + (std / mean) ** 2)) + latencies = np.random.lognormal(log_mean, log_std, num_samples) + + elif distribution == "exponential": + # Exponential distribution + latencies = np.random.exponential(mean, num_samples) + + elif distribution == "gamma": + # Gamma distribution + shape = (mean / std) ** 2 + scale = std ** 2 / mean + latencies = np.random.gamma(shape, scale, num_samples) + + else: + # Normal distribution (less realistic for latencies) + latencies = np.random.normal(mean, std, num_samples) + latencies = np.maximum(latencies, 0.1) # Ensure positive + + return latencies.astype(np.float32) + + @staticmethod + def generate_throughput_series(duration: int = 3600, # 1 hour in seconds + base_qps: float = 1000, + pattern: str = "steady") -> List[Tuple[float, float]]: + """Generate time series of throughput measurements.""" + series = [] + + if pattern == "steady": + for t in range(duration): + qps = base_qps + np.random.normal(0, base_qps * 0.05) + series.append((t, max(0, qps))) + + elif pattern == "diurnal": + # Simulate daily pattern + for t in range(duration): + # Use sine wave for daily pattern + hour = (t / 3600) % 24 + multiplier = 0.5 + 0.5 * np.sin(2 * np.pi * (hour - 6) / 24) + qps = base_qps * multiplier + np.random.normal(0, base_qps * 0.05) + series.append((t, max(0, qps))) + + elif pattern == "spike": + # Occasional spikes + for t in range(duration): + if random.random() < 0.01: # 1% chance of spike + qps = base_qps * random.uniform(2, 5) + else: + qps = base_qps + np.random.normal(0, base_qps * 0.05) + series.append((t, max(0, qps))) + + elif pattern == "degrading": + # Performance degradation over time + for t in range(duration): + degradation = 1 - (t / duration) * 0.5 # 50% degradation + qps = base_qps * degradation + np.random.normal(0, base_qps * 0.05) + series.append((t, max(0, 
class TestDataGenerator:
    """Generate test data for various scenarios."""

    @staticmethod
    def generate_vectors(num_vectors: int, dimension: int,
                         distribution: str = "normal",
                         seed: Optional[int] = None) -> np.ndarray:
        """Generate float32 test vectors drawn from *distribution*.

        Raises ValueError for unknown distribution names.
        """
        if seed is not None:
            np.random.seed(seed)

        if distribution == "normal":
            return np.random.randn(num_vectors, dimension).astype(np.float32)
        if distribution == "uniform":
            return np.random.uniform(-1, 1, (num_vectors, dimension)).astype(np.float32)
        if distribution == "sparse":
            # Roughly 90% of entries zeroed out.
            data = np.random.randn(num_vectors, dimension).astype(np.float32)
            data[np.random.random((num_vectors, dimension)) < 0.9] = 0
            return data
        if distribution == "clustered":
            # Ten tight Gaussian blobs around well-separated centers.
            blobs = []
            for _ in range(10):
                center = np.random.randn(dimension) * 10
                blobs.append(center + np.random.randn(num_vectors // 10, dimension) * 0.5)
            return np.vstack(blobs).astype(np.float32)
        raise ValueError(f"Unknown distribution: {distribution}")

    @staticmethod
    def generate_ids(num_ids: int, start: int = 0) -> List[int]:
        """Generate sequential IDs."""
        return list(range(start, start + num_ids))

    @staticmethod
    def generate_metadata(num_items: int) -> List[Dict[str, Any]]:
        """Generate random metadata records for vectors."""
        return [
            {
                "id": i,
                "category": random.choice(["A", "B", "C", "D"]),
                "timestamp": time.time() + i,
                "score": random.random(),
                "tags": random.sample(["tag1", "tag2", "tag3", "tag4", "tag5"],
                                      k=random.randint(1, 3)),
            }
            for i in range(num_items)
        ]

    @staticmethod
    def generate_ground_truth(num_queries: int, num_vectors: int,
                              top_k: int = 100) -> Dict[int, List[int]]:
        """Generate random ground-truth id lists for recall calculation."""
        return {
            query_id: random.sample(range(num_vectors), min(top_k, num_vectors))
            for query_id in range(num_queries)
        }

    @staticmethod
    def generate_config(collection_name: str = "test_collection") -> Dict[str, Any]:
        """Generate a complete default benchmark configuration."""
        return {
            "database": {
                "host": "localhost",
                "port": 19530,
                "database": "default",
                "timeout": 30
            },
            "dataset": {
                "collection_name": collection_name,
                "num_vectors": 10000,
                "dimension": 128,
                "distribution": "uniform",
                "batch_size": 1000,
                "num_shards": 2
            },
            "index": {
                "index_type": "HNSW",
                "metric_type": "L2",
                "params": {
                    "M": 16,
                    "efConstruction": 200
                }
            },
            "benchmark": {
                "num_queries": 1000,
                "top_k": 10,
                "num_processes": 4,
                "runtime": 60
            }
        }
# Search behavior + self.search_latency = 0.01 # Default 10ms + self.search_results = None + + def insert(self, data: List) -> Mock: + """Mock insert operation.""" + vectors = data[0] if isinstance(data[0], (list, np.ndarray)) else data + num_new = len(vectors) if hasattr(vectors, '__len__') else 1 + + self.vectors.extend(vectors) + new_ids = list(range(self.num_entities, self.num_entities + num_new)) + self.ids.extend(new_ids) + self.num_entities += num_new + + result = Mock() + result.primary_keys = new_ids + result.insert_count = num_new + + return result + + def search(self, data: List, anns_field: str, param: Dict, + limit: int = 10, **kwargs) -> List: + """Mock search operation.""" + time.sleep(self.search_latency) # Simulate latency + + if self.search_results: + return self.search_results + + # Generate mock results + results = [] + for query in data: + query_results = [] + for i in range(min(limit, 10)): + result = Mock() + result.id = random.randint(0, max(self.num_entities - 1, 0)) + result.distance = random.random() + query_results.append(result) + results.append(query_results) + + return results + + def create_index(self, field_name: str, index_params: Dict) -> bool: + """Mock index creation.""" + self.index_params = index_params + self.index_state = "InProgress" + self.index_progress = 0 + + # Simulate index building + self.index = Mock() + self.index.params = index_params + self.index.field_name = field_name + + return True + + def drop_index(self, field_name: str) -> None: + """Mock index dropping.""" + self.index = None + self.index_state = "NotExist" + self.index_progress = 0 + self.index_params = None + + def load(self) -> None: + """Mock collection loading.""" + self.is_loaded = True + + def release(self) -> None: + """Mock collection release.""" + self.is_loaded = False + + def flush(self) -> None: + """Mock flush operation.""" + pass # Simulate successful flush + + def compact(self) -> int: + """Mock compaction operation.""" + 
        # NOTE(review): continuation of MockMilvusCollection.compact().
        self.compaction_id = random.randint(1000, 9999)
        self.compaction_state = "Executing"
        return self.compaction_id

    def get_compaction_state(self, compaction_id: int) -> str:
        """Mock getting compaction state."""
        return self.compaction_state

    def drop(self) -> None:
        """Mock collection drop."""
        self.vectors = []
        self.ids = []
        self.num_entities = 0
        self.index = None

    def create_partition(self, partition_name: str) -> None:
        """Mock partition creation."""
        if partition_name not in self.partitions:
            self.partitions.append(partition_name)

    def has_partition(self, partition_name: str) -> bool:
        """Check if partition exists."""
        return partition_name in self.partitions

    def get_stats(self) -> Dict[str, Any]:
        """Get collection statistics."""
        return {
            "row_count": self.num_entities,
            "partitions": len(self.partitions),
            "index_state": self.index_state,
            "loaded": self.is_loaded
        }


class PerformanceSimulator:
    """Simulate performance metrics for testing."""

    def __init__(self):
        self.base_latency = 10   # Base latency in ms
        self.base_qps = 1000
        self.variation = 0.2     # 20% variation

    def simulate_latency(self, num_samples: int = 100) -> List[float]:
        """Generate simulated latency values."""
        latencies = []

        for _ in range(num_samples):
            # Add random variation
            variation = random.uniform(1 - self.variation, 1 + self.variation)
            latency = self.base_latency * variation

            # Occasionally add outliers
            if random.random() < 0.05:  # 5% outliers
                latency *= random.uniform(2, 5)

            latencies.append(latency)

        return latencies

    def simulate_throughput(self, duration: int = 60) -> List[Tuple[float, float]]:
        """Generate simulated throughput over time."""
        throughput_data = []
        current_time = 0

        while current_time < duration:
            # Simulate varying QPS
            variation = random.uniform(1 - self.variation, 1 + self.variation)
            qps = self.base_qps * variation

            # Occasionally simulate load spikes or drops
            if random.random() < 0.1:  # 10% chance of anomaly
                if random.random() < 0.5:
                    qps *= 0.5  # Drop
                else:
                    qps *= 1.5  # Spike

            throughput_data.append((current_time, qps))
            current_time += 1

        return throughput_data

    def simulate_resource_usage(self, duration: int = 60) -> Dict[str, List[Tuple[float, float]]]:
        """Simulate CPU and memory usage over time."""
        cpu_usage = []
        memory_usage = []

        base_cpu = 50
        base_memory = 60

        for t in range(duration):
            # CPU usage
            cpu = base_cpu + random.uniform(-10, 20)
            cpu = max(0, min(100, cpu))  # Clamp to 0-100
            cpu_usage.append((t, cpu))

            # Memory usage (more stable)
            memory = base_memory + random.uniform(-5, 10)
            memory = max(0, min(100, memory))
            memory_usage.append((t, memory))

            # Gradually increase if simulating memory leak
            if random.random() < 0.1:
                base_memory += 0.5

        return {
            "cpu": cpu_usage,
            "memory": memory_usage
        }


@contextmanager
def temporary_directory():
    """Context manager for temporary directory."""
    temp_dir = tempfile.mkdtemp()
    try:
        yield Path(temp_dir)
    finally:
        shutil.rmtree(temp_dir, ignore_errors=True)


@contextmanager
def mock_time_progression(increments: List[float]):
    """Mock time.time() with controlled progression."""
    time_values = []
    current = 0

    for increment in increments:
        current += increment
        time_values.append(current)

    with patch('time.time', side_effect=time_values):
        yield


def create_test_yaml_config(path: Path, config: Dict[str, Any]) -> None:
    """Create a YAML configuration file for testing."""
    with open(path, 'w') as f:
        yaml.dump(config, f, default_flow_style=False)


def create_test_json_results(path: Path, results: Dict[str, Any]) -> None:
    """Create a JSON results file for testing."""
    with open(path, 'w') as f:
        json.dump(results, f, indent=2)


def assert_performance_within_bounds(actual: float, expected: float,
                                     tolerance: float = 0.1) -> None:
    """Assert that performance metric is within
    expected bounds."""
    lower_bound = expected * (1 - tolerance)
    upper_bound = expected * (1 + tolerance)

    assert lower_bound <= actual <= upper_bound, \
        f"Performance {actual} not within {tolerance*100}% of expected {expected}"


def calculate_recall(retrieved: List[int], relevant: List[int], k: int) -> float:
    """Calculate recall@k metric.

    Denominator is the first k relevant ids (set-deduplicated), not all
    relevant ids.
    """
    retrieved_k = set(retrieved[:k])
    relevant_k = set(relevant[:k])

    if not relevant_k:
        return 0.0

    intersection = retrieved_k.intersection(relevant_k)
    return len(intersection) / len(relevant_k)


def calculate_precision(retrieved: List[int], relevant: List[int], k: int) -> float:
    """Calculate precision@k metric."""
    retrieved_k = set(retrieved[:k])
    relevant_set = set(relevant)

    if not retrieved_k:
        return 0.0

    intersection = retrieved_k.intersection(relevant_set)
    return len(intersection) / len(retrieved_k)


def generate_random_string(length: int = 10) -> str:
    """Generate random string for testing."""
    return ''.join(random.choices(string.ascii_lowercase + string.digits, k=length))


class BenchmarkResultValidator:
    """Validate benchmark results for consistency."""

    @staticmethod
    def validate_metrics(metrics: Dict[str, Any]) -> Tuple[bool, List[str]]:
        """Validate that metrics are reasonable.

        Returns (ok, errors) where ok is True iff no validation error fired.
        """
        errors = []

        # Check required fields
        required_fields = ["qps", "latency_p50", "latency_p95", "latency_p99"]
        for field in required_fields:
            if field not in metrics:
                errors.append(f"Missing required field: {field}")

        # Check value ranges
        if "qps" in metrics:
            if metrics["qps"] <= 0:
                errors.append("QPS must be positive")
            if metrics["qps"] > 1000000:
                errors.append("QPS seems unrealistically high")

        if "latency_p50" in metrics and "latency_p95" in metrics:
            if metrics["latency_p50"] > metrics["latency_p95"]:
                errors.append("P50 latency cannot be greater than P95")

        if "latency_p95" in metrics and "latency_p99" in metrics:
            if metrics["latency_p95"] > metrics["latency_p99"]:
                errors.append("P95 latency cannot be greater than P99")

        if "error_rate" in metrics:
            if not (0 <= metrics["error_rate"] <= 1):
                errors.append("Error rate must be between 0 and 1")

        return len(errors) == 0, errors

    @staticmethod
    def validate_consistency(results: List[Dict[str, Any]]) -> Tuple[bool, List[str]]:
        """Check consistency across multiple benchmark runs."""
        if len(results) < 2:
            return True, []

        errors = []

        # Check for extreme variations
        qps_values = [r["qps"] for r in results if "qps" in r]
        if qps_values:
            mean_qps = sum(qps_values) / len(qps_values)
            for i, qps in enumerate(qps_values):
                if abs(qps - mean_qps) / mean_qps > 0.5:  # 50% variation
                    errors.append(f"Run {i} has QPS {qps} which varies >50% from mean {mean_qps}")

        return len(errors) == 0, errors
diff --git a/vdb_benchmark/vdbbench/__init__.py b/vdb_benchmark/vdbbench/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/vdb_benchmark/vdbbench/compact_and_watch.py b/vdb_benchmark/vdbbench/compact_and_watch.py
new file mode 100644
index 00000000..b6fafa47
--- /dev/null
+++ b/vdb_benchmark/vdbbench/compact_and_watch.py
@@ -0,0 +1,292 @@
import argparse
import logging
import os
import sys
import time

from datetime import datetime, timedelta
from pymilvus import connections, Collection, utility

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

# Add the parent directory to sys.path to import config_loader
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from vdbbench.config_loader import load_config, merge_config_with_args

# Configure logging
# NOTE(review): this basicConfig block duplicates the one above verbatim —
# one of the two should be removed.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)


def parse_args():
    """Parse CLI options for the compaction monitor."""
    parser = argparse.ArgumentParser(description="Monitor Milvus collection compaction process")
    parser.add_argument("--host", type=str,
                        default="127.0.0.1", help="Milvus server host")
    parser.add_argument("--port", type=str, default="19530", help="Milvus server port")
    parser.add_argument("--collection", type=str, required=False, help="Collection name to compact and monitor")
    parser.add_argument("--interval", type=int, default=5, help="Monitoring interval in seconds")
    parser.add_argument("--compact", action="store_true", help="Perform compaction before monitoring")
    parser.add_argument("--zero-threshold", type=int, default=90,
                        help="Time in seconds to wait with zero pending rows before considering complete")
    parser.add_argument("--config", type=str, help="Path to YAML configuration file")

    args = parser.parse_args()

    # Track which arguments were explicitly set vs using defaults
    # NOTE(review): an explicitly passed value equal to the default is
    # indistinguishable from the default here, so a config file can still
    # override it.
    args.is_default = {
        'host': args.host == "127.0.0.1",
        'port': args.port == "19530",
        'interval': args.interval == 5,
        'zero_threshold': args.zero_threshold == 90,
        'compact': not args.compact  # Default is False
    }

    # Load configuration from YAML if specified
    config = {}
    if args.config:
        config = load_config(args.config)
        args = merge_config_with_args(config, args)

    # Validate required parameters
    if not args.collection:
        parser.error("Collection name is required. Specify with --collection or in config file.")

    return args


def connect_to_milvus(host, port):
    """Connect to Milvus server"""
    try:
        connections.connect(
            "default",
            host=host,
            port=port,
            max_receive_message_length=514_983_574,
            max_send_message_length=514_983_574
        )
        logging.info(f"Connected to Milvus server at {host}:{port}")
        return True
    except Exception as e:
        logging.error(f"Failed to connect to Milvus: {str(e)}")
        return False


def perform_compaction(collection_name):
    """Perform compaction on the collection"""
    try:
        collection = Collection(name=collection_name)
        logging.info(f"Starting compaction on collection: {collection_name}")
        compaction_start = time.time()
        collection.compact()
        compaction_time = time.time() - compaction_start
        logging.info(f"Compaction command completed in {compaction_time:.2f} seconds")
        return True
    except Exception as e:
        logging.error(f"Failed to perform compaction: {str(e)}")
        return False


def monitor_progress(collection_name, interval=60, zero_threshold=300):
    """Monitor the progress of index building/compaction.

    Polls utility.index_building_progress() every `interval` seconds and
    returns True once all rows are indexed and the pending-row count has
    stayed at zero for `zero_threshold` seconds; returns False on error.
    """
    start_time = time.time()
    prev_check_time = start_time

    try:
        # Get initial progress
        prev_progress = utility.index_building_progress(collection_name=collection_name)
        initial_indexed_rows = prev_progress.get("indexed_rows", 0)
        initial_pending_rows = prev_progress.get("pending_index_rows", 0)
        total_rows = prev_progress.get("total_rows", 0)

        logging.info(f"Starting to monitor progress for collection: {collection_name}")
        logging.info(f"Initial state: {initial_indexed_rows:,} of {total_rows:,} rows indexed")
        logging.info(f"Initial pending rows: {initial_pending_rows:,}")

        # Track the phases
        indexing_phase_complete = initial_indexed_rows >= total_rows
        pending_phase_complete = False

        # Track time with zero pending rows
        pending_zero_start_time = None

        while True:
            time.sleep(interval)  # Check at specified interval
            current_time = time.time()
            # NOTE(review): continuation of monitor_progress()'s polling loop.
            elapsed_time = current_time - start_time
            time_since_last_check = current_time - prev_check_time

            try:
                progress = utility.index_building_progress(collection_name=collection_name)

                # Calculate progress metrics
                indexed_rows = progress.get("indexed_rows", 0)
                total_rows = progress.get("total_rows", total_rows)  # Use previous if not available
                pending_rows = progress.get("pending_index_rows", 0)

                # Quick exit:
                if pending_rows == 0 and indexed_rows == total_rows:
                    # Ensure the pending counter has started
                    if not pending_zero_start_time:
                        pending_zero_start_time = current_time
                        logging.info("No pending rows detected. Assuming indexing phase is complete.")
                    indexing_phase_complete = True

                # Calculate both overall and recent indexing rates
                total_rows_indexed_since_start = indexed_rows - initial_indexed_rows
                rows_since_last_check = indexed_rows - prev_progress.get("indexed_rows", indexed_rows)

                # Calculate pending rows reduction
                pending_rows_reduction = prev_progress.get("pending_index_rows", pending_rows) - pending_rows
                pending_reduction_rate = pending_rows_reduction / time_since_last_check if time_since_last_check > 0 else 0

                # Calculate overall rate (based on total time since monitoring began)
                if elapsed_time > 0:
                    # Calculate percent done regardless of whether new rows were indexed
                    percent_done = indexed_rows / total_rows * 100 if total_rows > 0 else 100

                    if total_rows_indexed_since_start > 0:
                        # Normal case: some rows have been indexed since we started monitoring
                        overall_indexing_rate = total_rows_indexed_since_start / elapsed_time  # rows per second
                        remaining_rows = total_rows - indexed_rows
                        estimated_seconds_remaining = remaining_rows / overall_indexing_rate if overall_indexing_rate > 0 else float('inf')

                        # Alternative estimate based on pending rows
                        pending_estimate = pending_rows / pending_reduction_rate if pending_reduction_rate > 0 and pending_rows > 0 else float('inf')

                        # Calculate recent rate (for comparison)
                        recent_indexing_rate = rows_since_last_check / time_since_last_check if time_since_last_check > 0 else 0

                        # Format the estimated time remaining
                        eta = datetime.now() + timedelta(seconds=estimated_seconds_remaining)
                        eta_str = eta.strftime("%Y-%m-%d %H:%M:%S")

                        # Format the pending-based estimate
                        pending_eta = datetime.now() + timedelta(seconds=pending_estimate) if pending_estimate != float('inf') else "Unknown"
                        if isinstance(pending_eta, datetime):
                            pending_eta_str = pending_eta.strftime("%Y-%m-%d %H:%M:%S")
                        else:
                            pending_eta_str = str(pending_eta)

                        # Log progress with estimates
                        if not indexing_phase_complete:
                            # Still in initial indexing phase
                            logging.info(
                                f"Phase 1 - Building index: {percent_done:.2f}% complete... "
                                f"({indexed_rows:,}/{total_rows:,} rows) | "
                                f"Pending rows: {pending_rows:,} | "
                                f"Overall rate: {overall_indexing_rate:.2f} rows/sec | "
                                f"Recent rate: {recent_indexing_rate:.2f} rows/sec | "
                                f"ETA: {eta_str} | "
                                f"Est. remaining: {timedelta(seconds=int(estimated_seconds_remaining))}"
                            )
                        else:
                            # In pending rows processing phase
                            if pending_rows > 0:
                                # Reset the zero pending timer if we see pending rows
                                pending_zero_start_time = None

                                logging.info(
                                    f"Phase 2 - Processing pending rows: {pending_rows:,} remaining | "
                                    f"Reduction rate: {pending_reduction_rate:.2f} rows/sec | "
                                    f"ETA: {pending_eta_str} | "
                                    f"Est. remaining: {timedelta(seconds=int(pending_estimate)) if pending_estimate != float('inf') else 'Unknown'}"
                                )
                            else:
                                # Handle zero pending rows case (same as below)
                                if pending_zero_start_time is None:
                                    pending_zero_start_time = current_time
                                    logging.info(f"No pending rows detected. Starting {zero_threshold//60}-minute confirmation timer.")
                                else:
                                    zero_pending_time = current_time - pending_zero_start_time
                                    logging.info(f"No pending rows for {zero_pending_time:.1f} seconds (waiting for {zero_threshold} seconds to confirm)")

                                    if zero_pending_time >= zero_threshold:
                                        logging.info(f"No pending rows detected for {zero_threshold//60} minutes. Process is considered complete.")
                                        pending_phase_complete = True
                    else:
                        # Special case: all rows were already indexed when we started monitoring
                        logging.info(
                            f"Progress: {percent_done:.2f}% complete... "
                            f"({indexed_rows:,}/{total_rows:,} rows) | "
                            f"Pending rows: {pending_rows:,}"
                        )

                        # If all rows are indexed and there are no pending rows, we might be done
                        if indexed_rows >= total_rows and pending_rows == 0:
                            if not indexing_phase_complete:
                                indexing_phase_complete = True
                                logging.info(f"Initial indexing phase complete! All {indexed_rows:,} rows have been indexed.")

                            # Handle zero pending rows case
                            if pending_zero_start_time is None:
                                pending_zero_start_time = current_time
                                logging.info(f"No pending rows detected. Starting {zero_threshold}-second confirmation timer.")
                            else:
                                zero_pending_time = current_time - pending_zero_start_time
                                logging.info(f"No pending rows for {zero_pending_time:.1f} seconds (waiting for {zero_threshold} seconds to confirm)")

                                if zero_pending_time >= zero_threshold:
                                    logging.info(f"No pending rows detected for {zero_threshold} seconds. Process is considered complete.")
                                    pending_phase_complete = True
                else:
                    # If no time has elapsed (first iteration)
                    percent_done = indexed_rows / total_rows * 100 if total_rows > 0 else 0
                    logging.info(
                        f"Progress: {percent_done:.2f}% complete... "
                        f"({indexed_rows:,}/{total_rows:,} rows) | "
                        f"Pending rows: {pending_rows:,} | "
                        f"Initial measurement, no progress data yet"
                    )

                # Check if pending phase is complete
                if not pending_phase_complete and pending_rows == 0:
                    # If we've already waited long enough with zero pending rows
                    if pending_zero_start_time is not None and (current_time - pending_zero_start_time) >= zero_threshold:
                        pending_phase_complete = True
                        logging.info(f"Pending rows processing complete! All pending rows have been processed.")

                # Check if both phases are complete
                if (indexed_rows >= total_rows or indexing_phase_complete) and pending_phase_complete:
                    total_time = time.time() - start_time
                    logging.info(f"Process fully complete! Total time: {timedelta(seconds=int(total_time))}")
                    break

                # Update for next iteration
                prev_progress = progress
                prev_check_time = current_time

            except Exception as e:
                logging.error(f"Error checking progress: {str(e)}")
                time.sleep(5)  # Short delay before retrying

    except Exception as e:
        logging.error(f"Error in monitor_progress: {str(e)}")
        return False

    return True


def main():
    """Entry point: connect, optionally compact, then monitor until done."""
    args = parse_args()

    # Connect to Milvus
    if not connect_to_milvus(args.host, args.port):
        return 1

    # Perform compaction if requested
    if args.compact:
        if not perform_compaction(args.collection):
            return 1

    # Monitor progress
    logging.info(f"Starting to monitor progress (checking every {args.interval} seconds)")
    if not monitor_progress(args.collection, args.interval, args.zero_threshold):
        return 1

    logging.info("Monitoring completed successfully!")
    return 0


if __name__ == "__main__":
    sys.exit(main())
diff --git a/vdb_benchmark/vdbbench/config_loader.py b/vdb_benchmark/vdbbench/config_loader.py
new file mode 100644
index 00000000..ba6449d5
--- /dev/null
+++ b/vdb_benchmark/vdbbench/config_loader.py
@@ -0,0 +1,60 @@
import yaml
import os


def load_config(config_file=None):
    """
Load configuration from a YAML file. + + Args: + config_file (str): Path to the YAML configuration file + + Returns: + dict: Configuration dictionary or empty dict if file not found + """ + if not config_file: + return {} + + path_exists = os.path.exists(config_file) + configs_path_exists = os.path.exists(os.path.join("configs", config_file)) + if path_exists or configs_path_exists: + config_file = config_file if path_exists else os.path.join("configs", config_file) + else: + print(f"ERROR: Configuration file not found: {config_file}") + return {} + + try: + with open(config_file, 'r') as f: + config = yaml.safe_load(f) + print(f"Loaded vdbbench configuration from {config_file}") + return config + except Exception as e: + print("ERROR - Error loading configuration file: {str(e)}") + return {} + + +def merge_config_with_args(config, args): + """ + Merge configuration from YAML with command line arguments. + Command line arguments take precedence over YAML configuration. + + Args: + config (dict): Configuration dictionary from YAML + args (Namespace): Parsed command line arguments + + Returns: + Namespace: Updated arguments with values from config where not specified in args + """ + # Convert args to a dictionary + args_dict = vars(args) + + # For each key in config, if the corresponding arg is None or has a default value, + # update it with the value from config + for section, params in config.items(): + for key, value in params.items(): + if key in args_dict and (args_dict[key] is None or + (hasattr(args, 'is_default') and + key in args.is_default and + args.is_default[key])): + args_dict[key] = value + + return args diff --git a/vdb_benchmark/vdbbench/configs/10m_diskann.yaml b/vdb_benchmark/vdbbench/configs/10m_diskann.yaml new file mode 100644 index 00000000..a25b6810 --- /dev/null +++ b/vdb_benchmark/vdbbench/configs/10m_diskann.yaml @@ -0,0 +1,26 @@ +database: + host: 127.0.0.1 + port: 19530 + database: milvus + max_receive_message_length: 514_983_574 + 
  # NOTE(review): continuation of configs/10m_diskann.yaml. Underscored
  # numerals like 514_983_574 are not YAML integers — PyYAML's safe_load
  # returns them as strings; confirm consumers tolerate that.
  max_send_message_length: 514_983_574

dataset:
  collection_name: mlps_10m_10shards_1536dim_uniform_diskann
  num_vectors: 10_000_000
  dimension: 1536
  distribution: uniform
  chunk_size: 1_000_000
  batch_size: 1000
  num_shards: 10
  vector_dtype: FLOAT_VECTOR

index:
  index_type: DISKANN
  metric_type: COSINE
  #index_params
  max_degree: 64
  search_list_size: 200

workflow:
  compact: True
diff --git a/vdb_benchmark/vdbbench/configs/10m_hnsw.yaml b/vdb_benchmark/vdbbench/configs/10m_hnsw.yaml
new file mode 100644
index 00000000..da4228f1
--- /dev/null
+++ b/vdb_benchmark/vdbbench/configs/10m_hnsw.yaml
@@ -0,0 +1,26 @@
database:
  host: 127.0.0.1
  port: 19530
  database: milvus
  max_receive_message_length: 514_983_574
  max_send_message_length: 514_983_574

dataset:
  collection_name: mlps_10m_10shards_1536dim_uniform_hnsw
  num_vectors: 10_000_000
  dimension: 1536
  distribution: uniform
  chunk_size: 1_000_000
  batch_size: 1000
  num_shards: 10
  vector_dtype: FLOAT_VECTOR

index:
  index_type: HNSW
  metric_type: COSINE
  #index_params
  M: 64
  ef_construction: 200

workflow:
  compact: True
diff --git a/vdb_benchmark/vdbbench/configs/1m_diskann.yaml b/vdb_benchmark/vdbbench/configs/1m_diskann.yaml
new file mode 100644
index 00000000..34d55707
--- /dev/null
+++ b/vdb_benchmark/vdbbench/configs/1m_diskann.yaml
@@ -0,0 +1,26 @@
database:
  host: 127.0.0.1
  port: 19530
  database: milvus
  max_receive_message_length: 514_983_574
  max_send_message_length: 514_983_574

dataset:
  collection_name: mlps_1m_1shards_1536dim_uniform_diskann
  num_vectors: 1_000_000
  dimension: 1536
  distribution: uniform
  chunk_size: 100_000
  batch_size: 1000
  num_shards: 1
  vector_dtype: FLOAT_VECTOR

index:
  index_type: DISKANN
  metric_type: COSINE
  #index_params
  max_degree: 64
  search_list_size: 200

workflow:
  compact: True
diff --git a/vdb_benchmark/vdbbench/configs/1m_hnsw.yaml b/vdb_benchmark/vdbbench/configs/1m_hnsw.yaml
new file mode 100644
index 00000000..1aeb4283
--- /dev/null
+++ b/vdb_benchmark/vdbbench/configs/1m_hnsw.yaml
@@ -0,0 +1,26 @@
database:
  host: 127.0.0.1
  port: 19530
  database: milvus
  max_receive_message_length: 514_983_574
  max_send_message_length: 514_983_574

dataset:
  collection_name: mlps_1m_1shards_1536dim_uniform_hnsw
  num_vectors: 1_000_000
  dimension: 1536
  distribution: uniform
  chunk_size: 100_000
  batch_size: 1000
  num_shards: 1
  vector_dtype: FLOAT_VECTOR

index:
  index_type: HNSW
  metric_type: COSINE
  #index_params
  M: 64
  ef_construction: 200

workflow:
  compact: True
diff --git a/vdb_benchmark/vdbbench/list_collections.py b/vdb_benchmark/vdbbench/list_collections.py
new file mode 100644
index 00000000..d6633cbc
--- /dev/null
+++ b/vdb_benchmark/vdbbench/list_collections.py
@@ -0,0 +1,183 @@
#!/usr/bin/env python3
"""
Milvus Collection Information Script

This script connects to a Milvus instance and lists all collections with detailed information
including the number of vectors in each collection and index information.
"""

import sys
import os
import argparse
import logging
# NOTE(review): this unconditional import defeats the guarded
# `from tabulate import tabulate` in the try/except further down — if
# tabulate is missing the script dies here with a raw ImportError instead
# of the friendly message.
from tabulate import tabulate
from typing import Dict, List, Any

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Add the parent directory to sys.path to import config_loader
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

try:
    from pymilvus import connections, utility, Collection
except ImportError:
    logger.error("Error: pymilvus package not found. Please install it with 'pip install pymilvus'")
    sys.exit(1)

try:
    from tabulate import tabulate
except ImportError:
    logger.error("Error: tabulate package not found.
Please install it with 'pip install tabulate'")
    sys.exit(1)


def parse_args():
    """Parse command line arguments"""
    parser = argparse.ArgumentParser(description="List Milvus collections with detailed information")
    parser.add_argument("--host", type=str, default="127.0.0.1", help="Milvus server host")
    parser.add_argument("--port", type=str, default="19530", help="Milvus server port")
    parser.add_argument("--format", type=str, choices=["table", "json"], default="table",
                        help="Output format (table or json)")
    return parser.parse_args()


def connect_to_milvus(host, port):
    """Connect to Milvus server"""
    try:
        connections.connect(
            alias="default",
            host=host,
            port=port
        )
        logger.info(f"Connected to Milvus server at {host}:{port}")
        return True
    except Exception as e:
        logger.error(f"Failed to connect to Milvus server: {str(e)}")
        return False


def get_collection_info(collection_name, release=True):
    """Get detailed information about a collection.

    Returns a dict of name/row_count/dimension/schema/index/partition info,
    or {"name": ..., "error": ...} on failure.
    """
    try:
        collection = Collection(collection_name)
        # collection.load()

        # Get basic collection info - using num_entities instead of get_statistics
        row_count = collection.num_entities
        # row_count = get_collection_info(collection_name)["row_count"]

        # Get schema information
        schema = collection.schema
        dimension = None
        for field in schema.fields:
            if field.dtype in [100, 101]:  # FLOAT_VECTOR or BINARY_VECTOR
                dimension = field.params.get("dim")
                break

        # Get index information
        index_info = []
        if collection.has_index():
            index = collection.index()
            index_info.append({
                "field_name": index.field_name,
                "index_type": index.params.get("index_type"),
                "metric_type": index.params.get("metric_type"),
                "params": index.params.get("params", {})
            })

        # Get partition information
        partitions = collection.partitions
        partition_info = [{"name": p.name, "description": p.description} for p in partitions]

        return {
            "name": collection_name,
            "row_count": row_count,
            "dimension": dimension,
            "schema": str(schema),
            "index_info": index_info,
            "partitions": partition_info
        }
    except Exception as e:
        logger.error(f"Error getting info for collection {collection_name}: {str(e)}")
        return {
            "name": collection_name,
            "error": str(e)
        }
    finally:
        # Release collection
        # NOTE(review): if Collection(...) itself raised, `collection` is
        # unbound here; the bare except silently swallows the NameError.
        if release:
            try:
                collection.release()
            except:
                pass


def main():
    """Main function"""
    args = parse_args()

    # Connect to Milvus
    if not connect_to_milvus(args.host, args.port):
        return 1

    # List all collections
    try:
        collection_names = utility.list_collections()
        logger.info(f"Found {len(collection_names)} collections")

        if not collection_names:
            logger.info("No collections found in the Milvus instance")
            return 0

        # Get detailed information for each collection
        collections_info = []
        for name in collection_names:
            logger.info(f"Getting information for collection: {name}")
            info = get_collection_info(name)
            collections_info.append(info)

        # Display information based on format
        if args.format == "json":
            import json
            print(json.dumps(collections_info, indent=2))
        else:
            # Table format
            table_data = []
            for info in collections_info:
                index_types = ", ".join([idx.get("index_type", "N/A") for idx in info.get("index_info", [])])
                metric_types = ", ".join([idx.get("metric_type", "N/A") for idx in info.get("index_info", [])])

                row = [
                    info["name"],
                    info.get("row_count", "N/A"),
                    info.get("dimension", "N/A"),
                    index_types,
                    metric_types,
                    len(info.get("partitions", []))
                ]
                table_data.append(row)

            headers = ["Collection Name", "Vector Count", "Dimension", "Index Types", "Metric Types", "Partitions"]
            print(tabulate(table_data, headers=headers, tablefmt="grid"))

        return 0

    except Exception as e:
        logger.error(f"Error listing collections: {str(e)}")
        return 1
    finally:
        # Disconnect from Milvus
        try:
            connections.disconnect("default")
            logger.info("Disconnected from Milvus server")
        # NOTE(review): closes the best-effort disconnect in main()'s finally.
        except:
            pass


if __name__ == "__main__":
    sys.exit(main())
diff --git a/vdb_benchmark/vdbbench/load_vdb.py b/vdb_benchmark/vdbbench/load_vdb.py
new file mode 100644
index 00000000..0a7a9324
--- /dev/null
+++ b/vdb_benchmark/vdbbench/load_vdb.py
@@ -0,0 +1,370 @@
import argparse
import logging
import sys
import os
import time
import numpy as np
from pymilvus import connections, Collection, FieldSchema, CollectionSchema, DataType, utility

# Add the parent directory to sys.path to import config_loader
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from vdbbench.config_loader import load_config, merge_config_with_args
from vdbbench.compact_and_watch import monitor_progress

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


def parse_args():
    """Parse CLI options for the loader; merges YAML config when --config is given."""
    parser = argparse.ArgumentParser(description="Load vectors into Milvus database")

    # Connection parameters
    parser.add_argument("--host", type=str, default="localhost", help="Milvus server host")
    parser.add_argument("--port", type=str, default="19530", help="Milvus server port")

    # Collection parameters
    parser.add_argument("--collection-name", type=str, help="Name of the collection to create")
    parser.add_argument("--dimension", type=int, help="Vector dimension")
    parser.add_argument("--num-shards", type=int, default=1, help="Number of shards for the collection")
    # NOTE(review): default "float" is not among choices; argparse does not
    # validate defaults, so the default value silently bypasses the check.
    parser.add_argument("--vector-dtype", type=str, default="float", choices=["FLOAT_VECTOR"],
                        help="Vector data type. Only FLOAT_VECTOR is supported for now")
    parser.add_argument("--force", action="store_true", help="Force recreate collection if it exists")

    # Data generation parameters
    parser.add_argument("--num-vectors", type=int, help="Number of vectors to generate")
    parser.add_argument("--distribution", type=str, default="uniform",
                        choices=["uniform", "normal"], help="Distribution for vector generation")
    parser.add_argument("--batch-size", type=int, default=10000, help="Batch size for insertion")
    parser.add_argument("--chunk-size", type=int, default=1000000, help="Number of vectors to generate in each chunk (for memory management)")

    # Index parameters
    parser.add_argument("--index-type", type=str, default="DISKANN", help="Index type")
    parser.add_argument("--metric-type", type=str, default="COSINE", help="Metric type for index")
    parser.add_argument("--max-degree", type=int, default=16, help="DiskANN MaxDegree parameter")
    parser.add_argument("--search-list-size", type=int, default=200, help="DiskANN SearchListSize parameter")
    parser.add_argument("--M", type=int, default=16, help="HNSW M parameter")
    parser.add_argument("--ef-construction", type=int, default=200, help="HNSW efConstruction parameter")

    # Monitoring parameters
    parser.add_argument("--monitor-interval", type=int, default=5, help="Interval in seconds for monitoring index building")
    parser.add_argument("--compact", action="store_true", help="Perform compaction after loading")

    # Configuration file
    parser.add_argument("--config", type=str, help="Path to YAML configuration file")

    # What-if option to print args and exit
    parser.add_argument("--what-if", action="store_true", help="Print the arguments after processing and exit")

    # Debug option to set logging level to DEBUG
    parser.add_argument("--debug", action="store_true", help="Enable debug logging")

    args = parser.parse_args()

    # Track which arguments were explicitly set vs using defaults
    args.is_default = {
        'host': args.host == "localhost",
        'port': args.port == "19530",
        'num_shards': args.num_shards == 1,
        'vector_dtype': args.vector_dtype == "float",
        'distribution': args.distribution == "uniform",
        'batch_size': args.batch_size == 10000,
        'chunk_size': args.chunk_size == 1000000,
        'index_type': args.index_type == "DISKANN",
        'metric_type': args.metric_type == "COSINE",
        'max_degree': args.max_degree == 16,
        'search_list_size': args.search_list_size == 200,
        'M': args.M == 16,
        'ef_construction': args.ef_construction == 200,
        'monitor_interval': args.monitor_interval == 5,
        'compact': not args.compact,  # Default is False
        'force': not args.force,  # Default is False
        'what_if': not args.what_if,  # Default is False
        'debug': not args.debug  # Default is False
    }

    # Set logging level to DEBUG if --debug is specified
    if args.debug:
        logger.setLevel(logging.DEBUG)
        logger.debug("Debug logging enabled")

    # Load configuration from YAML if specified
    if args.config:
        config = load_config(args.config)
        args = merge_config_with_args(config, args)

    # If what-if is specified, print the arguments and exit
    if args.what_if:
        logger.info("Running in what-if mode. Printing arguments and exiting.")
        print("\nConfiguration after processing arguments and config file:")
        print("=" * 60)
        for key, value in vars(args).items():
            if key != 'is_default':  # Skip the is_default dictionary
                source = "default" if args.is_default.get(key, False) else "specified"
                print(f"{key}: {value} ({source})")
        print("=" * 60)
        sys.exit(0)

    # Validate required parameters
    required_params = ['collection_name', 'dimension', 'num_vectors']
    missing_params = [param for param in required_params if getattr(args, param.replace('-', '_'), None) is None]

    if missing_params:
        parser.error(f"Missing required parameters: {', '.join(missing_params)}. "
                     f"Specify with command line arguments or in config file.")

    return args


def connect_to_milvus(host, port):
    """Connect to Milvus server"""
    try:
        logger.debug(f"Connecting to Milvus server at {host}:{port}")
        connections.connect(
            "default",
            host=host,
            port=port,
            max_receive_message_length=514_983_574,
            max_send_message_length=514_983_574
        )
        logger.info(f"Connected to Milvus server at {host}:{port}")
        return True

    except Exception as e:
        logger.error(f"Error connecting to Milvus server: {str(e)}")
        return False


def create_collection(collection_name, dim, num_shards, vector_dtype, force=False):
    """Create a new collection with the specified parameters.

    Returns the Collection on success, None on failure or when the
    collection already exists and force is False.
    """
    try:
        # Check if collection exists
        if utility.has_collection(collection_name):
            if force:
                Collection(name=collection_name).drop()
                logger.info(f"Dropped existing collection: {collection_name}")
            else:
                logger.warning(f"Collection '{collection_name}' already exists. Use --force to drop and recreate it.")
                return None

        # Define vector data type
        # NOTE(review): `vector_dtype` is accepted but ignored — the schema
        # is always FLOAT_VECTOR.
        vector_type = DataType.FLOAT_VECTOR

        # Define collection schema
        fields = [
            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=False),
            FieldSchema(name="vector", dtype=vector_type, dim=dim)
        ]
        schema = CollectionSchema(fields, description="Benchmark Collection")

        # Create collection
        collection = Collection(name=collection_name, schema=schema, num_shards=num_shards)
        logger.info(f"Created collection '{collection_name}' with {dim} dimensions and {num_shards} shards")

        return collection
    except Exception as e:
        logger.error(f"Failed to create collection: {str(e)}")
        return None


def generate_vectors(num_vectors, dim, distribution='uniform'):
    """Generate random vectors based on the specified distribution"""
    # NOTE(review): float16 for a FLOAT_VECTOR field looks suspect — Milvus
    # FLOAT_VECTOR expects float32; confirm before relying on this.
    if distribution == 'uniform':
        vectors = np.random.random((num_vectors, dim)).astype('float16')
    elif distribution == 'normal':
        vectors = np.random.normal(0, 1,
(num_vectors, dim)).astype('float16') + elif distribution == 'zipfian': + # Simplified zipfian-like distribution + base = np.random.random((num_vectors, dim)).astype('float16') + skew = np.random.zipf(1.5, (num_vectors, 1)).astype('float16') + vectors = base * (skew / 10) + else: + vectors = np.random.random((num_vectors, dim)).astype('float16') + + # Normalize vectors + norms = np.linalg.norm(vectors, axis=1, keepdims=True) + normalized_vectors = vectors / norms + + return normalized_vectors.tolist() + + +def insert_data(collection, vectors, batch_size=10000): + """Insert vectors into the collection in batches""" + total_vectors = len(vectors) + num_batches = (total_vectors + batch_size - 1) // batch_size + + start_time = time.time() + total_inserted = 0 + + for i in range(num_batches): + batch_start = i * batch_size + batch_end = min((i + 1) * batch_size, total_vectors) + batch_size_actual = batch_end - batch_start + + # Prepare batch data + ids = list(range(batch_start, batch_end)) + batch_vectors = vectors[batch_start:batch_end] + + # Insert batch + try: + collection.insert([ids, batch_vectors]) + total_inserted += batch_size_actual + + # Log progress + progress = total_inserted / total_vectors * 100 + elapsed = time.time() - start_time + rate = total_inserted / elapsed if elapsed > 0 else 0 + + logger.info(f"Inserted batch {i+1}/{num_batches}: {progress:.2f}% complete, " + f"rate: {rate:.2f} vectors/sec") + + except Exception as e: + logger.error(f"Error inserting batch {i+1}: {str(e)}") + + return total_inserted, time.time() - start_time + + +def flush_collection(collection): + # Flush the collection + flush_start = time.time() + collection.flush() + flush_time = time.time() - flush_start + logger.info(f"Flush completed in {flush_time:.2f} seconds") + + +def create_index(collection, index_params): + """Create an index on the collection""" + try: + start_time = time.time() + logger.info(f"Creating index with parameters: {index_params}") + 
collection.create_index("vector", index_params) + index_creation_time = time.time() - start_time + logger.info(f"Index creation command completed in {index_creation_time:.2f} seconds") + return True + except Exception as e: + logger.error(f"Failed to create index: {str(e)}") + return False + + +def main(): + args = parse_args() + + # Connect to Milvus + if not connect_to_milvus(args.host, args.port): + logger.error("Failed to connect to Milvus.") + return 1 + + logger.debug(f'Determining datatype for vector representation.') + # Determine vector data type + try: + # Check if FLOAT16 is available in newer versions of pymilvus + if hasattr(DataType, 'FLOAT16'): + logger.debug(f'Using FLOAT16 data type for vector representation.")') + vector_dtype = DataType.FLOAT16 if args.vector_dtype == 'float16' else DataType.FLOAT_VECTOR + else: + # Fall back to supported data types + logger.warning("FLOAT16 data type not available in this version of pymilvus. Using FLOAT_VECTOR instead.") + vector_dtype = DataType.FLOAT_VECTOR + except Exception as e: + logger.warning(f"Error determining vector data type: {str(e)}. 
Using FLOAT_VECTOR as default.") + vector_dtype = DataType.FLOAT_VECTOR + + # Create collection + collection = create_collection( + collection_name=args.collection_name, + dim=args.dimension, + num_shards=args.num_shards, + vector_dtype=vector_dtype, + force=args.force + ) + + if collection is None: + return 1 + + # Create index with updated parameters + index_params = { + "index_type": args.index_type, + "metric_type": args.metric_type, + "params": {} + } + + # Update only the parameters based on index_type + if args.index_type == "HNSW": + index_params["params"] = { + "M": args.M, + "efConstruction": args.ef_construction + } + elif args.index_type == "DISKANN": + index_params["params"] = { + "MaxDegree": args.max_degree, + "SearchListSize": args.search_list_size + } + else: + raise ValueError(f"Unsupported index_type: {args.index_type}") + + logger.debug(f'Creating index. This should be immediate on an empty collection') + if not create_index(collection, index_params): + return 1 + + # Generate vectors + logger.info( + f"Generating {args.num_vectors} vectors with {args.dimension} dimensions using {args.distribution} distribution") + start_gen_time = time.time() + + # Split vector generation into chunks if num_vectors is large + if args.num_vectors > args.chunk_size: + logger.info(f"Large vector count detected. Generating in chunks of {args.chunk_size:,} vectors") + vectors = [] + remaining = args.num_vectors + chunks_processed = 0 + + while remaining > 0: + chunk_size = min(args.chunk_size, remaining) + logger.info(f"Generating chunk {chunks_processed+1}: {chunk_size:,} vectors") + chunk_start = time.time() + chunk_vectors = generate_vectors(chunk_size, args.dimension, args.distribution) + chunk_time = time.time() - chunk_start + + logger.info(f"Generated chunk {chunks_processed} ({chunk_size:,} vectors) in {chunk_time:.2f} seconds. 
" + f"Progress: {(args.num_vectors - remaining):,}/{args.num_vectors:,} vectors " + f"({(args.num_vectors - remaining) / args.num_vectors * 100:.1f}%)") + + # Insert data + logger.info(f"Inserting {args.num_vectors} vectors into collection '{args.collection_name}'") + total_inserted, insert_time = insert_data(collection, chunk_vectors, args.batch_size) + logger.info(f"Inserted {total_inserted} vectors in {insert_time:.2f} seconds") + + remaining -= chunk_size + chunks_processed += 1 + else: + # For smaller vector counts, generate all at once + vectors = generate_vectors(args.num_vectors, args.dimension, args.distribution) + # Insert data + logger.info(f"Inserting {args.num_vectors} vectors into collection '{args.collection_name}'") + total_inserted, insert_time = insert_data(collection, vectors, args.batch_size) + logger.info(f"Inserted {total_inserted} vectors in {insert_time:.2f} seconds") + + gen_time = time.time() - start_gen_time + logger.info(f"Generated all {args.num_vectors:,} vectors in {gen_time:.2f} seconds") + + flush_collection(collection) + + # Monitor index building + logger.info(f"Starting to monitor index building progress (checking every {args.monitor_interval} seconds)") + monitor_progress(args.collection_name, args.monitor_interval, zero_threshold=10) + + if args.compact: + logger.info(f"Compacting collection '{args.collection_name}'") + collection.compact() + monitor_progress(args.collection_name, args.monitor_interval, zero_threshold=30) + logger.info(f"Collection '{args.collection_name}' compacted successfully.") + + # Summary + logger.info("Benchmark completed successfully!") + return 0 + + +if __name__ == "__main__": + exit(main()) diff --git a/vdb_benchmark/vdbbench/simple_bench.py b/vdb_benchmark/vdbbench/simple_bench.py new file mode 100644 index 00000000..b679cd11 --- /dev/null +++ b/vdb_benchmark/vdbbench/simple_bench.py @@ -0,0 +1,668 @@ +#!/usr/bin/env python3 +""" +Milvus Vector Database Benchmark Script + +This script executes 
"""
Milvus Vector Database Benchmark Script

This script executes random vector queries against a Milvus collection using
multiple processes.  It measures and reports query latency statistics, and on
Linux also reports disk I/O observed during the run.
"""

import argparse
import multiprocessing as mp
import numpy as np
import os
import time
import json
import csv
import uuid
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Any, Optional, Tuple, Union
import signal
import sys
from tabulate import tabulate

from vdbbench.config_loader import load_config, merge_config_with_args
from vdbbench.list_collections import get_collection_info

try:
    from pymilvus import connections, Collection, utility
except ImportError:
    print("Error: pymilvus package not found. Please install it with 'pip install pymilvus'")
    sys.exit(1)

STAGGER_INTERVAL_SEC = 0.1

# Global flag for graceful shutdown; shared with every worker process.
shutdown_flag = mp.Value('i', 0)

# CSV header fields written by each worker's per-process results file.
csv_fields = [
    "process_id",
    "batch_id",
    "timestamp",
    "batch_size",
    "batch_time_seconds",
    "avg_query_time_seconds",
    "success"
]


def signal_handler(sig, frame):
    """Handle interrupt signals to gracefully shut down worker processes."""
    print("\nReceived interrupt signal. Shutting down workers gracefully...")
    with shutdown_flag.get_lock():
        shutdown_flag.value = 1


def read_disk_stats() -> Dict[str, Dict[str, int]]:
    """
    Read disk I/O statistics from /proc/diskstats.

    Returns:
        Dictionary mapping device names to their cumulative read/write byte
        counts.  Empty on non-Linux systems or on read errors.
    """
    stats = {}
    try:
        with open('/proc/diskstats', 'r') as f:
            for line in f:
                parts = line.strip().split()
                if len(parts) >= 14:  # Ensure we have enough fields
                    device = parts[2]
                    # Fields based on kernel documentation
                    # https://www.kernel.org/doc/Documentation/ABI/testing/procfs-diskstats
                    sectors_read = int(parts[5])     # sectors read
                    sectors_written = int(parts[9])  # sectors written

                    # 1 sector = 512 bytes
                    bytes_read = sectors_read * 512
                    bytes_written = sectors_written * 512

                    stats[device] = {
                        "bytes_read": bytes_read,
                        "bytes_written": bytes_written
                    }
        return stats
    except FileNotFoundError:
        print("Warning: /proc/diskstats not available (non-Linux system)")
        return {}
    except Exception as e:
        print(f"Error reading disk stats: {e}")
        return {}


def format_bytes(bytes_value: int) -> str:
    """Format bytes into human-readable form (e.g. 1536 -> '1.50 KB')."""
    units = ['B', 'KB', 'MB', 'GB', 'TB']
    unit_index = 0
    value = float(bytes_value)

    # >= so that exactly 1024 renders as '1.00 KB', not '1024.00 B'.
    while value >= 1024 and unit_index < len(units) - 1:
        value /= 1024
        unit_index += 1

    return f"{value:.2f} {units[unit_index]}"


def calculate_disk_io_diff(start_stats: Dict[str, Dict[str, int]],
                           end_stats: Dict[str, Dict[str, int]]) -> Dict[str, Dict[str, int]]:
    """Calculate the difference in disk I/O between start and end measurements.

    Devices present only in one snapshot are skipped.
    """
    diff_stats = {}

    for device in end_stats:
        if device in start_stats:
            diff_stats[device] = {
                "bytes_read": end_stats[device]["bytes_read"] - start_stats[device]["bytes_read"],
                "bytes_written": end_stats[device]["bytes_written"] - start_stats[device]["bytes_written"]
            }

    return diff_stats


def generate_random_vector(dim: int) -> List[float]:
    """Generate a random L2-normalized vector of the specified dimension."""
    vec = np.random.random(dim).astype(np.float32)
    return (vec / np.linalg.norm(vec)).tolist()


def connect_to_milvus(host: str, port: str):
    """Establish a connection to the Milvus server.

    Returns the pymilvus ``connections`` module on success (kept for
    compatibility with callers that call ``.disconnect`` on it), or False
    on failure.
    """
    try:
        connections.connect(alias="default", host=host, port=port)
        return connections
    except Exception as e:
        print(f"Failed to connect to Milvus: {e}")
        return False


def execute_batch_queries(process_id: int, host: str, port: str, collection_name: str, vector_dim: int, batch_size: int,
                          report_count: int, max_queries: Optional[int], runtime_seconds: Optional[int], output_dir: str,
                          shutdown_flag: mp.Value) -> None:
    """
    Execute batches of vector queries and log per-batch results to a CSV file.

    Runs until one of: the shared shutdown flag is set, ``runtime_seconds``
    elapses, or ``max_queries`` queries have been issued.

    Args:
        process_id: ID of the current process (used in output filename).
        host: Milvus server host.
        port: Milvus server port.
        collection_name: Name of the collection to query.
        vector_dim: Dimension of query vectors.
        batch_size: Number of queries to execute in each batch.
        report_count: Number of batches between progress messages.
        max_queries: Maximum number of queries to execute (None for unlimited).
        runtime_seconds: Maximum runtime in seconds (None for unlimited).
        output_dir: Directory to save results.
        shutdown_flag: Shared value to signal process termination.
    """
    print(f'Process {process_id} initialized')
    # Connect to Milvus
    conn = connect_to_milvus(host, port)
    if not conn:
        print(f'Process {process_id} - No milvus connection')
        return

    # Get collection
    try:
        collection = Collection(collection_name)
        print(f'Process {process_id} - Loading collection')
        collection.load()
    except Exception as e:
        print(f"Process {process_id}: Failed to load collection: {e}")
        return

    # Prepare output file
    output_file = Path(output_dir) / f"milvus_benchmark_p{process_id}.csv"
    sys.stdout.write(f"Process {process_id}: Writing results to {output_file}\r\n")
    # Create output directory if it doesn't exist
    os.makedirs(os.path.dirname(output_file), exist_ok=True)

    # Track execution
    start_time = time.time()
    query_count = 0
    batch_count = 0

    sys.stdout.write(f"Process {process_id}: Starting benchmark ...\r\n")
    sys.stdout.flush()

    try:
        with open(output_file, 'w') as f:
            writer = csv.DictWriter(f, fieldnames=csv_fields)
            writer.writeheader()
            while True:
                # Check if we should terminate
                with shutdown_flag.get_lock():
                    if shutdown_flag.value == 1:
                        break

                # Check termination conditions
                current_time = time.time()
                elapsed_time = current_time - start_time

                if runtime_seconds is not None and elapsed_time >= runtime_seconds:
                    break

                if max_queries is not None and query_count >= max_queries:
                    break

                # Generate batch of query vectors
                batch_vectors = [generate_random_vector(vector_dim) for _ in range(batch_size)]

                # Execute batch and measure time
                batch_start = time.time()
                try:
                    search_params = {"metric_type": "COSINE", "params": {"ef": 200}}
                    results = collection.search(
                        data=batch_vectors,
                        anns_field="vector",
                        param=search_params,
                        limit=10,
                        output_fields=["id"]
                    )
                    batch_end = time.time()
                    batch_success = True
                except Exception as e:
                    print(f"Process {process_id}: Search error: {e}")
                    batch_end = time.time()
                    batch_success = False

                # Record batch results
                batch_time = batch_end - batch_start
                batch_count += 1
                query_count += batch_size

                # Log batch results to file
                batch_data = {
                    "process_id": process_id,
                    "batch_id": batch_count,
                    "timestamp": current_time,
                    "batch_size": batch_size,
                    "batch_time_seconds": batch_time,
                    "avg_query_time_seconds": batch_time / batch_size,
                    "success": batch_success
                }

                writer.writerow(batch_data)
                f.flush()  # Ensure data is written to disk immediately

                # Print progress
                if batch_count % report_count == 0:
                    sys.stdout.write(
                        f"Process {process_id}: Completed {query_count} queries in {elapsed_time:.2f} seconds.\r\n")
                    sys.stdout.flush()

    except Exception as e:
        print(f"Process {process_id}: Error during benchmark: {e}")

    finally:
        # Disconnect from Milvus; best-effort, the process is exiting anyway.
        try:
            connections.disconnect("default")
        except Exception:
            pass

    print(
        f"Process {process_id}: Finished. Executed {query_count} queries in {time.time() - start_time:.2f} seconds",
        flush=True)


def calculate_statistics(results_dir: str) -> Dict[str, Union[str, int, float, Dict[str, int]]]:
    """Aggregate all per-process CSV result files into latency statistics.

    Returns a dict of latency/batch-time percentiles and throughput, or a
    dict with only an "error" key when no usable result files are found.
    """
    import pandas as pd

    # Find all result files
    file_paths = list(Path(results_dir).glob("milvus_benchmark_p*.csv"))

    if not file_paths:
        return {"error": "No benchmark result files found"}

    # Read and concatenate all CSV files into a single DataFrame
    dfs = []
    for file_path in file_paths:
        try:
            df = pd.read_csv(file_path)
            if not df.empty:
                dfs.append(df)
        except Exception as e:
            print(f"Error reading result file {file_path}: {e}")

    if not dfs:
        return {"error": "No valid data found in benchmark result files"}

    # Concatenate all dataframes
    all_data = pd.concat(dfs, ignore_index=True)
    all_data.sort_values('timestamp', inplace=True)

    # Wall-clock window covered by all processes combined.
    file_start_time = min(all_data['timestamp'])
    file_end_time = max(all_data['timestamp'] + all_data['batch_time_seconds'])
    total_time_seconds = file_end_time - file_start_time

    # Each row represents a batch; expand per-query latencies by batch_size.
    all_latencies = []
    for _, row in all_data.iterrows():
        query_time_ms = row['avg_query_time_seconds'] * 1000
        all_latencies.extend([query_time_ms] * int(row['batch_size']))

    # Convert batch times to milliseconds
    batch_times_ms = all_data['batch_time_seconds'] * 1000

    # Calculate statistics
    latencies = np.array(all_latencies)
    batch_times = np.array(batch_times_ms)
    total_queries = len(latencies)

    stats = {
        "total_queries": total_queries,
        "total_time_seconds": total_time_seconds,
        "min_latency_ms": float(np.min(latencies)),
        "max_latency_ms": float(np.max(latencies)),
        "mean_latency_ms": float(np.mean(latencies)),
        "median_latency_ms": float(np.median(latencies)),
        "p95_latency_ms": float(np.percentile(latencies, 95)),
        "p99_latency_ms": float(np.percentile(latencies, 99)),
        "p999_latency_ms": float(np.percentile(latencies, 99.9)),
        "p9999_latency_ms": float(np.percentile(latencies, 99.99)),
        "throughput_qps": float(total_queries / total_time_seconds) if total_time_seconds > 0 else 0,

        # Batch time statistics
        "batch_count": len(batch_times),
        "min_batch_time_ms": float(np.min(batch_times)) if len(batch_times) > 0 else 0,
        "max_batch_time_ms": float(np.max(batch_times)) if len(batch_times) > 0 else 0,
        "mean_batch_time_ms": float(np.mean(batch_times)) if len(batch_times) > 0 else 0,
        "median_batch_time_ms": float(np.median(batch_times)) if len(batch_times) > 0 else 0,
        "p95_batch_time_ms": float(np.percentile(batch_times, 95)) if len(batch_times) > 0 else 0,
        "p99_batch_time_ms": float(np.percentile(batch_times, 99)) if len(batch_times) > 0 else 0,
        "p999_batch_time_ms": float(np.percentile(batch_times, 99.9)) if len(batch_times) > 0 else 0,
        "p9999_batch_time_ms": float(np.percentile(batch_times, 99.99)) if len(batch_times) > 0 else 0
    }

    return stats


def load_database(host: str, port: str, collection_name: str, reload=False) -> Union[dict, None]:
    """Connect, (re)load the collection into memory, and print a summary table.

    Returns the collection info dict from ``get_collection_info`` on success,
    or None on any connection/load failure.
    """
    print(f'Connecting to Milvus server at {host}:{port}...', flush=True)
    conn = connect_to_milvus(host, port)
    if not conn:
        print(f'Unable to connect to Milvus server', flush=True)
        return None

    # Connect to Milvus
    try:
        collection = Collection(collection_name)
    except Exception as e:
        print(f"Unable to connect to Milvus collection {collection_name}: {e}", flush=True)
        return None

    try:
        # Get the load state of the collection:
        state = utility.load_state(collection_name)
        if reload or state.name != "Loaded":
            if reload:
                print(f'Reloading the collection {collection_name}...')
            else:
                print(f'Loading the collection {collection_name}...')
            start_load_time = time.time()
            collection.load()
            load_time = time.time() - start_load_time
            print(f'Collection {collection_name} loaded in {load_time:.2f} seconds', flush=True)
        if not reload and state.name == "Loaded":
            print(f'Collection {collection_name} already loaded; skipping reload...')

    except Exception as e:
        print(f'Unable to load collection {collection_name}: {e}')
        return None

    print(f'Getting collection statistics...', flush=True)
    collection_info = get_collection_info(collection_name, release=False)
    table_data = []

    index_types = ", ".join([idx.get("index_type", "N/A") for idx in collection_info.get("index_info", [])])
    metric_types = ", ".join([idx.get("metric_type", "N/A") for idx in collection_info.get("index_info", [])])

    row = [
        collection_info["name"],
        collection_info.get("row_count", "N/A"),
        collection_info.get("dimension", "N/A"),
        index_types,
        metric_types,
        len(collection_info.get("partitions", []))
    ]
    table_data.append(row)

    headers = ["Collection Name", "Vector Count", "Dimension", "Index Types", "Metric Types", "Partitions"]
    print(f'\nTabulating information...', flush=True)
    tabulated_data = tabulate(table_data, headers=headers, tablefmt="grid")
    print(tabulated_data, flush=True)

    return collection_info


def main():
    """Parse options, fan out worker processes, then aggregate and report."""
    parser = argparse.ArgumentParser(description="Milvus Vector Database Benchmark")

    parser.add_argument("--config", type=str, help="Path to vdbbench config file")

    # Required parameters
    parser.add_argument("--processes", type=int, help="Number of parallel processes")
    parser.add_argument("--batch-size", type=int, help="Number of queries per batch")
    parser.add_argument("--vector-dim", type=int, default=1536, help="Vector dimension")
    parser.add_argument("--report-count", type=int, default=10, help="Number of queries between logging results")

    # Database parameters
    parser.add_argument("--host", type=str, default="localhost", help="Milvus server host")
    parser.add_argument("--port", type=str, default="19530", help="Milvus server port")
    parser.add_argument("--collection-name", type=str, help="Collection name to query")

    # Termination conditions (at least one must be specified)
    termination_group = parser.add_argument_group("termination conditions (at least one required)")
    termination_group.add_argument("--runtime", type=int, help="Maximum runtime in seconds")
    termination_group.add_argument("--queries", type=int, help="Total number of queries to execute")

    # Output directory
    parser.add_argument("--output-dir", type=str, help="Directory to save benchmark results")
    parser.add_argument("--json-output", action="store_true", help="Print benchmark results as JSON document")

    args = parser.parse_args()

    # Validate termination conditions
    if args.runtime is None and args.queries is None:
        parser.error("At least one termination condition (--runtime or --queries) must be specified")

    # Register signal handlers for graceful shutdown
    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

    print("")
    print("=" * 50)
    print("OUTPUT CONFIGURATION", flush=True)
    print("=" * 50, flush=True)

    # Load config from YAML if specified
    if args.config:
        config = load_config(args.config)
        args = merge_config_with_args(config, args)

    # Create output directory (timestamped default when none given)
    if not args.output_dir:
        output_dir = "vdbbench_results"
        datetime_str = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_dir = os.path.join(output_dir, datetime_str)
    else:
        output_dir = args.output_dir

    os.makedirs(output_dir, exist_ok=True)

    # Save benchmark configuration
    config = {
        "timestamp": datetime.now().isoformat(),
        "processes": args.processes,
        "batch_size": args.batch_size,
        "report_count": args.report_count,
        "vector_dim": args.vector_dim,
        "host": args.host,
        "port": args.port,
        "collection_name": args.collection_name,
        "runtime_seconds": args.runtime,
        "total_queries": args.queries
    }

    print(f"Results will be saved to: {output_dir}")
    print(f'Writing configuration to {output_dir}/config.json')
    with open(os.path.join(output_dir, "config.json"), 'w') as f:
        json.dump(config, f, indent=2)

    print("")
    print("=" * 50)
    print("Database Verification and Loading", flush=True)
    print("=" * 50)

    conn = connect_to_milvus(args.host, args.port)
    print(f'Verifying database connection and loading collection')
    if collection_info := load_database(args.host, args.port, args.collection_name):
        print(f"\nCOLLECTION INFORMATION: {collection_info}")
        # Having an active connection in the main thread when we fork seems to cause problems
        conn.disconnect("default")
    else:
        print("Unable to load the specified collection")
        sys.exit(1)

    # Read initial disk stats
    print(f'\nCollecting initial disk statistics...')
    start_disk_stats = read_disk_stats()

    # Calculate queries per process if total queries specified
    max_queries_per_process = None
    remainder = 0
    if args.queries is not None:
        max_queries_per_process = args.queries // args.processes
        # Add remainder to the first process
        remainder = args.queries % args.processes

    # Start worker processes
    processes = []
    stagger_interval_secs = 1 / args.processes

    print("")
    print("=" * 50)
    print("Benchmark Execution", flush=True)
    print("=" * 50)
    if max_queries_per_process is not None:
        print(f"Starting benchmark with {args.processes} processes and {max_queries_per_process} queries per process")
    else:
        print(f'Starting benchmark with {args.processes} processes and running for {args.runtime} seconds')
    if args.processes > 1:
        print(f"Staggering benchmark execution by {stagger_interval_secs} seconds between processes")
        try:
            for i in range(args.processes):
                if i > 0:
                    time.sleep(stagger_interval_secs)
                # Adjust queries for the first process if there's a remainder
                process_max_queries = None
                if max_queries_per_process is not None:
                    process_max_queries = max_queries_per_process + (remainder if i == 0 else 0)

                p = mp.Process(
                    target=execute_batch_queries,
                    args=(
                        i,
                        args.host,
                        args.port,
                        args.collection_name,
                        args.vector_dim,
                        args.batch_size,
                        args.report_count,
                        process_max_queries,
                        args.runtime,
                        output_dir,
                        shutdown_flag
                    )
                )
                print(f'Starting process {i}...')
                p.start()
                processes.append(p)

            # Wait for all processes to complete
            for p in processes:
                p.join()
        except Exception as e:
            print(f"Error during benchmark execution: {e}")
            # Signal all processes to terminate
            with shutdown_flag.get_lock():
                shutdown_flag.value = 1

            # Wait for processes to terminate
            for p in processes:
                if p.is_alive():
                    p.join(timeout=5)
                    if p.is_alive():
                        p.terminate()
    else:
        print(f'Running single process benchmark...')
        execute_batch_queries(0, args.host, args.port, args.collection_name, args.vector_dim, args.batch_size,
                              args.report_count, args.queries, args.runtime, output_dir, shutdown_flag)

    # Read final disk stats
    print('Reading final disk statistics...')
    end_disk_stats = read_disk_stats()

    # Calculate disk I/O during benchmark
    disk_io_diff = calculate_disk_io_diff(start_disk_stats, end_disk_stats)

    # Calculate and print statistics
    print("\nCalculating benchmark statistics...")
    stats = calculate_statistics(output_dir)

    # Add disk I/O statistics to the stats dictionary
    if disk_io_diff:
        # Calculate totals across all devices
        total_bytes_read = sum(dev_stats["bytes_read"] for dev_stats in disk_io_diff.values())
        total_bytes_written = sum(dev_stats["bytes_written"] for dev_stats in disk_io_diff.values())

        # Guard against calculate_statistics having returned an error dict
        # (no "total_time_seconds" key) or a zero-length run.
        total_time = stats.get("total_time_seconds", 0)

        # Add disk I/O totals to stats
        stats["disk_io"] = {
            "total_bytes_read": total_bytes_read,
            "total_bytes_read_per_sec": (total_bytes_read / total_time) if total_time > 0 else 0,
            "total_bytes_written": total_bytes_written,
            "total_read_formatted": format_bytes(total_bytes_read),
            "total_write_formatted": format_bytes(total_bytes_written),
            "devices": {}
        }

        # Add per-device breakdown
        for device, io_stats in disk_io_diff.items():
            bytes_read = io_stats["bytes_read"]
            bytes_written = io_stats["bytes_written"]
            if bytes_read > 0 or bytes_written > 0:  # Only include devices with activity
                stats["disk_io"]["devices"][device] = {
                    "bytes_read": bytes_read,
                    "bytes_written": bytes_written,
                    "read_formatted": format_bytes(bytes_read),
                    "write_formatted": format_bytes(bytes_written)
                }
    else:
        stats["disk_io"] = {"error": "Disk I/O statistics not available"}

    # Save statistics to file
    with open(os.path.join(output_dir, "statistics.json"), 'w') as f:
        json.dump(stats, f, indent=2)

    if args.json_output:
        print("\nBenchmark statistics as JSON:")
        print(json.dumps(stats))
    else:
        # Print summary
        print("\n" + "=" * 50)
        print("BENCHMARK SUMMARY")
        print("=" * 50)
        print(f"Total Queries: {stats.get('total_queries', 0)}")
        print(f"Total Batches: {stats.get('batch_count', 0)}")
        print(f'Total Runtime: {stats.get("total_time_seconds", 0):.2f} seconds')

        # Print query time statistics
        print("\nQUERY STATISTICS")
        print("-" * 50)

        print(f"Mean Latency: {stats.get('mean_latency_ms', 0):.2f} ms")
        print(f"Median Latency: {stats.get('median_latency_ms', 0):.2f} ms")
        print(f"95th Percentile: {stats.get('p95_latency_ms', 0):.2f} ms")
        print(f"99th Percentile: {stats.get('p99_latency_ms', 0):.2f} ms")
        print(f"99.9th Percentile: {stats.get('p999_latency_ms', 0):.2f} ms")
        print(f"99.99th Percentile: {stats.get('p9999_latency_ms', 0):.2f} ms")
        print(f"Throughput: {stats.get('throughput_qps', 0):.2f} queries/second")

        # Print batch time statistics
        print("\nBATCH STATISTICS")
        print("-" * 50)

        print(f"Mean Batch Time: {stats.get('mean_batch_time_ms', 0):.2f} ms")
        print(f"Median Batch Time: {stats.get('median_batch_time_ms', 0):.2f} ms")
        print(f"95th Percentile: {stats.get('p95_batch_time_ms', 0):.2f} ms")
        print(f"99th Percentile: {stats.get('p99_batch_time_ms', 0):.2f} ms")
        print(f"99.9th Percentile: {stats.get('p999_batch_time_ms', 0):.2f} ms")
        print(f"99.99th Percentile: {stats.get('p9999_batch_time_ms', 0):.2f} ms")
        print(f"Max Batch Time: {stats.get('max_batch_time_ms', 0):.2f} ms")
        # Avoid ZeroDivisionError when no batches completed.
        mean_batch_ms = stats.get('mean_batch_time_ms', 0)
        batch_tput = (1000 / mean_batch_ms) if mean_batch_ms > 0 else 0
        print(f"Batch Throughput: {batch_tput:.2f} batches/second")

        # Print disk I/O statistics
        print("\nDISK I/O DURING BENCHMARK")
        print("-" * 50)
        if disk_io_diff:
            # Calculate totals across all devices
            total_bytes_read = sum(dev_stats["bytes_read"] for dev_stats in disk_io_diff.values())
            total_bytes_written = sum(dev_stats["bytes_written"] for dev_stats in disk_io_diff.values())

            print(f"Total Bytes Read: {format_bytes(total_bytes_read)}")
            print(f"Total Bytes Written: {format_bytes(total_bytes_written)}")
            print("\nPer-Device Breakdown:")

            for device, io_stats in disk_io_diff.items():
                bytes_read = io_stats["bytes_read"]
                bytes_written = io_stats["bytes_written"]
                if bytes_read > 0 or bytes_written > 0:  # Only show devices with activity
                    print(f"  {device}:")
                    print(f"    Read: {format_bytes(bytes_read)}")
                    print(f"    Write: {format_bytes(bytes_written)}")
        else:
            print("Disk I/O statistics not available")

        print("\nDetailed results saved to:", output_dir)
        print("=" * 50)


if __name__ == "__main__":
    main()
Key Changes: - Waterfall Eviction: Implemented recursive eviction (GPU -> CPU -> NVMe). New data now correctly lands in the fastest available tier, pushing cold data down, rather than the old behavior where new data skipped directly to NVMe if RAM was full. - Static Buffer Optimization: Replaced the CPU-bound np.random generation with a pre-allocated static noise buffer. This removes the CPU bottleneck that was masking true storage latency, allowing us to fully saturate high-performance NVMe drives. - Concurrency Hardening: Added semaphore-based concurrency limits (max_concurrent_allocs) and atomic memory reservations to prevent OOM crashes under heavy load. - Storage Metrics: Added explicit tracking for nvme_tokens_processed to calculate true storage throughput separate from system throughput. - Stress Test Validation: Verified that this new architecture correctly exposes storage latency limits (e.g., pushing P95 write latency >1000ms) where the old script artificially throttled the load. * Fix two runtime errors in RAG-enabled benchmark mode This patch addresses two bugs that surface when running the benchmark with --enable-rag: 1. Race condition in process_requests (line 2693) Worker threads begin processing requests immediately upon benchmark start, while RAG document ingestion runs in a separate daemon thread. When a worker hits the 10% RAG query path before any documents have been ingested, random.choice() is called on an empty list, raising IndexError. Fixed by adding a truthiness check on self.rag_manager.documents before entering the RAG code path. An empty dict evaluates to False, so RAG queries are safely skipped until ingestion populates at least one document. 2. Division by zero in KVCacheGenerator.generate (line 1097) The buffer slicing logic uses modulo to compute a pseudo-random start index: seed % (buffer_size - total_elements). 
When total_elements exactly equals buffer_size (an edge case permitted by the <= guard), the divisor becomes zero, raising ZeroDivisionError. Fixed by computing the divisor separately and defaulting start_idx to 0 when the divisor is zero. * Add detailed README.md for running the different invocations of kv-cache.py * fix: line endings from dos2unix; increase cpu memory to 4GB for mlperf invocation * Update MLperf v3 KV cache proposal.md to recommend using a minimum of 4G of DRAM to reduce Queue contention and unrealistic read amplification --- .../MLperf v3 KV cache proposal.md | 215 +++++- kv_cache_benchmark/README.md | 622 +++++++++++++++++- kv_cache_benchmark/kv-cache-wrapper.sh | 14 +- kv_cache_benchmark/kv-cache.py | 449 +++++++++++-- 4 files changed, 1218 insertions(+), 82 deletions(-) diff --git a/kv_cache_benchmark/MLperf v3 KV cache proposal.md b/kv_cache_benchmark/MLperf v3 KV cache proposal.md index 53dd6445..b2ff3601 100644 --- a/kv_cache_benchmark/MLperf v3 KV cache proposal.md +++ b/kv_cache_benchmark/MLperf v3 KV cache proposal.md @@ -529,6 +529,75 @@ The benchmark copies that pattern with three simple pieces: In the summary you will see both numbers. A high reuse count with few hits simply says the prompt was detected but the stored copy had already been evicted, just like what operators watch for in production. +### J. ShareGPT Replay: Realistic Workload Simulation + +While synthetic workloads (using random token counts within a range) are excellent for controlled stress testing, they may not fully capture the nuances of human-AI interaction. The **ShareGPT Replay** feature addresses this by loading real conversation trees from the ShareGPT dataset. + +**How it works:** +1. **Ingestion:** The `ShareGPTDatasetLoader` parses a JSON dataset of real conversations. It uses a tokenizer to calculate the exact `context_tokens` (user prompt) and `generate_tokens` (model response) for every turn. +2. 
**Replay:** Instead of generating random requests, the benchmark feeds these real token counts into the `InferenceRequest` queue. +3. **Structure Preservation:** Crucially, it preserves the multi-turn structure of the data. Request 2 is guaranteed to be a follow-up to Request 1, testing the `MultiTierCache`'s ability to handle real conversational locality. + +**Case Study: Analyzing ShareGPT Results** +Running a replay with the `llama3.1-70b-instruct` model on a memory-constrained system (2GB CPU RAM) reveals bottlenecks often hidden by uniform random distributions. + +* **High Cache Hit Rate (97.2%):** Real conversations exhibit high locality. Users ask follow-up questions, allowing the system to reuse the KV cache effectively. +* **NVMe Read Latency Spikes (291ms P95):** Unlike synthetic tests which might average around a mean, real user inputs vary wildly. A single request with a 16k token context can saturate the read bandwidth, pushing the P95 latency above the 200ms target, resulting in a "FAIL" assessment for storage even if throughput is high. + +**Sample Output Summary:** +```text +### STORAGE PERFORMANCE ASSESSMENT: FAIL ✗ ### + Criteria Passed: 3/4 + ✓ NVMe Write P95 < 500ms: 54.50ms + ✗ NVMe Read P95 < 200ms: 291.11ms (Target: 200ms) + ✓ Cache Hit Rate > 30%: 97.2% + +### CACHE TIER DISTRIBUTION ### + GPU Entries: 0 (0.00 GB) + CPU Entries: 156 (1.60 GB) + NVMe Entries: 1772 (92% of cache on slow storage) +``` + +### K. The Importance of Realism: A Comparative Case Study + +To illustrate why workload realism matters, we compared two runs of the benchmark on identical hardware (50 users, 70B model, NVMe-only cache). + +**Run A: Real Workload (ShareGPT)** +This run uses the actual conversation data, reflecting human usage patterns. 
+```bash +python3 kv-cache_sharegpt_replay.py \ + --model llama3.1-70b-instruct \ + --dataset-path ShareGPT_V3_unfiltered_cleaned_split.json \ + --gpu-mem-gb 0 --cpu-mem-gb 2 --cache-dir /mnt/nvme \ + --num-users 50 --duration 300 --generation-mode none +``` + +**Run B: Synthetic Workload (Random)** +This run omits the dataset, causing the benchmark to fall back to generating random, full-length contexts. This represents a "worst-case" scenario (e.g., massive document processing) rather than a chat workload. +```bash +python3 kv-cache_sharegpt_replay.py \ + --model llama3.1-70b-instruct \ + --gpu-mem-gb 0 --cpu-mem-gb 2 --cache-dir /mnt/nvme \ + --num-users 50 --duration 300 --generation-mode none +``` + +The results were dramatically different: + +| Metric | Run A: ShareGPT (Real) | Run B: Synthetic (Random) | Difference | +| :--- | :--- | :--- | :--- | +| **Workload Type** | Human Conversations | Random Large Contexts | | +| **Mean Context Size** | **133 tokens** (~41 MB) | **2,676 tokens** (~836 MB) | **20x Larger Data** | +| **Throughput** | **2,610 tok/sec** | **362 tok/sec** | **7.2x Slower** | +| **NVMe Read P95** | **291 ms** | **6,752 ms** (6.7s) | **23x Slower** | +| **End-to-End P50** | 93 ms | 121,158 ms (2 min) | **System Collapse** | + +**Key Findings:** +1. **Context Size Explosion:** Real human queries are concise (avg 133 tokens). The synthetic generator, aiming for coverage, produced contexts averaging 2,676 tokens. This forced the storage system to read/write **20x more data per request** in the synthetic run. +2. **System Collapse:** In the synthetic run, the P50 end-to-end latency ballooned to **2 minutes**, while the storage latency was only ~4 seconds. This indicates the system was in a state of **thrashing**, where requests spent 95% of their time waiting in the queue because the storage was saturated handling massive files. +3. 
**Cache Efficiency:** Real conversations have high locality (85.9% multi-turn hit rate) because users ask follow-up questions. The synthetic run had a much lower hit rate (60.1%), further stressing the storage. + +**Conclusion:** Run A represents a realistic chatbot application, where the NVMe drive is nearly sufficient. Run B represents a worst-case scenario, proving that for such heavy workloads, the current hardware configuration is inadequate. + --- ## 6. Current Work: Validating Simulation Accuracy with vLLM @@ -557,7 +626,7 @@ python3 kv-cache.py \ --num-users 10 \ --duration 120 \ --gpu-mem-gb 24 \ - --cpu-mem-gb 0 \ + --cpu-mem-gb 4 \ --generation-mode deterministic \ --seed 42 \ --output validation_kv_cache_gpu_only.json @@ -644,16 +713,16 @@ Two primary scenarios should be submitted to give a comprehensive view of storag #### Standard Submission: `llama3.1-8b` -This workload provides a baseline for storage performance under typical conditions. A fixed seed is required to ensure the workload is identical for all submissions, enabling fair and reproducible comparisons. +This workload provides a baseline for storage performance under typical conditions. **Note:** We set `cpu-mem-gb 4` to provide a minimal CPU buffer that prevents pathological queue contention while still forcing the vast majority of I/O to NVMe. Analysis showed that 0GB causes a 25,942x queueing factor where application latency reaches 21 seconds despite device latency of only 0.81ms. The 4GB setting reduces mean latency 20x while still stressing NVMe with over 1,054 GB of reads. 
```bash # MLPerf v3.0 Recommended Invocation: Storage Saturation Test (8B Model) -python3 kv-cache.py \ +python3 kv-cache-waterfall-lru.py \ --model llama3.1-8b \ --num-users 150 \ --duration 600 \ --gpu-mem-gb 0 \ - --cpu-mem-gb 2 \ + --cpu-mem-gb 4 \ --generation-mode realistic \ --performance-profile throughput \ --seed 42 \ @@ -666,7 +735,7 @@ This workload tests the storage's ability to handle a much heavier load, as the ```bash # MLPerf v3.0 Recommended Invocation: Storage Saturation Test (70B Model) -python3 kv-cache.py \ +python3 kv-cache-waterfall-lru.py \ --model llama3.1-70b-instruct \ --num-users 40 \ --duration 600 \ @@ -678,11 +747,14 @@ python3 kv-cache.py \ --output mlperf_v3_storage_submission_70b.json ``` +**Why `cpu-mem-gb 4`?** +Analysis of benchmark behavior revealed that `--cpu-mem-gb 0` creates pathological queue contention rather than measuring true storage performance. At 0GB, the queueing factor reaches 25,942x (device latency 0.81ms, application latency 21,000ms). At 4GB, the queueing factor drops to 7,307x while NVMe still processes 1,054 GB of reads. This small CPU buffer prevents the benchmark from measuring queue management overhead instead of storage I/O performance, providing more realistic and actionable results. + **Key Parameters Explained:** * `--num-users 150`: A high, fixed user count is used to ensure the storage device is placed under significant and continuous load. * `--duration 600`: A 10-minute duration ensures the benchmark reaches a stable, steady-state performance level, which is a standard requirement for MLPerf results. * `--gpu-mem-gb 0`: **This is the critical parameter for a storage-focused test.** It ensures the benchmark does not allocate any GPU memory, making it suitable for systems without a GPU or for isolating storage performance. 
-* `--cpu-mem-gb 2`: This small memory budget is intentionally chosen to be insufficient for the user load, forcing the system to bypass this faster tier and offload almost all KV cache data directly to the NVMe storage. +* `--cpu-mem-gb 4`: This small memory budget provides a minimal buffer to prevent pathological queue contention while still forcing the vast majority of KV cache data to NVMe storage. Analysis showed this reduces mean latency 20x compared to 0GB while maintaining significant storage stress (1,054 GB reads). * `--generation-mode realistic`: This is essential for a valid submission. It adds a 30ms emulated sleep for each token generated, accurately simulating the backpressure from a real GPU's computation time. Without this, the benchmark would incorrectly measure storage performance in an unrealistic, I/O-only scenario. * `--performance-profile throughput`: This new parameter is crucial for official submissions. It instructs the benchmark to use **throughput (tokens/second) as the sole pass/fail metric**, ignoring latency. This is because the high user count and low memory budget are *designed* to cause high latency to saturate the storage. This profile ensures the benchmark correctly evaluates the storage device's ability to sustain a high data rate under stress, which is the true goal of this test. * `--seed 42`: **This parameter is mandatory for a valid submission.** It ensures that the pseudo-random workload (user request timings, context lengths, etc.) is identical across all test runs and systems. This removes workload variance as a factor and guarantees a true "apples-to-apples" comparison of hardware performance. The final report will include the seed used. 
@@ -872,7 +944,7 @@ python3 kv-cache.py \ --num-users 50 \ --duration 180 \ --gpu-mem-gb 0 \ - --cpu-mem-gb 0.5 \ + --cpu-mem-gb 4 \ --generation-mode realistic \ --cache-dir /mnt/nvme \ --seed 42 \ @@ -925,7 +997,7 @@ python3 kv-cache.py \ --num-users 10 \ --duration 180 \ --gpu-mem-gb 0 \ - --cpu-mem-gb 32 \ + --cpu-mem-gb 4 \ --enable-autoscaling \ --autoscaler-mode capacity \ --generation-mode none \ @@ -1004,4 +1076,129 @@ python3 kv-cache.py \ --cache-dir /mnt/nvme \ --seed 42 \ --output results_max_stress.json -``` \ No newline at end of file +``` + +### Test 9: ShareGPT Workload Replay + +**Purpose:** Validates system performance against a trace of real-world human-AI conversations. This is the closest approximation to running a production service. It uses the dedicated replay script [`kv-cache_sharegpt_replay.py`](kv-cache_sharegpt_replay.py ). + +```bash +python3 kv-cache_sharegpt_replay.py \ + --model llama3.1-70b-instruct \ + --dataset-path ShareGPT_V3_unfiltered_cleaned_split.json \ + --max-conversations 1000 \ + --gpu-mem-gb 0 \ + --cpu-mem-gb 2 \ + --cache-dir /mnt/nvme \ + --num-users 50 \ + --duration 300 \ + --generation-mode none \ + --output results_sharegpt_replay.json +``` + +--- + +# CHANGES-12-05-2025: The "Waterfall" Architecture & Optimization + +**Date:** December 5, 2025 +**Subject:** Major architectural upgrade to `kv-cache-waterfall-lru.py`. + +This update introduces a fundamental shift in how the benchmark manages memory, moving from a simple "Spillover" model to a sophisticated "Waterfall" eviction strategy. It also addresses a critical CPU bottleneck that was masking true storage performance. + +## 1. Architectural Shift: From Spillover to Waterfall + +The original benchmark used a **Spillover** strategy. When the GPU was full, new data was forced directly into the CPU (and then NVMe). +* **The Problem:** New data is often the "hottest" (most likely to be read again soon). 
By forcing it to the slowest tier, we were penalizing active conversations. Meanwhile, old, cold data sat comfortably in the GPU, wasting valuable VRAM. +* **The Solution (Waterfall):** The new implementation enforces a strict hierarchy. New data **always** targets the fastest tier (GPU). + * If the GPU is full, the system identifies the **Least Recently Used (LRU)** item in the GPU and moves it to the CPU to make room. + * If the CPU is full, it moves the CPU's LRU item to NVMe. + * **Result:** The hottest data stays fast. Only truly cold data "falls" down the waterfall to storage. This mimics the behavior of production-grade caching systems like Redis or vLLM. + +### The Waterfall Flow + +```ascii + [ New Data ] + | + v + +-------------+ (Full?) +-------------+ (Full?) +-------------+ + | GPU Tier | --------------> | CPU Tier | --------------> | NVMe Tier | + | (Fastest) | Evict LRU | (Medium) | Evict LRU | (Slowest) | + +-------------+ +-------------+ +-------------+ + ^ ^ ^ + | | | + [ Hot Access ] [ Warm Access ] [ Cold Access ] +``` + +### Implementation: Recursive Eviction + +The core logic resides in `_ensure_space_in_tier`. It recursively clears space in lower tiers to make room for demotions from higher tiers. + +```python +def _ensure_space_in_tier(self, tier: str, required_bytes: int, recursion_depth: int = 0) -> bool: + # ... (recursion limits and checks omitted) ... + + # Find the LRU entry in this tier + lru_entries = self._get_lru_entries_in_tier(tier) + lru_key, lru_entry = lru_entries[0] + lru_size = lru_entry['size'] + + # Recursively ensure the next tier has space for this entry + # This triggers the "Waterfall" effect down the hierarchy + if not self._ensure_space_in_tier(next_tier, lru_size, recursion_depth + 1): + return False + + # Demote the LRU entry to the next tier + success, _ = self._demote_entry(lru_key, tier, next_tier) +``` + +## 2. 
Removing the CPU Bottleneck: Static Noise Buffers + +**The Issue:** +Profiling the original script revealed that `np.random.uniform`—the function used to generate the dummy KV cache data—was consuming massive amounts of CPU time. +* **Impact:** The CPU was spending so much time generating random numbers that it couldn't issue storage I/O requests fast enough. The benchmark was measuring the speed of Python's random number generator, not the speed of the NVMe drive. + +**The Fix:** +We replaced dynamic generation with a **Static Noise Buffer**. +* **Mechanism:** At startup, the benchmark pre-allocates a 256MB block of random noise in memory. +* **Zero-Copy Slicing:** When a request needs 10MB of data, instead of generating 10MB of new numbers, the system simply takes a "slice" (a view) of the pre-existing buffer. +* **Result:** Data generation is now effectively instant (zero CPU cost). This ensures that 100% of the latency measured is due to the storage subsystem, providing a true test of hardware performance. + +```python +class KVCacheGenerator: + def __init__(self, model_config: ModelConfig, global_seed: Optional[int] = None): + # Pre-allocate a large buffer of random noise (e.g., 256MB) + self.buffer_size_elements = 128 * 1024 * 1024 + self.precomputed_buffer = rng.uniform(-1.0, 1.0, size=self.buffer_size_elements).astype(self.dtype) + + def generate(self, sequence_length: int, key: Optional[str] = None) -> np.ndarray: + # ... (shape calculation omitted) ... + + # Zero-Copy Slicing: Take a view of the pre-existing buffer + if total_elements <= self.buffer_size_elements: + flat_view = self.precomputed_buffer[start_idx : start_idx + total_elements] + return flat_view.reshape(kv_shape) +``` + +## 3. Concurrency Hardening + +Implementing the Waterfall strategy introduced complex race conditions, where multiple threads might try to evict the same item or claim the same free space simultaneously. 
+* **Atomic Reservations:** We implemented a "check-and-reserve" logic inside the memory locks. A thread now claims space *before* it starts writing, preventing over-subscription. +* **Loop Protection:** We added hard caps to the eviction loops. In a pathological case where the system is thrashing, the eviction logic will now abort rather than spinning infinitely, preventing the benchmark from hanging. + +```python +# Inside _ensure_space_in_tier +with self.memory_lock: + current_usage = self._get_tier_usage(tier) + # Check if we have space + if current_usage + required_bytes <= target_usage: + # ATOMIC RESERVATION: Claim the space immediately inside the lock. + # This prevents other threads from seeing this space as free. + self._update_tier_usage(tier, required_bytes) + return True +``` + +## 4. Enhanced Metrics: NVMe Token Throughput + +To align with MLPerf requirements, we added a specific counter for `nvme_tokens_processed`. +* **Why:** Previously, we tracked raw bytes. However, MLPerf metrics are often in "Tokens per Second." +* **How:** The system now tracks the exact number of tokens associated with every read, write, and demotion operation that touches the NVMe drive. This allows us to report a precise "Storage Throughput (tok/s)" metric that accounts for the massive read amplification inherent in LLM inference. diff --git a/kv_cache_benchmark/README.md b/kv_cache_benchmark/README.md index e432d46b..eed01412 100644 --- a/kv_cache_benchmark/README.md +++ b/kv_cache_benchmark/README.md @@ -1,39 +1,615 @@ # MLPerf Storage KV Cache Benchmark -This directory contains the initial implementation of the KV Cache benchmark for MLPerf Storage v3. +A storage benchmarking tool for Large Language Model inference systems. This benchmark measures the performance of your storage subsystem under realistic KV cache offloading workloads, helping you answer critical questions about hardware capacity and configuration. 
-## Overview +**Author:** Hazem Awadallah, Kingston Digital +**License:** Apache 2.0 +**Version:** MLPerf Storage v3.0 -The KV Cache benchmark simulates the storage access patterns of Large Language Model (LLM) inference systems, specifically focusing on key-value cache operations that are critical for multi-turn conversations and long-context processing. +--- -## Components +## Table of Contents -### Core Scripts +1. [What This Benchmark Does](#what-this-benchmark-does) +2. [Architecture Overview](#architecture-overview) +3. [System Requirements](#system-requirements) +4. [Installation](#installation) +5. [Quick Start](#quick-start) +6. [Running the Benchmark](#running-the-benchmark) +7. [Using the Wrapper Script](#using-the-wrapper-script) +8. [Understanding Results](#understanding-results) +9. [MLPerf Submission Guidelines](#mlperf-submission-guidelines) +10. [Troubleshooting](#troubleshooting) -- **kv-cache.py**: Main benchmark implementation for KV cache storage performance testing -- **kv-cache_sharegpt_replay.py**: ShareGPT conversation replay-based benchmark for realistic workload simulation -- **kv-cache-wrapper.sh**: Wrapper script for running benchmark configurations -- **validate.sh**: Validation script for benchmark results +--- -### Documentation +## What This Benchmark Does -- **MLperf v3 KV cache proposal.md**: Detailed proposal for KV cache benchmark integration into MLPerf Storage -- **MLperf v3 KV cache proposal.pdf**: PDF version of the proposal -- **sources.md**: References and source documentation +During LLM inference, models store intermediate attention data in a structure called the KV (Key-Value) cache. This cache grows with conversation length and can consume enormous amounts of memory. Production systems offload this cache from expensive GPU VRAM to cheaper CPU RAM or NVMe storage. -## Purpose +This benchmark simulates that offloading behavior. 
It generates realistic multi-user inference workloads and measures how your storage performs under pressure. It measures these components: -This benchmark addresses the growing need to measure storage system performance under AI/ML inference workloads, particularly: +- How many concurrent users your hardware can support +- Whether your NVMe drive is fast enough to handle cache spillover +- The real latency impact of each storage tier +- Where the bottleneck sits in your system -- Key-value cache read/write patterns -- Mixed sequential and random access patterns -- Multi-threaded concurrent access -- Realistic conversation-based workload replay +This is not a pass/fail test. It is a diagnostic tool for system architects and performance engineers. -## Getting Started +--- -See the proposal documents for detailed information about the benchmark design, metrics, and validation criteria. +## Architecture Overview -## Status +The benchmark implements a three-tier memory hierarchy that mirrors production LLM serving systems. 
-Initial implementation - work in progress for MLPerf Storage v3.0 +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ KV Cache Benchmark Architecture │ +└─────────────────────────────────────────────────────────────────────────────┘ + + ┌──────────────────┐ + │ User Requests │ + │ (Multi-tenant) │ + └────────┬─────────┘ + │ + ▼ + ┌──────────────────────────────────────┐ + │ Request Queue │ + │ (Priority-based: QoS levels) │ + │ Interactive > Responsive > Batch │ + └──────────────────┬───────────────────┘ + │ + ▼ + ┌────────────────────────────────────────────────────────┐ + │ IntegratedBenchmark │ + │ ┌─────────────┐ ┌─────────────┐ ┌─────────────────┐ │ + │ │ Prefill │ │ Decode │ │ Conversation │ │ + │ │ (Write) │ │ (Read) │ │ Manager │ │ + │ └──────┬──────┘ └──────┬──────┘ └────────┬────────┘ │ + └─────────┼────────────────┼─────────────────┼───────────┘ + │ │ │ + └────────────────┼─────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────────────┐ +│ MultiTierCache │ +│ (Waterfall LRU Eviction) │ +│ │ +│ New Data ─────► Always targets fastest available tier │ +│ If full, LRU entry cascades down │ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────┐ │ +│ │ │ │ +│ │ ┌───────────────┐ ┌───────────────┐ ┌───────────────┐ │ │ +│ │ │ GPU VRAM │ │ CPU RAM │ │ NVMe │ │ │ +│ │ │ (Tier 1) │─────►│ (Tier 2) │─────►│ (Tier 3) │ │ │ +│ │ │ │ LRU │ │ LRU │ │ │ │ +│ │ │ Sub-ms │evict │ Tens of ms │evict │ Hundreds │ │ │ +│ │ │ latency │ │ latency │ │ of ms │ │ │ +│ │ │ │ │ │ │ │ │ │ +│ │ │ PyTorch/CuPy │ │ NumPy arrays │ │ .npy files │ │ │ +│ │ │ tensors │ │ in memory │ │ on disk │ │ │ +│ │ └───────────────┘ └───────────────┘ └───────────────┘ │ │ +│ │ │ │ +│ │ ◄──── HOT DATA ────────────────────────────── COLD DATA ────► │ │ +│ │ │ │ +│ └─────────────────────────────────────────────────────────────────────┘ │ +│ │ 
+└─────────────────────────────────────────────────────────────────────────────┘ + + │ + ▼ + ┌──────────────────────────────────────┐ + │ Statistics Collector │ + │ │ + │ - Latency percentiles (P50/P95/P99) │ + │ - Throughput (tokens/sec) │ + │ - Cache hit rates │ + │ - Tier distribution │ + │ - QoS compliance │ + └──────────────────────────────────────┘ +``` + +### Key Components + +**MultiTierCache**: The core engine. It decides where to place data based on available space and access patterns. New data always targets the fastest tier. When that tier fills up, the least recently used entry gets pushed down to the next tier. + +**Inference Phases**: The benchmark models two distinct I/O patterns: +- **Prefill**: Write-heavy. Processing the user prompt generates new KV cache entries. +- **Decode**: Read-heavy. Generating each output token requires reading the existing cache. + +**User Simulation**: Creates realistic traffic from multiple concurrent users with different behaviors (chatbot, coding assistant, document analysis) and priority levels. + +**Autoscaler**: Automatically adjusts user load to find either the maximum users your system can handle (QoS mode) or the peak throughput of your storage (capacity mode). + +--- + +## System Requirements + +### Minimum + +- CPU: 8+ cores (AMD EPYC, Intel Xeon) +- RAM: 32 GB +- Storage: 256 GB free space on SSD +- OS: Linux (Ubuntu 22.04, RHEL 9, or similar) +- Python: 3.8 or higher +- No GPU required (runs in CPU-only mode) + +### Recommended + +- CPU: 32+ cores +- RAM: 128 GB or more +- GPU: NVIDIA A100/H100 with 40+ GB VRAM (optional but enables full three-tier testing) +- Storage: 1 TB+ on NVMe (PCIe Gen4 or Gen5) +- Tools: `bc`, `jq` for the wrapper script + +--- + +## Installation + +1. Clone or download this repository. + +2. Install Python dependencies: + +```bash +pip install numpy +``` + +3. For GPU support (optional): + +```bash +pip install torch # or cupy-cuda12x for CuPy +``` + +4. 
Verify the installation: + +```bash +python3 kv-cache.py --help +``` + +--- + +## Quick Start + +Run a basic storage test with 50 users for 2 minutes: + +```bash +python3 kv-cache.py \ + --model llama3.1-8b \ + --num-users 50 \ + --duration 120 \ + --gpu-mem-gb 0 \ + --cpu-mem-gb 4 \ + --generation-mode realistic \ + --cache-dir /mnt/nvme \ + --seed 42 \ + --output results.json +``` + +This forces the vast majority of cache operations onto your NVMe drive (the small 4 GB CPU buffer only prevents pathological queue contention), giving you a baseline measurement of storage performance. + +--- + +## Running the Benchmark + +### Command Line Options + +``` +python3 kv-cache.py [options] + +Required Arguments: + --model MODEL Model configuration to use. Choices: + tiny-1b, mistral-7b, llama2-7b, llama3.1-8b, + llama3.1-70b-instruct + --num-users N Number of concurrent users to simulate + --duration SECONDS Duration of the benchmark in seconds + +Memory Configuration: + --gpu-mem-gb N GPU VRAM budget in GB (0 to disable GPU tier) + --cpu-mem-gb N CPU RAM budget in GB (0 to disable CPU tier) + --cache-dir PATH Directory for NVMe cache files (defaults to temp directory) + +Token Generation: + --generation-mode Token generation speed simulation. Choices: + - none: Pure storage test, no GPU simulation + - fast: 2ms per token (high-end GPU) + - realistic: 30ms per token (typical production) + +Caching Features: + --disable-multi-turn Disable multi-turn conversation caching + --disable-prefix-caching + Disable prefix caching (shared system prompts) + +Autoscaling: + --enable-autoscaling Enable workload autoscaling + --autoscaler-mode Autoscaling strategy. 
Choices: + - qos: Latency-based, finds max users at target saturation + - capacity: Throughput-based, finds peak storage performance + --target-saturation N Target storage saturation for QoS autoscaling (0.0-1.0, + default: 0.8) + +RAG Workload: + --enable-rag Enable RAG workload simulation + --rag-num-docs N Number of RAG documents to ingest + +Trace-Driven Workloads: + --use-burst-trace Use BurstGPT trace for workload generation instead of + synthetic traffic + --burst-trace-path PATH + Path to the BurstGPT trace CSV file + --validation-trace PATH + Path to a real-world trace file for accuracy validation + +Performance and Output: + --performance-profile Profile for pass/fail criteria. Choices: + - latency: Default, evaluates P95 latency targets + - throughput: For MLPerf submission, evaluates tokens/sec + --output FILE Write results to JSON file + --seed N Seed for random number generators (required for MLPerf + reproducibility) + +Resource Limits: + --max-concurrent-allocs N + Limit concurrent cache allocations to bound RAM usage. + 0 = unlimited. Recommended: 8-16 for large models to + prevent memory explosion. +``` + +### Test Scenarios + +#### Scenario 1: Storage-Only Baseline + +Isolate your NVMe drive by setting GPU memory to zero and CPU memory to the minimal 4 GB buffer (see the MLPerf submission notes on why 0 GB causes pathological queue contention). This tells you the raw performance of your storage. + +```bash +python3 kv-cache.py \ + --model llama3.1-8b \ + --num-users 50 \ + --duration 180 \ + --gpu-mem-gb 0 \ + --cpu-mem-gb 4 \ + --generation-mode realistic \ + --cache-dir /mnt/nvme \ + --seed 42 \ + --output results_storage_only.json +``` + +**What to look for:** NVMe read P95 should be under 200ms, write P95 under 500ms. If your drive cannot meet these targets here, it will bottleneck any multi-tier configuration. + +#### Scenario 2: Realistic Production Setup + +Test a balanced three-tier configuration that mirrors production deployment. 
+ +```bash +python3 kv-cache.py \ + --model llama3.1-8b \ + --num-users 100 \ + --duration 300 \ + --gpu-mem-gb 16 \ + --cpu-mem-gb 32 \ + --generation-mode realistic \ + --cache-dir /mnt/nvme \ + --seed 42 \ + --output results_production.json +``` + +**What to look for:** Compare end-to-end latency against the storage-only test. You should see significant improvement. Check the cache tier distribution to understand how data flows through your hierarchy. + +#### Scenario 3: Find Maximum User Count (QoS Mode) + +Let the autoscaler discover how many users your system can handle while maintaining acceptable latency. + +```bash +python3 kv-cache.py \ + --model llama3.1-8b \ + --num-users 20 \ + --duration 300 \ + --gpu-mem-gb 16 \ + --cpu-mem-gb 32 \ + --enable-autoscaling \ + --autoscaler-mode qos \ + --generation-mode realistic \ + --cache-dir /mnt/nvme \ + --seed 42 \ + --output results_autoscale_qos.json +``` + +**What to look for:** The autoscaling_stats section in the output shows the final stable user count. Use this number (minus a safety margin) to configure your production load balancer. + +#### Scenario 4: Find Peak Storage Throughput (Capacity Mode) + +Discover the absolute maximum I/O your storage can deliver by ignoring latency constraints. + +```bash +python3 kv-cache.py \ + --model llama3.1-70b-instruct \ + --num-users 10 \ + --duration 180 \ + --gpu-mem-gb 0 \ + --cpu-mem-gb 4 \ + --enable-autoscaling \ + --autoscaler-mode capacity \ + --generation-mode none \ + --cache-dir /mnt/nvme \ + --seed 42 \ + --output results_capacity.json +``` + +**What to look for:** The test stops when throughput plateaus. The peak_throughput value represents your storage device's maximum capability for this workload. + +#### Scenario 5: RAG Workload + +Test the bursty I/O patterns characteristic of Retrieval-Augmented Generation. 
+ +```bash +python3 kv-cache.py \ + --model llama3.1-8b \ + --num-users 30 \ + --duration 300 \ + --gpu-mem-gb 16 \ + --cpu-mem-gb 32 \ + --enable-rag \ + --rag-num-docs 20 \ + --generation-mode realistic \ + --cache-dir /mnt/nvme \ + --seed 42 \ + --output results_rag.json +``` + +--- + +## Using the Wrapper Script + +The `kv-cache-wrapper.sh` script automates a complete benchmark suite. It detects your hardware, calculates appropriate parameters, and runs multiple test scenarios. + +### Basic Usage + +```bash +./kv-cache-wrapper.sh +``` + +This runs all 10 test scenarios with default settings. Expect roughly 30 minutes for the full suite. + +### Options + +``` +./kv-cache-wrapper.sh [options] + + -m MODEL Model to benchmark (default: llama3.1-8b) + -t SECONDS Duration for tier comparison tests (default: 120) + -s SECONDS Duration for storage saturation test (default: 180) + -r SECONDS Duration for production test (default: 180) + -a SECONDS Duration for autoscaling tests (default: 300) + -w LIST Comma-separated list of workloads to run + -u USERS Override baseline user count + -U USERS Override high-load user count + -R Enable RAG workload + -D DOCS Number of RAG documents (default: 10) + -h Show help +``` + +### Available Workloads + +You can run specific tests using the `-w` flag: + +```bash +# Run only the storage isolation test +./kv-cache-wrapper.sh -w storage-only + +# Run production and autoscaling tests +./kv-cache-wrapper.sh -w production,autoscale + +# Run MLPerf submission tests +./kv-cache-wrapper.sh -w mlperf_submission +``` + +Valid workload names: +- `gpu-only`: All cache in GPU VRAM +- `cpu-only`: All cache in CPU RAM +- `storage-only`: All cache on NVMe +- `gpu-cpu`: Two-tier without storage +- `cpu-storage`: Two-tier without GPU +- `gpu-cpu-storage`: Full three-tier hierarchy +- `storage-saturation`: Stress test for NVMe +- `production`: Balanced realistic workload +- `autoscale`: QoS-based user discovery +- `capacity-autoscale`: Peak throughput 
discovery +- `mlperf_submission`: Official MLPerf tests + +### Example: Custom Configuration + +```bash +./kv-cache-wrapper.sh \ + -m llama3.1-70b-instruct \ + -t 90 \ + -u 30 \ + -U 100 \ + -w cpu-storage,storage-saturation,production +``` + +This runs a 70B model test with 30 baseline users, 100 high-load users, and only three specific workloads. + +### Output + +The wrapper generates individual JSON files for each test and prints a comparison report at the end. The report shows throughput, latency percentiles, cache distribution, and pass/fail status for each scenario. + +--- + +## Understanding Results + +### Key Metrics + +**Throughput (tokens/sec)**: How many tokens the system processes per second. Higher is better. + +**End-to-End Latency**: Total time from request submission to completion. This includes queue wait time, storage I/O, and token generation. This is what users experience. + +**Storage I/O Latency**: Time spent reading from and writing to storage tiers. Does not include queue wait or generation time. This measures your hardware. + +**Queue Wait Time**: Time requests spend waiting before processing begins. If this dominates, your system is overloaded. + +**Cache Hit Rate**: Percentage of reads served from cache. Higher rates mean less storage pressure. 
+ +### Reading the Output + +The benchmark prints a summary like this: + +``` +### STORAGE PERFORMANCE ASSESSMENT: PASS ### + Criteria Passed: 4/4 + [PASS] NVMe Write P95 < 500ms: 45.20ms + [PASS] NVMe Read P95 < 200ms: 123.45ms + [PASS] CPU RAM P95 < 150ms: 12.30ms + [PASS] Cache Hit Rate > 30%: 67.5% + +### OVERALL PERFORMANCE ### + Total Requests: 2847 + Total Tokens Generated: 489,231 + Throughput: 1,630.77 tok/s + +### LATENCY BREAKDOWN ### + End-to-End: mean 89.3ms, P50 45.2ms, P95 312.4ms + Storage I/O: mean 23.1ms, P50 12.4ms, P95 89.2ms + +### CACHE TIER DISTRIBUTION ### + GPU Entries: 0 (0.00 GB) + CPU Entries: 234 (2.34 GB) + NVMe Entries: 1,892 (18.92 GB) +``` + +### Interpreting Latency Numbers + +When you see high latency numbers, especially under stress tests, look at the breakdown: + +1. **Queue wait dominates**: Your system is overloaded. Reduce users or add hardware. +2. **Storage I/O dominates**: Your disk is the bottleneck. Get faster storage. +3. **Generation dominates**: Expected behavior for realistic mode. GPU is doing its job. + +The MLPerf submission tests intentionally push the system into saturation. High latency in those tests is expected and informative. 
+ +--- + +## MLPerf Submission Guidelines + +For official MLPerf v3.0 storage submissions, use these standardized commands: + +### Standard Submission (8B Model) + +```bash +python3 kv-cache.py \ + --model llama3.1-8b \ + --num-users 150 \ + --duration 600 \ + --gpu-mem-gb 0 \ + --cpu-mem-gb 4 \ + --generation-mode realistic \ + --performance-profile throughput \ + --cache-dir /mnt/nvme \ + --seed 42 \ + --output mlperf_v3_submission_8b.json +``` + +### Large Model Submission (70B Model) + +```bash +python3 kv-cache.py \ + --model llama3.1-70b-instruct \ + --num-users 40 \ + --duration 600 \ + --gpu-mem-gb 0 \ + --cpu-mem-gb 4 \ + --generation-mode realistic \ + --performance-profile throughput \ + --cache-dir /mnt/nvme \ + --seed 42 \ + --output mlperf_v3_submission_70b.json +``` + +### Critical Parameters + +- **seed 42**: Required for reproducibility across systems +- **gpu-mem-gb 0, cpu-mem-gb 4**: Minimal CPU buffer prevents pathological queue contention while still forcing heavy NVMe usage. Analysis showed 0GB causes 25,942x queueing factor; 4GB reduces this 20x while maintaining 1,054 GB of NVMe reads. +- **generation-mode realistic**: Simulates 30ms/token GPU backpressure +- **performance-profile throughput**: Uses throughput as the primary metric instead of latency +- **duration 600**: 10-minute run ensures steady-state measurement + +--- + +## Troubleshooting + +### Out of Memory Errors + +Reduce the number of concurrent users or limit parallel allocations: + +```bash +python3 kv-cache.py ... --max-concurrent-allocs 50 +``` + +### Benchmark Hangs + +The system may be thrashing. Reduce users or increase memory budgets. Check system logs for OOM killer activity. + +### No Disk I/O Visible in iostat + +The benchmark uses posix_fadvise to bypass page cache. 
If you still see zero reads, verify your cache directory is on the correct device: + +```bash +df /mnt/nvme +``` + +### Poor Cache Hit Rates + +Low hit rates indicate your working set exceeds available fast memory. Either: +- Increase GPU/CPU memory budgets +- Reduce user count +- Accept that cold data will hit storage + +### Results Vary Between Runs + +Use the `--seed` flag for reproducible results. Without a seed, workload generation is randomized. + +--- + +## Model Configurations + +| Model | KV Cache per Token | 8K Context Size | +|-------|-------------------|-----------------| +| tiny-1b | 24 KB | 192 MB | +| mistral-7b | 128 KB | 1 GB | +| llama3.1-8b | 128 KB | 1 GB | +| llama2-7b | 512 KB | 4 GB | +| llama3.1-70b-instruct | 320 KB | 2.5 GB | + +Choose your model based on how much memory pressure you want to apply. The 70B model generates the largest cache entries and stresses storage most heavily. + +--- + +## Files in This Repository + +- `kv-cache.py`: Main benchmark implementation +- `kv-cache-wrapper.sh`: Automated test suite runner +- `kv-cache_sharegpt_replay.py`: ShareGPT conversation replay benchmark +- `MLperf v3 KV cache proposal.md`: Detailed technical documentation +- `validate.sh`: Results validation script + +--- + +## Contributing + +This benchmark is developed by the MLPerf Storage Working Group. Contributions are welcome in the following areas: + +- Additional storage backends (object storage, RDMA) +- Improved GPU simulation models +- Alternative cache eviction policies +- Distributed multi-node support + +--- + +## License + +Apache License 2.0 + +--- + +## Contact + +For questions or feedback, open an issue on the repository or contact the MLPerf Storage Working Group. 
diff --git a/kv_cache_benchmark/kv-cache-wrapper.sh b/kv_cache_benchmark/kv-cache-wrapper.sh index 3a62119f..d6d997c8 100644 --- a/kv_cache_benchmark/kv-cache-wrapper.sh +++ b/kv_cache_benchmark/kv-cache-wrapper.sh @@ -1,7 +1,7 @@ #!/bin/bash # KV Cache Storage Benchmark - Multi-Tier Performance Comparison -# Hazem Awadallah, Kingston Digital, 2025 -# Assisted by Github Copilot +# Kingston Digital, 2025 +# Apache 2.0 license # This script runs a comprehensive comparison of cache tier configurations for LLM inference workloads. # It automatically detects your hardware (GPU, RAM, storage) and runs 9 different test scenarios to show # you exactly where your data ends up and how fast it moves between tiers. @@ -371,7 +371,7 @@ if should_run 'capacity-autoscale'; then --num-users "$capacity_start_users" \ --duration "$autoscale_duration" \ --gpu-mem-gb 0 \ - --cpu-mem-gb "$cpu_mem_realistic" \ + --cpu-mem-gb 4 \ --enable-autoscaling \ --autoscaler-mode capacity \ --generation-mode none \ @@ -413,7 +413,7 @@ if should_run 'mlperf_submission'; then --num-users 150 \ --duration 600 \ --gpu-mem-gb 0 \ - --cpu-mem-gb 2 \ + --cpu-mem-gb 4 \ --generation-mode realistic \ --performance-profile throughput \ --cache-dir "$cache_dir" \ @@ -451,7 +451,7 @@ if should_run 'gpu-only'; then --num-users $users_baseline \ --duration "$tier_duration" \ --gpu-mem-gb $gpu_mem_gb \ - --cpu-mem-gb 0 \ + --cpu-mem-gb 4 \ --generation-mode realistic \ "${rag_args[@]}" \ --seed 42 \ @@ -517,7 +517,7 @@ if should_run 'storage-only'; then --num-users $users_baseline \ --duration "$tier_duration" \ --gpu-mem-gb 0 \ - --cpu-mem-gb 0.5 \ + --cpu-mem-gb 4 \ --generation-mode realistic \ --cache-dir $cache_dir \ "${rag_args[@]}" \ @@ -680,7 +680,7 @@ if should_run 'storage-saturation'; then --num-users $users_high \ --duration "$saturation_duration" \ --gpu-mem-gb 0 \ - --cpu-mem-gb 1 \ + --cpu-mem-gb 4 \ --generation-mode realistic \ --cache-dir $cache_dir \ "${rag_args[@]}" \ diff --git 
a/kv_cache_benchmark/kv-cache.py b/kv_cache_benchmark/kv-cache.py index 80c80254..65eb3576 100644 --- a/kv_cache_benchmark/kv-cache.py +++ b/kv_cache_benchmark/kv-cache.py @@ -1,12 +1,9 @@ #!/usr/bin/env python3 """ KV Cache Benchmark - Multi-Tier Performance Comparison -Hazem Awadallah, Kingston Digital, 2025 -Assisted by Github Copilot - -Integrated Multi-User KV Cache Benchmark - Enhanced Version -MLPerf Storage Working Group - Benchmark Implementation - +Kingston Digital, 2025 +Licensed under the Apache License, Version 2.0 (the "License") +MLPerf Storage Working Group This script provides a comprehensive, configurable benchmark for testing storage system performance for Large Language Model (LLM) Key-Value (KV) cache offloading. It simulates a realistic multi-tenant inference environment with a sophisticated multi-tier cache. @@ -1057,6 +1054,16 @@ class KVCacheGenerator: def __init__(self, model_config: ModelConfig, global_seed: Optional[int] = None): self.model_config = model_config self.global_seed = 0 if global_seed is None else int(global_seed) + + # OPTIMIZATION: Pre-allocate a large buffer of random noise (e.g., 256MB) + # We will slice this buffer to satisfy requests instead of generating new noise every time. + # This removes the CPU bottleneck seen in the flamegraph (random_uniform + float conversion). 
+ self.buffer_size_elements = 128 * 1024 * 1024 # 128 million elements (~256MB for float16) + self.dtype = np.float16 if 'float16' in self.model_config.dtype else np.float32 + + print(f"[KVCacheGenerator] Pre-generating {self.buffer_size_elements * 2 / 1024**2:.0f} MB noise buffer...") + rng = np.random.default_rng(self.global_seed) + self.precomputed_buffer = rng.uniform(-1.0, 1.0, size=self.buffer_size_elements).astype(self.dtype) def _seed_from_key(self, key: str) -> int: # Use stable cryptographic hash to get deterministic 64-bit seed @@ -1067,30 +1074,40 @@ def _seed_from_key(self, key: str) -> int: def generate(self, sequence_length: int, key: Optional[str] = None) -> np.ndarray: """ Generates a NumPy array with the correct shape and dtype for a KV cache. - The data itself is random noise, but is generated deterministically if a key is provided. + Uses a pre-computed buffer to avoid CPU bottlenecks during benchmarking. """ # The shape of a KV cache tensor is typically: # (num_layers, 2 (for K/V), sequence_length, num_kv_heads, head_dimension) kv_shape = ( self.model_config.num_layers, 2, # K and V - sequence_length, + int(sequence_length), # Ensure sequence_length is int self.model_config.kv_heads, self.model_config.kv_dim_per_head ) - - dtype = np.float16 if 'float16' in self.model_config.dtype else np.float32 - if key is None: - # Fallback to global RNG if no key is provided (less deterministic in multithreading) - rng = np.random.default_rng(self.global_seed) + total_elements = int(np.prod(kv_shape)) # Ensure total_elements is int + + # If the request fits in our precomputed buffer, just slice and reshape (Zero Copy if possible) + if total_elements <= self.buffer_size_elements: + # We use a rolling start index based on the key hash to simulate "different" data + # without the cost of generation. 
+ if key: + seed = self._seed_from_key(key) + divisor = self.buffer_size_elements - total_elements + start_idx = int(seed % divisor) if divisor > 0 else 0 + else: + start_idx = 0 + + flat_view = self.precomputed_buffer[start_idx : start_idx + total_elements] + return flat_view.reshape(kv_shape) + else: - # Generate a seed deterministically from the key and global seed - seed = self._seed_from_key(key) - rng = np.random.default_rng(seed & 0xFFFFFFFF) - - data = rng.uniform(-1.0, 1.0, size=kv_shape).astype(dtype) - return data + # Fallback for extremely large requests (rare): Tile the buffer + # This is slower but safe. + repeats = int((total_elements + self.buffer_size_elements - 1) // self.buffer_size_elements) + large_data = np.tile(self.precomputed_buffer, repeats)[:total_elements] + return large_data.reshape(kv_shape) # ============================================================================ @@ -1114,7 +1131,8 @@ def __init__(self, cache_dir: str = None, eviction_policy: str = 'lru', performance_profile: str = 'latency', - seed: Optional[int] = None): + seed: Optional[int] = None, + max_concurrent_allocs: int = 0): self.model_config = model_config self.gpu_memory_limit = gpu_memory_gb * 1024**3 @@ -1122,6 +1140,7 @@ def __init__(self, self.eviction_policy = eviction_policy self.performance_profile = performance_profile self.seed = seed + self.max_concurrent_allocs = max_concurrent_allocs # Initialize storage backends for each tier. self.backends = {} @@ -1146,6 +1165,13 @@ def __init__(self, self.metadata_lock = threading.Lock() # For coarse-grained operations on the cache_entries dict itself. self.memory_lock = threading.Lock() # For updating the gpu_memory_used and cpu_memory_used counters. self.stats_lock = threading.Lock() # For updating the performance statistics dictionary. + + # Semaphore to limit concurrent allocations (bounds RAM usage). + # If max_concurrent_allocs is 0 or None, no limit is applied. 
+ if self.max_concurrent_allocs and self.max_concurrent_allocs > 0: + self.allocation_semaphore = threading.Semaphore(self.max_concurrent_allocs) + else: + self.allocation_semaphore = None # Dictionary for collecting a wide range of performance metrics. self.stats = { @@ -1184,6 +1210,295 @@ def _get_entry_lock(self, key: str) -> threading.Lock: self.entry_locks[key] = threading.Lock() return self.entry_locks[key] + # ======================================================================== + # WATERFALL LRU EVICTION METHODS + # These methods implement a hierarchical cache eviction strategy where + # new (hot) data always targets the fastest tier, and LRU entries cascade + # down the hierarchy: GPU -> CPU -> NVMe + # ======================================================================== + + def _get_tier_order(self) -> List[str]: + """ + Returns the tier hierarchy from fastest to slowest. + If GPU is not available, CPU becomes the top tier. + """ + tiers = [] + if 'gpu' in self.backends: + tiers.append('gpu') + tiers.extend(['cpu', 'nvme']) + return tiers + + def _get_tier_limit(self, tier: str) -> float: + """Get the memory limit for a tier in bytes.""" + if tier == 'gpu': + return self.gpu_memory_limit + elif tier == 'cpu': + return self.cpu_memory_limit + else: + return float('inf') # NVMe is considered unlimited + + def _get_tier_usage(self, tier: str) -> float: + """Get the current memory usage for a tier in bytes.""" + if tier == 'gpu': + return self.gpu_memory_used + elif tier == 'cpu': + return self.cpu_memory_used + else: + return 0 # NVMe usage not tracked + + def _update_tier_usage(self, tier: str, delta: int): + """Update the memory usage tracking for a tier.""" + if tier == 'gpu': + self.gpu_memory_used = max(0, self.gpu_memory_used + delta) + elif tier == 'cpu': + self.cpu_memory_used = max(0, self.cpu_memory_used + delta) + # NVMe doesn't track usage + + def _get_lru_entries_in_tier(self, tier: str) -> List[Tuple[str, dict]]: + """ + Get all cache 
entries in a specific tier, sorted by LRU order. + Returns list of (key, entry_dict) tuples, oldest access first. + """ + with self.metadata_lock: + entries = [ + (k, dict(v)) # Copy to avoid mutation issues + for k, v in self.cache_entries.items() + if v['location'] == tier + ] + # Sort by last_access (primary), then by access_count (secondary) + # Lower values = older/colder = evict first + entries.sort(key=lambda x: (x[1]['last_access'], x[1].get('access_count', 0))) + return entries + + def _demote_entry(self, key: str, from_tier: str, to_tier: str) -> Tuple[bool, float]: + """ + Move a cache entry from one tier to a lower (slower) tier. + + This is the core operation for waterfall eviction. It reads the data + from the source tier, writes it to the destination tier, and updates + all metadata atomically. + + Args: + key: The cache key to demote + from_tier: Source tier ('gpu' or 'cpu') + to_tier: Destination tier ('cpu' or 'nvme') + + Returns: + Tuple of (success: bool, total_latency: float) + """ + entry_lock = self._get_entry_lock(key) + + with entry_lock: + # Verify entry still exists and is in the expected tier + with self.metadata_lock: + if key not in self.cache_entries: + return False, 0.0 + entry = self.cache_entries[key] + current_location = entry['location'] + if current_location != from_tier: + # Entry was already moved by another thread - that's okay + return True, 0.0 + size = entry['size'] + + try: + # Step 1: Read from source tier + data, read_timing = self.backends[from_tier].read(key) + + # Step 2: Write to destination tier + write_timing = self.backends[to_tier].write(key, data) + + # Step 3: Delete from source tier (only after successful write) + self.backends[from_tier].delete(key) + + # Step 4: Update metadata atomically + with self.metadata_lock: + if key in self.cache_entries: + self.cache_entries[key]['location'] = to_tier + + # Step 5: Update memory tracking + # NOTE: We only decrement the source tier here. 
The destination tier's + # space was already reserved atomically by _ensure_space_in_tier() before + # this demotion was triggered. Adding to to_tier here would double-count. + with self.memory_lock: + self._update_tier_usage(from_tier, -size) + + # Step 6: Update statistics + with self.stats_lock: + self.stats['evictions'] += 1 + if to_tier == 'cpu': + self.stats['offloads_cpu'] += 1 + elif to_tier == 'nvme': + self.stats['offloads_nvme'] += 1 + # Track tokens processed for NVMe throughput calculation + # Assuming size is bytes, and we know dtype size from model config + # But simpler: we can estimate tokens from size if needed, or just track bytes + # The user asked for 'nvme_tokens_processed'. + # We can approximate tokens = size / (2 * layers * heads * dim * dtype_size) + # Or just use the 'num_tokens' if we had it. + # Since we don't have num_tokens easily here without looking up the key again or storing it, + # let's look at the entry dict which should have it if we stored it. + # The current cache_entries dict stores: 'location', 'size', 'last_access', 'access_count'. + # It does NOT store num_tokens. + # However, size is directly proportional. + # Let's just track bytes for now and convert later if needed, OR + # better yet, let's add num_tokens to the cache entry metadata in allocate_cache. + # For now, to fix the immediate request without changing data structures too much: + # We will estimate tokens based on size. 
+ # size = num_tokens * layers * 2 * heads * dim * 2 (for float16) + # so num_tokens = size / (layers * 4 * heads * dim) + bytes_per_token = ( + self.model_config.num_layers * + 2 * # K and V + self.model_config.kv_heads * + self.model_config.kv_dim_per_head * + 2 # float16 bytes + ) + tokens = int(size / bytes_per_token) + self.stats['nvme_tokens_processed'] += tokens + + total_latency = read_timing.total + write_timing.total + return True, total_latency + + except Exception as e: + print(f"[KVCache] Failed to demote {key} from {from_tier} to {to_tier}: {e}") + return False, 0.0 + + def _ensure_space_in_tier(self, tier: str, required_bytes: int, recursion_depth: int = 0) -> bool: + """ + Ensure there's enough space in a tier by evicting LRU entries. + + This implements the waterfall eviction strategy: + 1. If the tier has space, return immediately + 2. Otherwise, find the LRU entry in this tier + 3. Recursively ensure space in the next tier down + 4. Demote the LRU entry to the next tier + 5. 
Repeat until enough space is available + + Args: + tier: The tier to make space in ('gpu' or 'cpu') + required_bytes: Number of bytes needed + recursion_depth: Safety counter to prevent infinite recursion + + Returns: + True if space was successfully made available, False otherwise + """ + # NVMe is the sink - always has space + if tier == 'nvme': + return True + + # Safety limit to prevent runaway eviction cascades + max_recursion = 10 + if recursion_depth > max_recursion: + print(f"[KVCache] Warning: Hit recursion limit in _ensure_space_in_tier") + return False + + tier_order = self._get_tier_order() + try: + tier_idx = tier_order.index(tier) + except ValueError: + return False + + next_tier = tier_order[tier_idx + 1] if tier_idx + 1 < len(tier_order) else None + if next_tier is None: + return False + + limit = self._get_tier_limit(tier) + target_usage = limit * 0.8 # Keep 20% buffer consistent with original code + + # If the entry is larger than the tier can physically hold, skip to next tier + if required_bytes > limit * 0.95: # Allow up to 95% for a single large entry + return False + + # Calculate a reasonable eviction limit based on tier capacity. + # For large models (e.g., 70B), entries can be hundreds of MB each, + # so we may need to evict many entries to make room for one large request. + # Use the number of entries in the tier as a guide, with a minimum of 1000. + entries_in_tier = len(self._get_lru_entries_in_tier(tier)) + # FIX: Cap the max evictions to prevent infinite loops if we can't clear enough space + # The previous logic could loop forever if entries_in_tier kept growing or didn't reduce fast enough. + # We set a hard cap of 5000 or slightly more than current entries. 
+ max_evictions_per_call = min(5000, max(1000, entries_in_tier + 100)) + eviction_count = 0 + + while eviction_count < max_evictions_per_call: + # Check if we have enough space now + with self.memory_lock: + current_usage = self._get_tier_usage(tier) + # Normal case: fit within the 80% target + if current_usage + required_bytes <= target_usage: + # FIX: Atomic Reservation + # We must reserve the space NOW, inside the lock, to prevent other threads + # from seeing this space as free and over-subscribing the tier. + self._update_tier_usage(tier, required_bytes) + return True + + # Large entry case: if we've cleared the tier, allow up to 95% of limit + if current_usage < limit * 0.05 and required_bytes <= limit * 0.95: + # FIX: Atomic Reservation here too + self._update_tier_usage(tier, required_bytes) + return True + + # Find the LRU entry in this tier + lru_entries = self._get_lru_entries_in_tier(tier) + + if not lru_entries: + # No entries to evict. This can happen due to: + # 1. Race condition: in-flight writes not yet registered in cache_entries + # 2. Accounting mismatch from failed writes + # Recalculate actual usage from entries to fix any drift. + with self.metadata_lock: + actual_usage = sum( + entry['size'] for entry in self.cache_entries.values() + if entry['location'] == tier + ) + with self.memory_lock: + if tier == 'gpu': + self.gpu_memory_used = actual_usage + elif tier == 'cpu': + self.cpu_memory_used = actual_usage + + # Check if we now have space after recalculation + # Note: We need to re-acquire lock to check and reserve safely, + # but since we just updated it, let's do a quick check. 
+ with self.memory_lock: + current_usage = self._get_tier_usage(tier) + if current_usage + required_bytes <= target_usage: + self._update_tier_usage(tier, required_bytes) + return True + + # Tier is empty but entry still doesn't fit — too large for this tier + return False + + # Early exit optimization: if tier is nearly empty (< 20% used) but + # we still can't fit, the entry is probably too large for this tier + total_size_in_tier = sum(e['size'] for _, e in lru_entries) + if total_size_in_tier < limit * 0.2 and required_bytes > target_usage * 0.5: + # Tier almost empty but entry > 50% of usable space — skip to next tier + return False + + lru_key, lru_entry = lru_entries[0] + lru_size = lru_entry['size'] + + # Recursively ensure the next tier has space for this entry + if not self._ensure_space_in_tier(next_tier, lru_size, recursion_depth + 1): + print(f"[KVCache] Warning: Could not make space in {next_tier} for demotion") + # If we can't move the LRU item, we can't make space. + # We should probably abort to avoid spinning. + return False + + # Demote the LRU entry to the next tier + success, _ = self._demote_entry(lru_key, tier, next_tier) + if not success: + # Entry might have been moved by another thread, try next LRU + pass + + eviction_count += 1 + + # Hit eviction limit — this can happen under heavy concurrent load + # when many threads are competing for limited tier space. This is + # expected behavior; the entry will fall through to the next tier. + return False + def allocate_cache(self, key: str, num_tokens: int, phase: InferencePhase = InferencePhase.PREFILL) -> Tuple[bool, str, float]: """ Allocates and writes a new KV cache entry to the most appropriate tier. @@ -1202,7 +1517,22 @@ def allocate_cache(self, key: str, num_tokens: int, phase: InferencePhase = Infe if key in self.cache_entries: return True, self.cache_entries[key]['location'], 0.0 - # Generate the KV cache data. This is computationally expensive and done outside locks. 
+ # Use semaphore to limit concurrent allocations if configured. + # This bounds RAM usage by limiting how many threads can hold large + # data arrays simultaneously. + if self.allocation_semaphore: + self.allocation_semaphore.acquire() + + try: + return self._allocate_cache_inner(key, num_tokens, phase) + finally: + if self.allocation_semaphore: + self.allocation_semaphore.release() + + def _allocate_cache_inner(self, key: str, num_tokens: int, phase: InferencePhase) -> Tuple[bool, str, float]: + """Inner implementation of allocate_cache, called within semaphore.""" + + # Generate the KV cache data. This is the RAM-heavy operation. try: data = self.generator.generate(sequence_length=num_tokens, key=key) except MemoryError: @@ -1222,20 +1552,46 @@ def allocate_cache(self, key: str, num_tokens: int, phase: InferencePhase = Infe self.stats['write_operations'] += 1 self.stats['total_write_bytes'] += size_bytes - # --- Tiering Logic --- - # Decide which tier to write to based on available memory. - with self.memory_lock: - # Tier 1: GPU. Check if there's space in the GPU budget (with a 20% buffer). - if 'gpu' in self.backends and self.gpu_memory_used + size_bytes < self.gpu_memory_limit * 0.8: - self.gpu_memory_used += size_bytes - allocated_tier = 'gpu' - # Tier 2: CPU. Check if there's space in the CPU budget. - elif self.cpu_memory_used + size_bytes < self.cpu_memory_limit * 0.8: - self.cpu_memory_used += size_bytes - allocated_tier = 'cpu' - # Tier 3: NVMe. If no space in RAM, offload to disk. - else: + # --- Waterfall LRU Tiering Logic --- + # New data is always "hot", so we try to place it in the fastest tier. + # If the fast tier is full, we evict LRU entries down the hierarchy + # (GPU -> CPU -> NVMe) to make room at the top. + # + # This ensures the invariant: hottest data lives in the fastest tier. 
+ # + # +-----------+ + # | GPU | <- New writes target here first + # +-----------+ + # | LRU eviction (demote to CPU) + # v + # +-----------+ + # | CPU | + # +-----------+ + # | LRU eviction (demote to NVMe) + # v + # +-----------+ + # | NVMe | <- Cold data sinks here + # +-----------+ + # + tier_order = self._get_tier_order() + allocated_tier = None + + # Try each tier from fastest to slowest + for tier in tier_order: + if tier == 'nvme': + # NVMe is the fallback - always has space allocated_tier = 'nvme' + break + + # Try to ensure space in this tier (may trigger cascade evictions) + if self._ensure_space_in_tier(tier, size_bytes): + # Space is already reserved by _ensure_space_in_tier atomically + allocated_tier = tier + break + + # Final fallback to NVMe if all else fails + if allocated_tier is None: + allocated_tier = 'nvme' # Perform the actual write operation to the chosen backend. try: @@ -1275,10 +1631,7 @@ def allocate_cache(self, key: str, num_tokens: int, phase: InferencePhase = Infe except Exception as e: # If the write fails, roll back the memory reservation. 
with self.memory_lock: - if allocated_tier == 'gpu': - self.gpu_memory_used -= size_bytes - elif allocated_tier == 'cpu': - self.cpu_memory_used -= size_bytes + self._update_tier_usage(allocated_tier, -size_bytes) del data return False, 'none', 0.0 @@ -2033,7 +2386,8 @@ def __init__(self, performance_profile: str = 'latency', use_burst_trace: bool = False, burst_trace_path: Optional[str] = None, - seed: Optional[int] = None): + seed: Optional[int] = None, + max_concurrent_allocs: int = 0): self.model_config = model_config self.num_users = num_users @@ -2050,6 +2404,7 @@ def __init__(self, self.use_burst_trace = use_burst_trace self.burst_trace_path = burst_trace_path self.seed = seed + self.max_concurrent_allocs = max_concurrent_allocs self.burst_requests: List[Tuple[int, int]] = [] if self.use_burst_trace: self._load_burst_trace() @@ -2061,7 +2416,8 @@ def __init__(self, cpu_memory_gb=cpu_memory_gb, cache_dir=cache_dir, performance_profile=performance_profile, - seed=seed + seed=seed, + max_concurrent_allocs=max_concurrent_allocs ) self.conversation_manager = ConversationManager() self.prefix_cache_manager = PrefixCacheManager(self.cache) if enable_prefix_caching else None @@ -2334,7 +2690,8 @@ def process_requests(self, stop_event: threading.Event): with self.results_lock: self.results['prefill_latencies'].append(write_latency) # 4. Simulate a RAG operation by reading random chunk caches. - if self.rag_manager and random.random() < 0.1: # 10% of requests are RAG queries + # NOTE: Check that documents exist to avoid race condition with RAG ingestion thread + if self.rag_manager and self.rag_manager.documents and random.random() < 0.1: # 10% of requests are RAG queries doc_id = random.choice(list(self.rag_manager.documents.keys())) chunks = self.rag_manager.retrieve_chunks(doc_id) for chunk in chunks: # Read the KV cache for each retrieved chunk. 
@@ -2473,6 +2830,8 @@ def run(self) -> Dict: print(f" - Mode: {self.autoscaler.mode}") print(f" - QoS Support: Enabled (Interactive/Responsive/Batch)") print(f" - Trace-Driven (BurstGPT): {'Enabled' if self.use_burst_trace else 'Disabled'}") + if self.max_concurrent_allocs > 0: + print(f" - Max Concurrent Allocations: {self.max_concurrent_allocs} (bounds RAM usage)") print("=" * 80) users = [] @@ -2812,6 +3171,9 @@ def main(): parser.add_argument('--output', type=str, default=f"benchmark_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json", help='Output file for results') parser.add_argument('--seed', type=int, default=None, help='Seed for random number generators to ensure reproducibility.') + parser.add_argument('--max-concurrent-allocs', type=int, default=0, + help='Limit concurrent allocations to bound RAM usage. 0 = unlimited. ' + 'Recommended: 8-16 for large models to prevent memory explosion.') args = parser.parse_args() @@ -2846,7 +3208,8 @@ def main(): performance_profile=args.performance_profile, use_burst_trace=args.use_burst_trace, burst_trace_path=args.burst_trace_path, - seed=args.seed + seed=args.seed, + max_concurrent_allocs=args.max_concurrent_allocs ) results = benchmark.run() @@ -2869,4 +3232,4 @@ def convert_numpy(obj): print(f"\nResults saved to {args.output}") if __name__ == "__main__": - main() \ No newline at end of file + main() From 27d506d7bf45c05b4b5918d2910be47673f9bae0 Mon Sep 17 00:00:00 2001 From: Parshuram Sangle Date: Tue, 27 Jan 2026 00:54:26 -0600 Subject: [PATCH 03/43] vdb_benchmark: adding AISAQ indexing support --- vdb_benchmark/README.md | 6 +++-- vdb_benchmark/docker-compose.yml | 12 ++++----- vdb_benchmark/tests/README.md | 2 +- .../vdbbench/configs/1m_aisaq_512dim.yaml | 27 +++++++++++++++++++ .../vdbbench/configs/1m_diskann_512dim.yaml | 26 ++++++++++++++++++ vdb_benchmark/vdbbench/load_vdb.py | 8 ++++++ 6 files changed, 72 insertions(+), 9 deletions(-) create mode 100644 
vdb_benchmark/vdbbench/configs/1m_aisaq_512dim.yaml create mode 100644 vdb_benchmark/vdbbench/configs/1m_diskann_512dim.yaml diff --git a/vdb_benchmark/README.md b/vdb_benchmark/README.md index e8ea20e4..38aed361 100644 --- a/vdb_benchmark/README.md +++ b/vdb_benchmark/README.md @@ -32,7 +32,9 @@ The docker-compose.yml file will configure a 3-container instance of Milvus data - Minio Object Storage - etcd -The docker-compose.yml file uses ```/mnt/vdb``` as the root directory for the required docker volumes. You can modify the compose file for your environment or ensure that your target storage is mounted at this location. +The docker-compose.yml file uses ```/mnt/vdb``` as the root directory for the required docker volumes. +You can either set environment variable DOCKER_VOLUME_DIRECTORY or modify the compose file for your environment to point to target location. +Otherwise ensure that your target storage is mounted at ```/mnt/vdb```. For testing more than one storage solution, there are two methods: 1. Create a set of containers for each storage solution with modified docker-compose.yml files pointing to different root directories. Each set of containers will also need a different port to listen on. You may need to limit how many instances you can run depending on the available memory in your system @@ -119,7 +121,7 @@ python vdbbench/simple_bench.py --host 127.0.0.1 --collection For comparison with HNSW indexing, use ```vdbbench/configs/10m_hnsw.yaml``` and update collection_name accordingly. ## Supported Databases -Milvus with DiskANN & HNSW indexing (currently implemented) +Milvus with DiskANN, HNSW and AISAQ indexing (currently implemented) # Contributing Contributions are welcome! Please feel free to submit a Pull Request. 
diff --git a/vdb_benchmark/docker-compose.yml b/vdb_benchmark/docker-compose.yml index 4c69af29..bb823c4d 100644 --- a/vdb_benchmark/docker-compose.yml +++ b/vdb_benchmark/docker-compose.yml @@ -3,14 +3,14 @@ version: '3.5' services: etcd: container_name: milvus-etcd - image: quay.io/coreos/etcd:v3.5.18 + image: quay.io/coreos/etcd:v3.5.25 environment: - ETCD_AUTO_COMPACTION_MODE=revision - ETCD_AUTO_COMPACTION_RETENTION=1000 - ETCD_QUOTA_BACKEND_BYTES=4294967296 - ETCD_SNAPSHOT_COUNT=50000 volumes: - - /mnt/vdb/etcd:/etcd + - ${DOCKER_VOLUME_DIRECTORY:-/mnt/vdb}/etcd:/etcd command: etcd -advertise-client-urls=http://etcd:2379 -listen-client-urls http://0.0.0.0:2379 --data-dir /etcd ports: - "2379:2379" @@ -22,7 +22,7 @@ services: minio: container_name: milvus-minio - image: minio/minio:RELEASE.2023-03-20T20-16-18Z + image: minio/minio:RELEASE.2024-12-18T13-15-44Z environment: MINIO_ACCESS_KEY: minioadmin MINIO_SECRET_KEY: minioadmin @@ -30,7 +30,7 @@ services: - "9001:9001" - "9000:9000" volumes: - - /mnt/vdb/minio:/minio_data + - ${DOCKER_VOLUME_DIRECTORY:-/mnt/vdb}/minio:/minio_data command: minio server /minio_data --console-address ":9001" healthcheck: test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"] @@ -40,7 +40,7 @@ services: standalone: container_name: milvus-standalone - image: milvusdb/milvus:v2.5.10 + image: milvusdb/milvus:v2.6.7 command: ["milvus", "run", "standalone"] security_opt: - seccomp:unconfined @@ -49,7 +49,7 @@ services: ETCD_ENDPOINTS: etcd:2379 MINIO_ADDRESS: minio:9000 volumes: - - /mnt/vdb/milvus:/var/lib/milvus + - ${DOCKER_VOLUME_DIRECTORY:-/mnt/vdb}/milvus:/var/lib/milvus healthcheck: test: ["CMD", "curl", "-f", "http://localhost:9091/healthz"] interval: 30s diff --git a/vdb_benchmark/tests/README.md b/vdb_benchmark/tests/README.md index f40c101b..4450a2f9 100755 --- a/vdb_benchmark/tests/README.md +++ b/vdb_benchmark/tests/README.md @@ -45,7 +45,7 @@ tests/ 1. 
Install test dependencies: ```bash -pip install -r tests/requirements-test.txt +pip install -r tests/requirements.txt ``` 2. Install vdb-bench in development mode: diff --git a/vdb_benchmark/vdbbench/configs/1m_aisaq_512dim.yaml b/vdb_benchmark/vdbbench/configs/1m_aisaq_512dim.yaml new file mode 100644 index 00000000..f044c0c3 --- /dev/null +++ b/vdb_benchmark/vdbbench/configs/1m_aisaq_512dim.yaml @@ -0,0 +1,27 @@ +database: + host: 127.0.0.1 + port: 19530 + database: milvus + max_receive_message_length: 514_983_574 + max_send_message_length: 514_983_574 + +dataset: + collection_name: mlps_1m_1shards_512dim_uniform_aisaq_perf + num_vectors: 1_000_000 + dimension: 512 + distribution: uniform + chunk_size: 100_000 + batch_size: 1000 + num_shards: 1 + vector_dtype: FLOAT_VECTOR + +index: + index_type: AISAQ + metric_type: COSINE + #index_params + inline_pq: 32 + max_degree: 32 + search_list_size: 100 + +workflow: + compact: True diff --git a/vdb_benchmark/vdbbench/configs/1m_diskann_512dim.yaml b/vdb_benchmark/vdbbench/configs/1m_diskann_512dim.yaml new file mode 100644 index 00000000..c4f0d466 --- /dev/null +++ b/vdb_benchmark/vdbbench/configs/1m_diskann_512dim.yaml @@ -0,0 +1,26 @@ +database: + host: 127.0.0.1 + port: 19530 + database: milvus + max_receive_message_length: 514_983_574 + max_send_message_length: 514_983_574 + +dataset: + collection_name: mlps_1m_1shards_512dim_uniform_diskann + num_vectors: 1_000_000 + dimension: 512 + distribution: uniform + chunk_size: 100_000 + batch_size: 1000 + num_shards: 1 + vector_dtype: FLOAT_VECTOR + +index: + index_type: DISKANN + metric_type: COSINE + #index_params + max_degree: 32 + search_list_size: 100 + +workflow: + compact: True diff --git a/vdb_benchmark/vdbbench/load_vdb.py b/vdb_benchmark/vdbbench/load_vdb.py index 0a7a9324..b8261303 100644 --- a/vdb_benchmark/vdbbench/load_vdb.py +++ b/vdb_benchmark/vdbbench/load_vdb.py @@ -47,6 +47,7 @@ def parse_args(): parser.add_argument("--search-list-size", type=int, 
default=200, help="DiskANN SearchListSize parameter") parser.add_argument("--M", type=int, default=16, help="HNSW M parameter") parser.add_argument("--ef-construction", type=int, default=200, help="HNSW efConstruction parameter") + parser.add_argument("--inline-pq", type=int, default=16, help="AISAQ inline_pq parameter, performance(max_degree) vs scale(0) mode") # Monitoring parameters parser.add_argument("--monitor-interval", type=int, default=5, help="Interval in seconds for monitoring index building") @@ -78,6 +79,7 @@ def parse_args(): 'search_list_size': args.search_list_size == 200, 'M': args.M == 16, 'ef_construction': args.ef_construction == 200, + 'inline_pq': args.inline_pq == 16, 'monitor_interval': args.monitor_interval == 5, 'compact': not args.compact, # Default is False 'force': not args.force, # Default is False @@ -301,6 +303,12 @@ def main(): "MaxDegree": args.max_degree, "SearchListSize": args.search_list_size } + elif args.index_type == "AISAQ": + index_params["params"] = { + "inline_pq": args.inline_pq, + "max_degree": args.max_degree, + "search_list_size": args.search_list_size + } else: raise ValueError(f"Unsupported index_type: {args.index_type}") From d1fc97a1516b1bfe7ee246d279e4a747b54e8f44 Mon Sep 17 00:00:00 2001 From: Hazem Awadallah Date: Tue, 27 Jan 2026 15:42:46 -0800 Subject: [PATCH 04/43] feat(kv-cache): MLPerf v3.0 compliance and configuration overhaul - Add ConfigLoader class with YAML config file support and schema validation - Add cfg() helper function for config-driven parameter access - Add validate_args() with safety limits for protected system paths - Rename all nvme_* metrics to storage_* for MLPerf terminology compliance - Add extended QoS percentiles: P99.9 and P99.99 latency tracking - Add per-tier bandwidth metrics (read/write GB/s per tier) - Add per-tier KV bytes tracking for detailed storage analysis - Fix GPU metadata desync bug via on_eviction_callback pattern - Change eviction from single-shot to iterative loop 
until space freed - Replace print statements with Python logging module - Add waterfall LRU eviction with configurable high/low watermarks - Add storage_health section with PASS/FAIL criteria - Add storage_throughput_tokens_per_sec as primary MLPerf metric --- kv_cache_benchmark/kv-cache.py | 1114 +++++++++++++++++++++++++------- 1 file changed, 868 insertions(+), 246 deletions(-) diff --git a/kv_cache_benchmark/kv-cache.py b/kv_cache_benchmark/kv-cache.py index 106418a5..70194664 100644 --- a/kv_cache_benchmark/kv-cache.py +++ b/kv_cache_benchmark/kv-cache.py @@ -47,6 +47,17 @@ from collections import defaultdict import argparse import csv +import logging + +# Configure module-level logger +logger = logging.getLogger(__name__) + +# Optional YAML support for config file loading +try: + import yaml + YAML_AVAILABLE = True +except ImportError: + YAML_AVAILABLE = False # Attempt to import optional GPU libraries (torch, cupy) # The benchmark can run in a CPU-only environment if these are not found. @@ -82,6 +93,207 @@ OPENPYXL_AVAILABLE = False +# ============================================================================ +# CONFIGURATION LOADER +# Loads benchmark configuration from YAML files with strict validation. +# ============================================================================ + +class ConfigLoader: + """ + Loads and validates benchmark configuration from YAML files. + + Raises errors on invalid/unknown keys to prevent silent misconfigurations + in MLPerf competition submissions. 
+ """ + + # Define the valid configuration schema with expected types + VALID_SCHEMA = { + 'user_templates': { + 'chatbot': {'context_range': list, 'generation_range': list, 'think_time_range': list}, + 'coding': {'context_range': list, 'generation_range': list, 'think_time_range': list}, + 'document': {'context_range': list, 'generation_range': list, 'think_time_range': list}, + }, + 'generation_timing': { + 'none': (int, float), + 'fast': (int, float), + 'realistic': (int, float), + }, + 'qos_profiles': { + 'interactive': {'target_latency_p95_ms': (int, float), 'target_latency_p99_ms': (int, float), + 'target_latency_p999_ms': (int, float), 'target_latency_p9999_ms': (int, float), 'priority': int}, + 'responsive': {'target_latency_p95_ms': (int, float), 'target_latency_p99_ms': (int, float), + 'target_latency_p999_ms': (int, float), 'target_latency_p9999_ms': (int, float), 'priority': int}, + 'batch': {'target_latency_p95_ms': (int, float), 'target_latency_p99_ms': (int, float), + 'target_latency_p999_ms': (int, float), 'target_latency_p9999_ms': (int, float), 'priority': int}, + }, + 'qos_distribution': { + 'interactive_probability': (int, float), + 'responsive_threshold': (int, float), + }, + 'eviction': { + 'max_recursion_depth': int, + 'target_usage_ratio': (int, float), + 'large_entry_limit_ratio': (int, float), + 'max_evictions_hard_cap': int, + 'max_evictions_min': int, + }, + 'gpu_backend': { + 'memory_fraction': (int, float), + 'max_eviction_attempts': int, + 'free_memory_threshold': (int, float), + }, + 'prefix_cache': { + 'min_prefix_length': int, + 'max_prefix_entries': int, + 'system_prompt_hit_probability': (int, float), + }, + 'rag': { + 'chunk_size_tokens': int, + 'top_k_chunks': int, + 'max_chunk_bytes': int, + }, + 'conversation': { + 'max_conversations': int, + 'max_turns_per_conv': int, + 'end_conversation_probability': (int, float), + }, + 'autoscaler': { + 'min_users': int, + 'max_users': int, + 'scale_up_factor': (int, float), + 
'scale_down_factor': (int, float), + 'consecutive_samples_required': int, + }, + 'decode': { + 'batch_size': int, + }, + 'sharegpt': { + 'max_context_tokens': int, + 'max_generation_tokens': int, + 'chars_per_token_estimate': int, + }, + 'saturation_detection': { + 'read_latency_p95_threshold_ms': (int, float), + 'write_latency_p95_threshold_ms': (int, float), + 'queue_depth_threshold': int, + 'history_window_size': int, + }, + 'validation_limits': { + 'max_users': int, + 'max_duration_seconds': int, + 'max_gpu_memory_gb': int, + 'max_cpu_memory_gb': int, + }, + } + + def __init__(self, config_path: Optional[str] = None): + """ + Initialize the ConfigLoader. + + Args: + config_path: Path to YAML config file. If None, uses built-in defaults. + """ + self.config_path = config_path + self.config = {} + + if config_path: + self._load_and_validate(config_path) + + def _load_and_validate(self, config_path: str) -> None: + """Load YAML config and validate strictly against schema.""" + if not YAML_AVAILABLE: + raise RuntimeError("pyyaml is required for config file support. Install with: pip install pyyaml") + + path = Path(config_path) + if not path.exists(): + raise FileNotFoundError(f"Config file not found: {config_path}") + + with open(path, 'r') as f: + self.config = yaml.safe_load(f) or {} + + # Validate all keys against schema + self._validate_keys(self.config, self.VALID_SCHEMA, path_prefix='') + + logger.info(f"Loaded configuration from {config_path}") + + def _validate_keys(self, config: dict, schema: dict, path_prefix: str) -> None: + """Recursively validate config keys against schema. Raises on unknown keys.""" + for key, value in config.items(): + full_path = f"{path_prefix}.{key}" if path_prefix else key + + if key not in schema: + raise ValueError(f"Unknown configuration key: '{full_path}'. 
" + f"Valid keys at this level: {list(schema.keys())}") + + expected_type = schema[key] + + # If schema expects a dict, recurse + if isinstance(expected_type, dict): + if not isinstance(value, dict): + raise ValueError(f"Config key '{full_path}' must be a dict, got {type(value).__name__}") + self._validate_keys(value, expected_type, full_path) + else: + # Validate type + if isinstance(expected_type, tuple): + if not isinstance(value, expected_type): + raise ValueError(f"Config key '{full_path}' must be one of {expected_type}, " + f"got {type(value).__name__}") + elif not isinstance(value, expected_type): + raise ValueError(f"Config key '{full_path}' must be {expected_type.__name__}, " + f"got {type(value).__name__}") + + def get(self, *keys, default=None): + """ + Get a nested configuration value. + + Args: + *keys: Path to the config value (e.g., 'qos_profiles', 'interactive', 'priority') + default: Default value if key not found + + Returns: + The config value or default + """ + value = self.config + for key in keys: + if isinstance(value, dict) and key in value: + value = value[key] + else: + return default + return value + + +# Global config instance (set from main() when --config is provided) +_global_config: Optional[ConfigLoader] = None + + +def get_config() -> Optional[ConfigLoader]: + """Get the global configuration loader instance.""" + return _global_config + + +def set_config(config: ConfigLoader) -> None: + """Set the global configuration loader instance.""" + global _global_config + _global_config = config + + +def cfg(*keys, default=None): + """ + Get a configuration value from the global config, with fallback to default. 
+ + Args: + *keys: Path to the config value (e.g., 'qos_profiles', 'interactive', 'priority') + default: Default value if config not loaded or key not found + + Returns: + The config value or default + """ + config = get_config() + if config is None: + return default + return config.get(*keys, default=default) + + # ============================================================================ # CORE DATA MODELS # Defines the basic data structures used throughout the benchmark. @@ -213,8 +425,10 @@ class QoSSLA: Defines the performance targets and tracks violations. """ qos_level: QoSLevel - target_latency_p95_ms: float # The 95th percentile latency target. - target_latency_p99_ms: float # The 99th percentile latency target. + target_latency_p95_ms: float # The 95th percentile latency target. + target_latency_p99_ms: float # The 99th percentile latency target. + target_latency_p999_ms: float # The 99.9th percentile latency target (3 nines). + target_latency_p9999_ms: float # The 99.99th percentile latency target (4 nines). priority: int # An integer priority level (higher is more important). # SLA violation tracking @@ -229,29 +443,46 @@ def sla_compliance(self) -> float: return 1.0 - (self.violations / self.total_requests) -# Pre-defined QoS profiles mapping each level to a specific SLA. 
-QOS_PROFILES = { - QoSLevel.INTERACTIVE: QoSSLA( - qos_level=QoSLevel.INTERACTIVE, - target_latency_p95_ms=50, - target_latency_p99_ms=100, - priority=3 - ), - QoSLevel.RESPONSIVE: QoSSLA( - qos_level=QoSLevel.RESPONSIVE, - target_latency_p95_ms=100, - target_latency_p99_ms=200, - priority=2 - ), - QoSLevel.BATCH: QoSSLA( - qos_level=QoSLevel.BATCH, - target_latency_p95_ms=1000, - target_latency_p99_ms=5000, - priority=1 - ) +# Default QoS profile values (overridden by config.yaml when loaded) +_DEFAULT_QOS_PROFILES = { + 'interactive': {'target_latency_p95_ms': 50, 'target_latency_p99_ms': 100, + 'target_latency_p999_ms': 150, 'target_latency_p9999_ms': 200, 'priority': 3}, + 'responsive': {'target_latency_p95_ms': 100, 'target_latency_p99_ms': 200, + 'target_latency_p999_ms': 350, 'target_latency_p9999_ms': 500, 'priority': 2}, + 'batch': {'target_latency_p95_ms': 1000, 'target_latency_p99_ms': 5000, + 'target_latency_p999_ms': 7500, 'target_latency_p9999_ms': 10000, 'priority': 1}, } +def get_qos_profiles() -> Dict[QoSLevel, QoSSLA]: + """ + Returns QoS profiles, using config.yaml values if loaded, otherwise defaults. 
+ """ + profiles = {} + for level in QoSLevel: + level_key = level.value # 'interactive', 'responsive', 'batch' + defaults = _DEFAULT_QOS_PROFILES[level_key] + + profiles[level] = QoSSLA( + qos_level=level, + target_latency_p95_ms=cfg('qos_profiles', level_key, 'target_latency_p95_ms', + default=defaults['target_latency_p95_ms']), + target_latency_p99_ms=cfg('qos_profiles', level_key, 'target_latency_p99_ms', + default=defaults['target_latency_p99_ms']), + target_latency_p999_ms=cfg('qos_profiles', level_key, 'target_latency_p999_ms', + default=defaults['target_latency_p999_ms']), + target_latency_p9999_ms=cfg('qos_profiles', level_key, 'target_latency_p9999_ms', + default=defaults['target_latency_p9999_ms']), + priority=cfg('qos_profiles', level_key, 'priority', default=defaults['priority']), + ) + return profiles + + +# For backward compatibility, QOS_PROFILES can still be used as a dict +# but code should prefer get_qos_profiles() to pick up config changes +QOS_PROFILES = get_qos_profiles() + + @dataclass class UserProfile: """Represents a simulated user with specific behavior patterns.""" @@ -352,10 +583,10 @@ class ConversationState: class ConversationManager: """Manages the lifecycle of all multi-turn conversations and enables cache reuse.""" - def __init__(self, max_conversations: int = 1000, max_turns_per_conv: int = 50): + def __init__(self, max_conversations: int = None, max_turns_per_conv: int = None): self.conversations: Dict[str, ConversationState] = {} - self.max_conversations = max_conversations - self.max_turns_per_conv = max_turns_per_conv + self.max_conversations = max_conversations if max_conversations is not None else cfg('conversation', 'max_conversations', default=1000) + self.max_turns_per_conv = max_turns_per_conv if max_turns_per_conv is not None else cfg('conversation', 'max_turns_per_conv', default=50) self.lock = threading.Lock() # Protects access to the shared conversations dictionary. 
def start_conversation(self, user_id: str, system_prompt: Optional[str] = None) -> str: @@ -535,8 +766,8 @@ class PrefixMatcher: "You are a professional writing assistant.", ] - def __init__(self, min_prefix_length: int = 50): - self.min_prefix_length = min_prefix_length + def __init__(self, min_prefix_length: int = None): + self.min_prefix_length = min_prefix_length if min_prefix_length is not None else cfg('prefix_cache', 'min_prefix_length', default=50) self.prefix_index: Dict[str, PrefixCacheEntry] = {} self.prefix_frequency: Dict[str, int] = {} self.lock = threading.Lock() @@ -548,8 +779,9 @@ def hash_prefix(self, text: str, token_count: int) -> str: def detect_system_prompt(self, context_tokens: int) -> Optional[PrefixCacheEntry]: """Simulates the detection of a common system prompt at the start of a request.""" - # In this simulation, 20% of requests are assumed to start with a common system prompt. - if random.random() < 0.2: + # Probability of requests having a common system prompt (configurable, default 20%). + system_prompt_hit_probability = cfg('prefix_cache', 'system_prompt_hit_probability', default=0.2) + if random.random() < system_prompt_hit_probability: system_prompt = random.choice(self.COMMON_SYSTEM_PROMPTS) prefix_hash = self.hash_prefix(system_prompt, len(system_prompt.split())) @@ -578,9 +810,9 @@ def detect_system_prompt(self, context_tokens: int) -> Optional[PrefixCacheEntry class PrefixCacheManager: """Orchestrates the prefix matching and caching logic.""" - def __init__(self, cache, max_prefix_entries: int = 1000): + def __init__(self, cache, max_prefix_entries: int = None): self.cache = cache # A reference to the main MultiTierCache. 
- self.max_prefix_entries = max_prefix_entries + self.max_prefix_entries = max_prefix_entries if max_prefix_entries is not None else cfg('prefix_cache', 'max_prefix_entries', default=1000) self.prefix_matcher = PrefixMatcher() self.lock = threading.Lock() @@ -672,10 +904,10 @@ def total_context_tokens(self) -> int: class RAGDocumentManager: """Manages the ingestion and retrieval of RAG document chunks.""" - def __init__(self, cache, chunk_size: int = 512, top_k_chunks: int = 5): + def __init__(self, cache, chunk_size: int = None, top_k_chunks: int = None): self.cache = cache # A reference to the main MultiTierCache. - self.chunk_size = chunk_size - self.top_k_chunks = top_k_chunks + self.chunk_size = chunk_size if chunk_size is not None else cfg('rag', 'chunk_size_tokens', default=512) + self.top_k_chunks = top_k_chunks if top_k_chunks is not None else cfg('rag', 'top_k_chunks', default=5) self.documents: Dict[str, RAGDocument] = {} self.chunk_index: Dict[str, RAGChunk] = {} @@ -685,12 +917,12 @@ def ingest_document(self, doc_id: str, total_tokens: int, model_config: ModelCon This involves splitting it into chunks and pre-calculating and storing the KV cache for each chunk in the multi-tier cache. """ - max_chunk_bytes = 256 * 1024**2 # Target ~256MB per chunk to limit memory pressure. 
+ max_chunk_bytes = cfg('rag', 'max_chunk_bytes', default=256 * 1024**2) # Target ~256MB per chunk bytes_per_token = max(model_config.kv_cache_size_per_token, 1) max_tokens_per_chunk = max(1, min(self.chunk_size, max_chunk_bytes // bytes_per_token)) if max_tokens_per_chunk < self.chunk_size: - print(f"[RAG] Adjusting chunk size for {doc_id} to {max_tokens_per_chunk} tokens " + logger.debug(f"Adjusting chunk size for {doc_id} to {max_tokens_per_chunk} tokens " f"to stay under {max_chunk_bytes / 1024**2:.0f} MB per chunk.") num_chunks = (total_tokens + max_tokens_per_chunk - 1) // max_tokens_per_chunk @@ -721,14 +953,14 @@ def ingest_document(self, doc_id: str, total_tokens: int, model_config: ModelCon num_tokens=chunk_tokens ) except MemoryError: - print(f"[RAG] MemoryError while ingesting chunk {chunk.chunk_id}; skipping remaining chunks.") + logger.error(f"MemoryError while ingesting chunk {chunk.chunk_id}; skipping remaining chunks.") break except Exception as exc: - print(f"[RAG] Error ingesting chunk {chunk.chunk_id}: {exc}") + logger.error(f"Error ingesting chunk {chunk.chunk_id}: {exc}") continue if not success: - print(f"[RAG] Warning: Failed to allocate cache for chunk {chunk.chunk_id}.") + logger.warning(f"Failed to allocate cache for chunk {chunk.chunk_id}.") continue chunk.storage_tier = location @@ -808,14 +1040,26 @@ class GPUMemoryBackend(StorageBackend): Uses PyTorch or CuPy for GPU operations. This is the fastest tier. """ - def __init__(self, use_torch=True): + def __init__(self, use_torch=True, on_eviction_callback=None): + """ + Initialize the GPU memory backend. + + Args: + use_torch: Whether to use PyTorch (vs CuPy) for GPU operations. + on_eviction_callback: Optional callback function called when entries are evicted + during OOM handling. Signature: callback(key: str, tier: str) + This allows the parent CacheManager to sync its metadata. 
+ """ + self.on_eviction_callback = on_eviction_callback + if use_torch and TORCH_AVAILABLE: self.backend = 'torch' self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') if self.device.type == 'cpu': raise RuntimeError("No GPU available for PyTorch backend") # Pre-allocate a large chunk of GPU memory to simulate a real server environment. - torch.cuda.set_per_process_memory_fraction(0.8, 0) + memory_fraction = cfg('gpu_backend', 'memory_fraction', default=0.8) + torch.cuda.set_per_process_memory_fraction(memory_fraction, 0) torch.cuda.empty_cache() elif CUPY_AVAILABLE: self.backend = 'cupy' @@ -832,16 +1076,65 @@ def write(self, key: str, data: np.ndarray) -> StorageBackend.IOTiming: Writes a NumPy array from CPU to GPU VRAM. Uses pinned memory and non-blocking transfers for maximum performance. """ - # Simple eviction mechanism if GPU runs out of memory. + # FIX: Iterative eviction mechanism for GPU OOM handling. + # The original code only evicted ONE entry which is insufficient for large allocations. + # We now evict multiple entries until there's enough space or we've exhausted options. 
if self.backend == 'torch' and torch.cuda.is_available(): - free_memory = torch.cuda.mem_get_info()[0] - if data.nbytes > free_memory * 0.9: + required_bytes = data.nbytes + max_eviction_attempts = cfg('gpu_backend', 'max_eviction_attempts', default=100) + eviction_count = 0 + # Threshold for free memory (inverted: 0.1 means keep 10% free, so use 90%) + free_memory_threshold = cfg('gpu_backend', 'free_memory_threshold', default=0.1) + usable_fraction = 1.0 - free_memory_threshold # e.g., 0.9 if threshold is 0.1 + + while eviction_count < max_eviction_attempts: + free_memory = torch.cuda.mem_get_info()[0] + # Use configurable threshold to leave headroom + if required_bytes <= free_memory * usable_fraction: + break # We have enough space + + # Try clearing the CUDA cache first torch.cuda.empty_cache() - if data.nbytes > torch.cuda.mem_get_info()[0] * 0.9: - if len(self.cache) > 0: - oldest_key = list(self.cache.keys())[0] - del self.cache[oldest_key] - torch.cuda.empty_cache() + free_memory = torch.cuda.mem_get_info()[0] + if required_bytes <= free_memory * usable_fraction: + break + + # If no entries to evict, we're out of options + if len(self.cache) == 0: + # Log warning and let the allocation proceed (it may OOM) + logger.warning( + f"GPU OOM: Need {required_bytes / 1024**2:.1f}MB, " + f"have {free_memory / 1024**2:.1f}MB, no entries to evict" + ) + break + + # Evict the oldest entry (first key in dict, which is insertion-ordered) + oldest_key = next(iter(self.cache)) + evicted_tensor = self.cache.pop(oldest_key) + evicted_size = evicted_tensor.element_size() * evicted_tensor.nelement() + del evicted_tensor + + # Also clean up pinned memory if present + if oldest_key in self.pinned_memory: + del self.pinned_memory[oldest_key] + + # Notify parent CacheManager to sync its metadata + if self.on_eviction_callback: + try: + self.on_eviction_callback(oldest_key, 'gpu', evicted_size) + except Exception as e: + logger.warning(f"GPU eviction callback failed for 
{oldest_key}: {e}") + + eviction_count += 1 + logger.debug( + f"GPU eviction #{eviction_count}: evicted {oldest_key} " + f"({evicted_size / 1024**2:.1f}MB)" + ) + + # Final cache clear after evictions + if eviction_count > 0: + torch.cuda.empty_cache() + logger.debug(f"GPU: evicted {eviction_count} entries to make room for {key}") start = time.perf_counter() @@ -1063,8 +1356,7 @@ def clear(self): def __del__(self): """Cleans up the temporary directory when the object is destroyed.""" if self.temp_dir: - import shutil - shutil.rmtree(self.temp_dir, ignore_errors=True) + self.temp_dir.cleanup() class KVCacheGenerator: @@ -1080,7 +1372,7 @@ def __init__(self, model_config: ModelConfig, global_seed: Optional[int] = None) self.buffer_size_elements = 128 * 1024 * 1024 # 128 million elements (~256MB for float16) self.dtype = np.float16 if 'float16' in self.model_config.dtype else np.float32 - print(f"[KVCacheGenerator] Pre-generating {self.buffer_size_elements * 2 / 1024**2:.0f} MB noise buffer...") + logger.info(f"Pre-generating {self.buffer_size_elements * 2 / 1024**2:.0f} MB noise buffer...") rng = np.random.default_rng(self.global_seed) self.precomputed_buffer = rng.uniform(-1.0, 1.0, size=self.buffer_size_elements).astype(self.dtype) @@ -1165,9 +1457,13 @@ def __init__(self, self.backends = {} try: if TORCH_AVAILABLE or CUPY_AVAILABLE: - self.backends['gpu'] = GPUMemoryBackend(use_torch=TORCH_AVAILABLE) + # Pass eviction callback to sync metadata when GPU OOM forces evictions + self.backends['gpu'] = GPUMemoryBackend( + use_torch=TORCH_AVAILABLE, + on_eviction_callback=self._handle_gpu_eviction + ) except Exception as e: - print(f"Warning: Could not initialize GPU backend: {e}") + logger.warning(f"Could not initialize GPU backend: {e}") self.backends['cpu'] = CPUMemoryBackend() self.backends['nvme'] = NVMeBackend(base_path=cache_dir) @@ -1193,22 +1489,41 @@ def __init__(self, self.allocation_semaphore = None # Dictionary for collecting a wide range of performance 
metrics. + # NAMING CONVENTION (MLPerf v3.0): + # - "storage" refers to the NVMe/SSD tier (was "nvme" in earlier versions) + # - "tier_X_kv_bytes_written" = KV cache bytes written to tier X + # - "tier_X_kv_bytes_read" = KV cache bytes read from tier X self.stats = { 'cache_hits': 0, 'cache_misses': 0, 'evictions': 0, - 'offloads_cpu': 0, # Prefills that went directly to CPU. - 'offloads_nvme': 0, # Prefills that went directly to NVMe. + 'offloads_cpu': 0, # Writes that went directly to CPU tier. + 'offloads_storage': 0, # Writes that went directly to Storage tier. # Latency lists for each tier and operation. - 'gpu_read_latencies': [], 'cpu_read_latencies': [], 'nvme_read_latencies': [], - 'gpu_write_latencies': [], 'cpu_write_latencies': [], 'nvme_write_latencies': [], - 'nvme_read_device_latencies': [], 'nvme_read_host_latencies': [], - 'nvme_write_device_latencies': [], 'nvme_write_host_latencies': [], - - # Phase-specific I/O metrics. + # + # LATENCY TERMINOLOGY: + # - Total latency = Host + Device latency (full operation time) + # - Host latency = CPU/memory work (serialization, copying, page cache ops) + # - Device latency = Actual storage I/O (fsync for writes, file read for reads) + # + # For Storage tier (NVMe/SSD): + # Write: host = np.save() time, device = fsync() time + # Read: host = page cache drop + array copy, device = np.load() time + # + 'gpu_read_latencies': [], 'cpu_read_latencies': [], 'storage_read_latencies': [], + 'gpu_write_latencies': [], 'cpu_write_latencies': [], 'storage_write_latencies': [], + # Storage-tier-specific breakdown (device = disk I/O, host = serialization) + 'storage_read_device_latencies': [], 'storage_read_host_latencies': [], + 'storage_write_device_latencies': [], 'storage_write_host_latencies': [], + + # Phase-specific I/O metrics (aggregate - kept for backward compatibility). 
'prefill_writes': 0, 'decode_reads': 0, - 'prefill_bytes_written': 0, 'decode_bytes_read': 0, + + # Tier-specific KV cache bytes (NEW NAMING - MLPerf v3.0) + # Written = data stored to tier, Read = data retrieved from tier + 'tier_gpu_kv_bytes_written': 0, 'tier_cpu_kv_bytes_written': 0, 'tier_storage_kv_bytes_written': 0, + 'tier_gpu_kv_bytes_read': 0, 'tier_cpu_kv_bytes_read': 0, 'tier_storage_kv_bytes_read': 0, # Cache type metrics for analyzing hit sources. 'system_prompt_hits': 0, 'common_phrase_hits': 0, @@ -1218,8 +1533,8 @@ def __init__(self, 'total_read_bytes': 0, 'total_write_bytes': 0, 'read_operations': 0, 'write_operations': 0, - # New counter for NVMe tokens processed (for throughput assessment) - 'nvme_tokens_processed': 0, + # Counter for storage tier tokens processed (for throughput assessment) + 'storage_tokens_processed': 0, } def _get_entry_lock(self, key: str) -> threading.Lock: @@ -1229,6 +1544,33 @@ def _get_entry_lock(self, key: str) -> threading.Lock: self.entry_locks[key] = threading.Lock() return self.entry_locks[key] + def _handle_gpu_eviction(self, key: str, tier: str, evicted_size: int) -> None: + """ + Callback invoked by GPUMemoryBackend when it evicts entries during OOM handling. + + This syncs the CacheManager's metadata with the actual GPU cache state. + Without this callback, cache_entries would still reference evicted entries, + causing KeyErrors on subsequent read attempts. 
+ + Args: + key: The cache key that was evicted + tier: The tier from which eviction occurred (always 'gpu' for this callback) + evicted_size: Size in bytes of the evicted entry + """ + with self.metadata_lock: + if key in self.cache_entries: + del self.cache_entries[key] + if key in self.entry_locks: + del self.entry_locks[key] + + with self.memory_lock: + self.gpu_memory_used = max(0, self.gpu_memory_used - evicted_size) + + with self.stats_lock: + self.stats['evictions'] += 1 + + logger.debug(f"GPU eviction synced: removed {key} from cache metadata") + # ======================================================================== # WATERFALL LRU EVICTION METHODS # These methods implement a hierarchical cache eviction strategy where @@ -1347,39 +1689,24 @@ def _demote_entry(self, key: str, from_tier: str, to_tier: str) -> Tuple[bool, f if to_tier == 'cpu': self.stats['offloads_cpu'] += 1 elif to_tier == 'nvme': - self.stats['offloads_nvme'] += 1 - # Track tokens processed for NVMe throughput calculation - # Assuming size is bytes, and we know dtype size from model config - # But simpler: we can estimate tokens from size if needed, or just track bytes - # The user asked for 'nvme_tokens_processed'. - # We can approximate tokens = size / (2 * layers * heads * dim * dtype_size) - # Or just use the 'num_tokens' if we had it. - # Since we don't have num_tokens easily here without looking up the key again or storing it, - # let's look at the entry dict which should have it if we stored it. - # The current cache_entries dict stores: 'location', 'size', 'last_access', 'access_count'. - # It does NOT store num_tokens. - # However, size is directly proportional. - # Let's just track bytes for now and convert later if needed, OR - # better yet, let's add num_tokens to the cache entry metadata in allocate_cache. - # For now, to fix the immediate request without changing data structures too much: - # We will estimate tokens based on size. 
- # size = num_tokens * layers * 2 * heads * dim * 2 (for float16) - # so num_tokens = size / (layers * 4 * heads * dim) - bytes_per_token = ( - self.model_config.num_layers * - 2 * # K and V - self.model_config.kv_heads * - self.model_config.kv_dim_per_head * - 2 # float16 bytes - ) - tokens = int(size / bytes_per_token) - self.stats['nvme_tokens_processed'] += tokens + self.stats['offloads_storage'] += 1 + # Track tokens processed for Storage tier throughput calculation + # FIX: Use pre-computed property to avoid integer overflow on 32-bit systems. + # The ModelConfig.kv_cache_size_per_token property already computes this correctly. + # Python 3's // operator uses arbitrary-precision integers, avoiding overflow. + bytes_per_token = self.model_config.kv_cache_size_per_token + if bytes_per_token > 0: + # Pure integer division - Python 3 int has unlimited precision + tokens = size // bytes_per_token + self.stats['storage_tokens_processed'] += tokens + else: + logger.warning("bytes_per_token is 0, skipping token count update") total_latency = read_timing.total + write_timing.total return True, total_latency except Exception as e: - print(f"[KVCache] Failed to demote {key} from {from_tier} to {to_tier}: {e}") + logger.error(f"Failed to demote {key} from {from_tier} to {to_tier}: {e}") return False, 0.0 def _ensure_space_in_tier(self, tier: str, required_bytes: int, recursion_depth: int = 0) -> bool: @@ -1405,10 +1732,10 @@ def _ensure_space_in_tier(self, tier: str, required_bytes: int, recursion_depth: if tier == 'nvme': return True - # Safety limit to prevent runaway eviction cascades - max_recursion = 10 + # Safety limit to prevent runaway eviction cascades (configurable) + max_recursion = cfg('eviction', 'max_recursion_depth', default=10) if recursion_depth > max_recursion: - print(f"[KVCache] Warning: Hit recursion limit in _ensure_space_in_tier") + logger.warning("Hit recursion limit in _ensure_space_in_tier") return False tier_order = self._get_tier_order() @@ 
-1422,28 +1749,31 @@ def _ensure_space_in_tier(self, tier: str, required_bytes: int, recursion_depth: return False limit = self._get_tier_limit(tier) - target_usage = limit * 0.8 # Keep 20% buffer consistent with original code + target_usage_ratio = cfg('eviction', 'target_usage_ratio', default=0.8) + target_usage = limit * target_usage_ratio # Keep buffer consistent with config # If the entry is larger than the tier can physically hold, skip to next tier - if required_bytes > limit * 0.95: # Allow up to 95% for a single large entry + large_entry_limit_ratio = cfg('eviction', 'large_entry_limit_ratio', default=0.95) + if required_bytes > limit * large_entry_limit_ratio: return False # Calculate a reasonable eviction limit based on tier capacity. # For large models (e.g., 70B), entries can be hundreds of MB each, # so we may need to evict many entries to make room for one large request. - # Use the number of entries in the tier as a guide, with a minimum of 1000. + # Use the number of entries in the tier as a guide, with a minimum from config. entries_in_tier = len(self._get_lru_entries_in_tier(tier)) # FIX: Cap the max evictions to prevent infinite loops if we can't clear enough space # The previous logic could loop forever if entries_in_tier kept growing or didn't reduce fast enough. - # We set a hard cap of 5000 or slightly more than current entries. 
- max_evictions_per_call = min(5000, max(1000, entries_in_tier + 100)) + max_evictions_hard_cap = cfg('eviction', 'max_evictions_hard_cap', default=5000) + max_evictions_min = cfg('eviction', 'max_evictions_min', default=1000) + max_evictions_per_call = min(max_evictions_hard_cap, max(max_evictions_min, entries_in_tier + 100)) eviction_count = 0 while eviction_count < max_evictions_per_call: # Check if we have enough space now with self.memory_lock: current_usage = self._get_tier_usage(tier) - # Normal case: fit within the 80% target + # Normal case: fit within the target if current_usage + required_bytes <= target_usage: # FIX: Atomic Reservation # We must reserve the space NOW, inside the lock, to prevent other threads @@ -1451,8 +1781,8 @@ def _ensure_space_in_tier(self, tier: str, required_bytes: int, recursion_depth: self._update_tier_usage(tier, required_bytes) return True - # Large entry case: if we've cleared the tier, allow up to 95% of limit - if current_usage < limit * 0.05 and required_bytes <= limit * 0.95: + # Large entry case: if we've cleared the tier, allow up to large_entry_limit_ratio of limit + if current_usage < limit * 0.05 and required_bytes <= limit * large_entry_limit_ratio: # FIX: Atomic Reservation here too self._update_tier_usage(tier, required_bytes) return True @@ -1500,7 +1830,7 @@ def _ensure_space_in_tier(self, tier: str, required_bytes: int, recursion_depth: # Recursively ensure the next tier has space for this entry if not self._ensure_space_in_tier(next_tier, lru_size, recursion_depth + 1): - print(f"[KVCache] Warning: Could not make space in {next_tier} for demotion") + logger.warning(f"Could not make space in {next_tier} for demotion") # If we can't move the LRU item, we can't make space. # We should probably abort to avoid spinning. 
return False @@ -1555,10 +1885,10 @@ def _allocate_cache_inner(self, key: str, num_tokens: int, phase: InferencePhase try: data = self.generator.generate(sequence_length=num_tokens, key=key) except MemoryError: - print(f"[KVCache] MemoryError generating cache for key {key} ({num_tokens} tokens)") + logger.error(f"MemoryError generating cache for key {key} ({num_tokens} tokens)") return False, 'none', 0.0 except Exception as exc: - print(f"[KVCache] Failed to generate cache for key {key}: {exc}") + logger.error(f"Failed to generate cache for key {key}: {exc}") return False, 'none', 0.0 size_bytes = data.nbytes @@ -1567,7 +1897,6 @@ def _allocate_cache_inner(self, key: str, num_tokens: int, phase: InferencePhase with self.stats_lock: if phase == InferencePhase.PREFILL: self.stats['prefill_writes'] += 1 - self.stats['prefill_bytes_written'] += size_bytes self.stats['write_operations'] += 1 self.stats['total_write_bytes'] += size_bytes @@ -1608,7 +1937,7 @@ def _allocate_cache_inner(self, key: str, num_tokens: int, phase: InferencePhase allocated_tier = tier break - # Final fallback to NVMe if all else fails + # Final fallback to storage tier if all else fails if allocated_tier is None: allocated_tier = 'nvme' @@ -1630,17 +1959,23 @@ def _allocate_cache_inner(self, key: str, num_tokens: int, phase: InferencePhase 'access_count': 1 } - # Record latency and offload stats. + # Record latency, offload stats, and tier-specific KV bytes written. 
with self.stats_lock: + # Map internal tier name to stats key ('nvme' -> 'storage') + tier_stats_name = 'storage' if allocated_tier == 'nvme' else allocated_tier + + # Track KV bytes written per tier + self.stats[f'tier_{tier_stats_name}_kv_bytes_written'] += size_bytes + if allocated_tier == 'cpu': self.stats['offloads_cpu'] += 1 self.stats['cpu_write_latencies'].append(timing.total) elif allocated_tier == 'nvme': - self.stats['offloads_nvme'] += 1 - self.stats['nvme_write_latencies'].append(timing.total) - self.stats['nvme_write_device_latencies'].append(timing.device) - self.stats['nvme_write_host_latencies'].append(timing.host) - self.stats['nvme_tokens_processed'] += num_tokens + self.stats['offloads_storage'] += 1 + self.stats['storage_write_latencies'].append(timing.total) + self.stats['storage_write_device_latencies'].append(timing.device) + self.stats['storage_write_host_latencies'].append(timing.host) + self.stats['storage_tokens_processed'] += num_tokens elif allocated_tier == 'gpu': self.stats['gpu_write_latencies'].append(timing.total) @@ -1698,15 +2033,20 @@ def access_cache(self, key: str, phase: InferencePhase = InferencePhase.DECODE, elif cache_type == 'multi_turn': self.stats['multi_turn_hits'] += 1 else: self.stats['user_cache_hits'] += 1 - # Track phase-specific I/O. + # Map internal tier name to stats key ('nvme' -> 'storage') + tier_stats_name = 'storage' if location == 'nvme' else location + + # Track KV bytes read per tier + self.stats[f'tier_{tier_stats_name}_kv_bytes_read'] += entry_size + + # Track aggregate decode reads if phase == InferencePhase.DECODE: self.stats['decode_reads'] += 1 - self.stats['decode_bytes_read'] += entry_size self.stats['read_operations'] += 1 self.stats['total_read_bytes'] += entry_size - # Perform the actual read from the correct backend (GPU, CPU, or NVMe). + # Perform the actual read from the correct backend (GPU, CPU, or Storage). 
try: _, timing = self.backends[location].read(key) @@ -1717,16 +2057,15 @@ def access_cache(self, key: str, phase: InferencePhase = InferencePhase.DECODE, elif location == 'cpu': self.stats['cpu_read_latencies'].append(timing.total) else: - self.stats['nvme_read_latencies'].append(timing.total) - self.stats['nvme_read_device_latencies'].append(timing.device) - self.stats['nvme_read_host_latencies'].append(timing.host) + self.stats['storage_read_latencies'].append(timing.total) + self.stats['storage_read_device_latencies'].append(timing.device) + self.stats['storage_read_host_latencies'].append(timing.host) - #The access_cache function already retrieves the size of the entry in bytes: entry_size = entry['size']. - #The number of tokens can be calculated by dividing entry_size by the size of a single token's KV cache, which is available via self.model_config.kv_cache_size_per_token. - #This calculation should happen only when the read is from the 'nvme' tier. + # Calculate tokens from entry size for throughput assessment. + # This calculation applies only for the storage tier. if self.model_config.kv_cache_size_per_token > 0: num_tokens = entry_size / self.model_config.kv_cache_size_per_token - self.stats['nvme_tokens_processed'] += num_tokens + self.stats['storage_tokens_processed'] += num_tokens return location, timing.total except Exception as e: @@ -1743,10 +2082,10 @@ def _evaluate_storage_performance(self, duration: float) -> Dict: # Throughput-focused profile for MLPerf submission if self.performance_profile == 'throughput': - # Criterion: Throughput should be based on tokens processed by the NVMe tier. - nvme_tokens = self.stats.get('nvme_tokens_processed', 0) + # Criterion: Throughput should be based on tokens processed by the storage tier. + storage_tokens = self.stats.get('storage_tokens_processed', 0) # Correctly use the benchmark's full duration for an accurate tok/s calculation. 
- throughput = nvme_tokens / duration if duration > 0 else 0 + throughput = storage_tokens / duration if duration > 0 else 0 passed = throughput > 0 # Simple check to ensure it ran criteria.append({ @@ -1763,29 +2102,33 @@ def _evaluate_storage_performance(self, duration: float) -> Dict: } # Latency-focused profile (default) - # Criterion 1: NVMe Write P95 latency should be less than 500ms. - nvme_write_device = self.stats.get('nvme_write_device_latencies', []) - nvme_write_total = self.stats.get('nvme_write_latencies', []) - nvme_write_basis = nvme_write_device if nvme_write_device else nvme_write_total - if nvme_write_basis: - nvme_write_p95 = np.percentile(nvme_write_basis, 95) * 1000 - passed = nvme_write_p95 < 500 + # Criterion 1: Storage tier Write Device P95 latency should be less than 500ms. + # "Device" = actual disk I/O (fsync), excludes serialization overhead. + storage_write_device = self.stats.get('storage_write_device_latencies', []) + storage_write_total = self.stats.get('storage_write_latencies', []) + storage_write_basis = storage_write_device if storage_write_device else storage_write_total + latency_type = 'Device' if storage_write_device else 'Total' + if storage_write_basis: + storage_write_p95 = np.percentile(storage_write_basis, 95) * 1000 + passed = storage_write_p95 < 500 criteria.append({ - 'name': 'NVMe Write P95 < 500ms', - 'target': 500, 'actual': nvme_write_p95, 'unit': 'ms', 'passed': passed + 'name': f'Storage Tier Write {latency_type} P95 < 500ms', + 'target': 500, 'actual': storage_write_p95, 'unit': 'ms', 'passed': passed }) all_passed = all_passed and passed - # Criterion 2: NVMe Read P95 latency should be less than 200ms. 
- nvme_read_device = self.stats.get('nvme_read_device_latencies', []) - nvme_read_total = self.stats.get('nvme_read_latencies', []) - nvme_read_basis = nvme_read_device if nvme_read_device else nvme_read_total - if nvme_read_basis: - nvme_read_p95 = np.percentile(nvme_read_basis, 95) * 1000 - passed = nvme_read_p95 < 200 + # Criterion 2: Storage tier Read Device P95 latency should be less than 200ms. + # "Device" = actual disk I/O (np.load), excludes page cache/copy overhead. + storage_read_device = self.stats.get('storage_read_device_latencies', []) + storage_read_total = self.stats.get('storage_read_latencies', []) + storage_read_basis = storage_read_device if storage_read_device else storage_read_total + latency_type = 'Device' if storage_read_device else 'Total' + if storage_read_basis: + storage_read_p95 = np.percentile(storage_read_basis, 95) * 1000 + passed = storage_read_p95 < 200 criteria.append({ - 'name': 'NVMe Read P95 < 200ms', - 'target': 200, 'actual': nvme_read_p95, 'unit': 'ms', 'passed': passed + 'name': f'Storage Tier Read {latency_type} P95 < 200ms', + 'target': 200, 'actual': storage_read_p95, 'unit': 'ms', 'passed': passed }) all_passed = all_passed and passed @@ -1841,22 +2184,45 @@ def get_stats(self, duration: float) -> Dict: # Get the pass/fail assessment. 
storage_health = self._evaluate_storage_performance(duration) + # Calculate per-tier bandwidth (GB/s) + tier_gpu_read_bytes = self.stats['tier_gpu_kv_bytes_read'] + tier_gpu_write_bytes = self.stats['tier_gpu_kv_bytes_written'] + tier_cpu_read_bytes = self.stats['tier_cpu_kv_bytes_read'] + tier_cpu_write_bytes = self.stats['tier_cpu_kv_bytes_written'] + tier_storage_read_bytes = self.stats['tier_storage_kv_bytes_read'] + tier_storage_write_bytes = self.stats['tier_storage_kv_bytes_written'] + stats = { 'cache_hit_rate': hit_rate, 'cache_hits': stats_snapshot['cache_hits'], 'cache_misses': stats_snapshot['cache_misses'], 'gpu_entries': gpu_entries, 'cpu_entries': cpu_entries, - 'nvme_entries': nvme_entries, + 'storage_entries': nvme_entries, # Renamed from nvme_entries 'gpu_memory_used_gb': gpu_mem_used / 1024**3, 'cpu_memory_used_gb': cpu_mem_used / 1024**3, 'offloads_cpu': stats_snapshot['offloads_cpu'], - 'offloads_nvme': stats_snapshot['offloads_nvme'], + 'offloads_storage': stats_snapshot['offloads_storage'], # Renamed from offloads_nvme 'storage_health': storage_health, 'prefill_writes': self.stats['prefill_writes'], 'decode_reads': self.stats['decode_reads'], - 'prefill_bytes_written_gb': self.stats['prefill_bytes_written'] / 1024**3, - 'decode_bytes_read_gb': self.stats['decode_bytes_read'] / 1024**3, + + # Tier-specific KV cache bytes (NEW NAMING - MLPerf v3.0) + 'tier_gpu_kv_bytes_written_gb': tier_gpu_write_bytes / 1024**3, + 'tier_cpu_kv_bytes_written_gb': tier_cpu_write_bytes / 1024**3, + 'tier_storage_kv_bytes_written_gb': tier_storage_write_bytes / 1024**3, + 'tier_gpu_kv_bytes_read_gb': tier_gpu_read_bytes / 1024**3, + 'tier_cpu_kv_bytes_read_gb': tier_cpu_read_bytes / 1024**3, + 'tier_storage_kv_bytes_read_gb': tier_storage_read_bytes / 1024**3, + + # Per-tier bandwidth metrics (GB/s) + 'tier_gpu_read_bandwidth_gbps': (tier_gpu_read_bytes / 1024**3) / duration if duration > 0 else 0, + 'tier_gpu_write_bandwidth_gbps': (tier_gpu_write_bytes / 
1024**3) / duration if duration > 0 else 0, + 'tier_cpu_read_bandwidth_gbps': (tier_cpu_read_bytes / 1024**3) / duration if duration > 0 else 0, + 'tier_cpu_write_bandwidth_gbps': (tier_cpu_write_bytes / 1024**3) / duration if duration > 0 else 0, + 'tier_storage_read_bandwidth_gbps': (tier_storage_read_bytes / 1024**3) / duration if duration > 0 else 0, + 'tier_storage_write_bandwidth_gbps': (tier_storage_write_bytes / 1024**3) / duration if duration > 0 else 0, + 'system_prompt_hits': self.stats['system_prompt_hits'], 'common_phrase_hits': self.stats['common_phrase_hits'], 'user_cache_hits': self.stats['user_cache_hits'], @@ -1868,33 +2234,41 @@ def get_stats(self, duration: float) -> Dict: 'read_write_ratio': self.stats['total_read_bytes'] / max(self.stats['total_write_bytes'], 1), 'read_iops': self.stats['read_operations'], 'write_iops': self.stats['write_operations'], - 'nvme_tokens_processed': self.stats['nvme_tokens_processed'], + 'storage_tokens_processed': self.stats['storage_tokens_processed'], } - # Add latency percentiles for each tier. - for tier in ['gpu', 'cpu', 'nvme']: + # Add latency percentiles for each tier (including p99.9 and p99.99). 
+ # Map internal tier names to output names ('nvme' -> 'storage') + tier_mapping = {'gpu': 'gpu', 'cpu': 'cpu', 'nvme': 'storage'} + for internal_tier, output_tier in [('gpu', 'gpu'), ('cpu', 'cpu'), ('storage', 'storage')]: for op in ['read', 'write']: - latencies = self.stats[f'{tier}_{op}_latencies'] + latencies = self.stats.get(f'{internal_tier}_{op}_latencies', []) if latencies: lat_array = np.array(latencies) - stats[f'{tier}_{op}_p50_ms'] = np.percentile(lat_array, 50) * 1000 - stats[f'{tier}_{op}_p95_ms'] = np.percentile(lat_array, 95) * 1000 - stats[f'{tier}_{op}_p99_ms'] = np.percentile(lat_array, 99) * 1000 + stats[f'{output_tier}_{op}_p50_ms'] = np.percentile(lat_array, 50) * 1000 + stats[f'{output_tier}_{op}_p95_ms'] = np.percentile(lat_array, 95) * 1000 + stats[f'{output_tier}_{op}_p99_ms'] = np.percentile(lat_array, 99) * 1000 + stats[f'{output_tier}_{op}_p999_ms'] = np.percentile(lat_array, 99.9) * 1000 + stats[f'{output_tier}_{op}_p9999_ms'] = np.percentile(lat_array, 99.99) * 1000 - # Expose NVMe latency component breakdowns when present. + # Expose storage tier latency component breakdowns when present. 
for op in ['read', 'write']: - device_latencies = self.stats[f'nvme_{op}_device_latencies'] - host_latencies = self.stats[f'nvme_{op}_host_latencies'] + device_latencies = self.stats.get(f'storage_{op}_device_latencies', []) + host_latencies = self.stats.get(f'storage_{op}_host_latencies', []) if device_latencies: device_array = np.array(device_latencies) - stats[f'nvme_{op}_device_p50_ms'] = np.percentile(device_array, 50) * 1000 - stats[f'nvme_{op}_device_p95_ms'] = np.percentile(device_array, 95) * 1000 - stats[f'nvme_{op}_device_p99_ms'] = np.percentile(device_array, 99) * 1000 + stats[f'storage_{op}_device_p50_ms'] = np.percentile(device_array, 50) * 1000 + stats[f'storage_{op}_device_p95_ms'] = np.percentile(device_array, 95) * 1000 + stats[f'storage_{op}_device_p99_ms'] = np.percentile(device_array, 99) * 1000 + stats[f'storage_{op}_device_p999_ms'] = np.percentile(device_array, 99.9) * 1000 + stats[f'storage_{op}_device_p9999_ms'] = np.percentile(device_array, 99.99) * 1000 if host_latencies: host_array = np.array(host_latencies) - stats[f'nvme_{op}_host_p50_ms'] = np.percentile(host_array, 50) * 1000 - stats[f'nvme_{op}_host_p95_ms'] = np.percentile(host_array, 95) * 1000 - stats[f'nvme_{op}_host_p99_ms'] = np.percentile(host_array, 99) * 1000 + stats[f'storage_{op}_host_p50_ms'] = np.percentile(host_array, 50) * 1000 + stats[f'storage_{op}_host_p95_ms'] = np.percentile(host_array, 95) * 1000 + stats[f'storage_{op}_host_p99_ms'] = np.percentile(host_array, 99) * 1000 + stats[f'storage_{op}_host_p999_ms'] = np.percentile(host_array, 99.9) * 1000 + stats[f'storage_{op}_host_p9999_ms'] = np.percentile(host_array, 99.99) * 1000 return stats @@ -1975,23 +2349,32 @@ def collect_metrics(self, cache, queue_size): write_iops = int((write_delta / (16 * 1024)) / elapsed) if elapsed > 0 else 0 # Default to 0.0 if the keys don't exist (e.g., at the start of the run). 
- read_latency_p95_ms = stats.get('nvme_read_p95_ms', 0.0) - write_latency_p95_ms = stats.get('nvme_write_p95_ms', 0.0) + read_latency_p95_ms = stats.get('storage_read_p95_ms', 0.0) + write_latency_p95_ms = stats.get('storage_write_p95_ms', 0.0) # --- Saturation Detection Logic --- + # Read thresholds from config (with fallback to original hardcoded values) + read_lat_threshold = cfg('saturation_detection', 'read_latency_p95_threshold_ms', default=100) + write_lat_threshold = cfg('saturation_detection', 'write_latency_p95_threshold_ms', default=50) + queue_depth_threshold = cfg('saturation_detection', 'queue_depth_threshold', default=100) + is_saturated = False if len(self.metrics_history) >= 2: # Compare with the previous metric prev_metric = self.metrics_history[-2] - if (prev_metric.read_latency_p95_ms < 100 and prev_metric.write_latency_p95_ms < 50 and prev_metric.queue_depth < 100): + if (prev_metric.read_latency_p95_ms < read_lat_threshold and + prev_metric.write_latency_p95_ms < write_lat_threshold and + prev_metric.queue_depth < queue_depth_threshold): # If the previous metric was not saturated, check for a sudden increase in latency or queue depth if (abs(prev_metric.read_latency_p95_ms - read_latency_p95_ms) > 20 or abs(prev_metric.write_latency_p95_ms - write_latency_p95_ms) > 10 or abs(prev_metric.queue_depth - queue_depth) > 10): is_saturated = True else: - # If the previous metric was saturated, check if it's still above the thresholds - if (read_latency_p95_ms > 120 or write_latency_p95_ms > 60 or queue_depth > 120): + # If the previous metric was saturated, check if it's still above the thresholds (with 20% margin) + if (read_latency_p95_ms > read_lat_threshold * 1.2 or + write_latency_p95_ms > write_lat_threshold * 1.2 or + queue_depth > queue_depth_threshold * 1.2): is_saturated = True # Create a new StorageMetrics object for this sample @@ -2067,8 +2450,11 @@ def __init__(self, self.current_users = initial_users self.target_saturation = 
target_saturation self.scale_interval = scale_interval_seconds - self.min_users = 1 - self.max_users = 10000 + self.min_users = cfg('autoscaler', 'min_users', default=1) + self.max_users = cfg('autoscaler', 'max_users', default=10000) + self.scale_up_factor = cfg('autoscaler', 'scale_up_factor', default=1.2) + self.scale_down_factor = cfg('autoscaler', 'scale_down_factor', default=0.8) + self.consecutive_samples_required = cfg('autoscaler', 'consecutive_samples_required', default=2) self.scaling_history = [] self.lock = threading.Lock() @@ -2171,7 +2557,7 @@ def _calculate_capacity_action(self, current_throughput: float) -> Tuple[str, in self.downward_trend_count += 1 if self.downward_trend_count >= 2: self.capacity_test_finished = True - print(f"INFO: Peak capacity found at {self.peak_throughput:.2f} tok/s. Stopping test.") + logger.info(f"Peak capacity found at {self.peak_throughput:.2f} tok/s. Stopping test.") return 'stop', self.current_users return 'hold', self.current_users @@ -2331,8 +2717,8 @@ def validate_benchmark(self, benchmark_results: Dict) -> Dict: class UserSimulator: """Generates realistic user workloads based on pre-defined templates.""" - # Templates for different user personas (chatbot, coding, document analysis). - USER_TEMPLATES = { + # Default templates for different user personas (can be overridden from config). 
+ DEFAULT_USER_TEMPLATES = { 'chatbot': { 'context_range': (256, 1024), 'generation_range': (50, 150), 'think_time_range': (0.1, 0.5), }, @@ -2344,11 +2730,25 @@ class UserSimulator: }, } + @classmethod + def _get_user_templates(cls) -> Dict: + """Get user templates from config, falling back to defaults.""" + templates = {} + for user_type in ['chatbot', 'coding', 'document']: + default = cls.DEFAULT_USER_TEMPLATES[user_type] + templates[user_type] = { + 'context_range': tuple(cfg('user_templates', user_type, 'context_range', default=list(default['context_range']))), + 'generation_range': tuple(cfg('user_templates', user_type, 'generation_range', default=list(default['generation_range']))), + 'think_time_range': tuple(cfg('user_templates', user_type, 'think_time_range', default=list(default['think_time_range']))), + } + return templates + @classmethod def generate_user(cls, user_id: str, user_type: str = 'chatbot', priority: int = 1, qos_level: QoSLevel = QoSLevel.BATCH) -> UserProfile: """Generates a single user profile based on a template.""" - template = cls.USER_TEMPLATES.get(user_type, cls.USER_TEMPLATES['chatbot']) + templates = cls._get_user_templates() + template = templates.get(user_type, templates['chatbot']) return UserProfile( user_id=user_id, context_length=random.randint(*template['context_range']), @@ -2361,16 +2761,19 @@ def generate_user(cls, user_id: str, user_type: str = 'chatbot', priority: int = @classmethod def generate_mixed_users(cls, num_users: int) -> List[UserProfile]: """Generates a list of users with a realistic distribution of types and QoS levels.""" + # Read QoS distribution from config + interactive_prob = cfg('qos_distribution', 'interactive_probability', default=0.15) + responsive_threshold = cfg('qos_distribution', 'responsive_threshold', default=0.50) + users = [] for i in range(num_users): user_type = random.choice(['chatbot', 'coding', 'document']) - # Simulate a realistic QoS distribution. 
- # 15% Interactive, 35% Responsive, 50% Batch. + # Simulate a realistic QoS distribution from config. rand = random.random() - if rand < 0.15: + if rand < interactive_prob: qos_level, priority = QoSLevel.INTERACTIVE, 3 - elif rand < 0.50: + elif rand < responsive_threshold: qos_level, priority = QoSLevel.RESPONSIVE, 2 else: qos_level, priority = QoSLevel.BATCH, 1 @@ -2413,7 +2816,7 @@ def __init__(self, dataset_path: str, max_conversations: int = 1000, seed: Optio def _load_dataset(self): """Load and process the ShareGPT dataset.""" if not os.path.exists(self.dataset_path): - print(f"[ShareGPT] Warning: Dataset not found at {self.dataset_path}") + logger.warning(f"Dataset not found at {self.dataset_path}") return try: @@ -2426,7 +2829,7 @@ def _load_dataset(self): pass if tokenizer is None: - print("[ShareGPT] Tiktoken not available, using approximate token counting") + logger.info("Tiktoken not available, using approximate token counting") with open(self.dataset_path, 'r', encoding='utf-8') as f: data = json.load(f) @@ -2504,12 +2907,12 @@ def _load_dataset(self): 'total_turns': sum(len(c['turns']) for c in self.conversations) } - print(f"[ShareGPT] Loaded {len(self.conversations)} conversations with {self.token_stats['total_turns']} turns") - print(f"[ShareGPT] Context tokens: mean={self.token_stats['context_mean']:.1f}, p50={self.token_stats['context_p50']:.1f}, p95={self.token_stats['context_p95']:.1f}") - print(f"[ShareGPT] Generation tokens: mean={self.token_stats['generation_mean']:.1f}, p50={self.token_stats['generation_p50']:.1f}, p95={self.token_stats['generation_p95']:.1f}") + logger.info(f"Loaded {len(self.conversations)} conversations with {self.token_stats['total_turns']} turns") + logger.info(f"Context tokens: mean={self.token_stats['context_mean']:.1f}, p50={self.token_stats['context_p50']:.1f}, p95={self.token_stats['context_p95']:.1f}") + logger.info(f"Generation tokens: mean={self.token_stats['generation_mean']:.1f}, 
p50={self.token_stats['generation_p50']:.1f}, p95={self.token_stats['generation_p95']:.1f}") except Exception as e: - print(f"[ShareGPT] Error loading dataset: {e}") + logger.error(f"Error loading dataset: {e}") self.conversations = [] def get_random_conversation(self) -> Optional[Dict]: @@ -2656,7 +3059,7 @@ def __init__(self, def _ingest_rag_documents(self, num_docs: int, stop_event: Optional[threading.Event] = None): """Ingests RAG documents for the workload.""" - print(f"Ingesting {num_docs} RAG documents...") + logger.info(f"Ingesting {num_docs} RAG documents...") for i in range(num_docs): if stop_event and stop_event.is_set(): break @@ -2675,7 +3078,7 @@ def _ingest_rag_documents(self, num_docs: int, stop_event: Optional[threading.Ev def _load_burst_trace(self): """Loads requests from the BurstGPT CSV trace file.""" if not self.burst_trace_path: - print("Error: --use-burst-trace flag requires --burst-trace-path to be set.") + logger.error("--use-burst-trace flag requires --burst-trace-path to be set.") sys.exit(1) try: with open(self.burst_trace_path, 'r', encoding='utf-8') as f: @@ -2687,12 +3090,12 @@ def _load_burst_trace(self): self.burst_requests.append((context_tokens, generate_tokens)) except (ValueError, KeyError): continue - print(f"Loaded {len(self.burst_requests)} requests from BurstGPT trace.") + logger.info(f"Loaded {len(self.burst_requests)} requests from BurstGPT trace.") except FileNotFoundError: - print(f"Error: Trace file not found at {self.burst_trace_path}") + logger.error(f"Trace file not found at {self.burst_trace_path}") sys.exit(1) except Exception as e: - print(f"Error reading trace file: {e}") + logger.error(f"Error reading trace file: {e}") sys.exit(1) def _generate_requests_from_trace(self, stop_event: threading.Event): @@ -2700,7 +3103,7 @@ def _generate_requests_from_trace(self, stop_event: threading.Event): request_index = 0 while not stop_event.is_set(): if not self.burst_requests: - print("Warning: BurstGPT trace is empty. 
No requests to generate.") + logger.warning("BurstGPT trace is empty. No requests to generate.") time.sleep(1) continue @@ -2748,7 +3151,7 @@ def _generate_requests_from_trace(self, stop_event: threading.Event): def _generate_requests_from_dataset(self, stop_event: threading.Event): """Generates InferenceRequest objects from the loaded ShareGPT dataset.""" if not self.sharegpt_loader or not self.sharegpt_loader.conversations: - print("Warning: ShareGPT dataset is empty or not loaded. Falling back to synthetic workload.") + logger.warning("ShareGPT dataset is empty or not loaded. Falling back to synthetic workload.") # Fall back to synthetic generation users = UserSimulator.generate_mixed_users(self.num_users) self.generate_requests(users, stop_event) @@ -2777,11 +3180,14 @@ def _generate_requests_from_dataset(self, stop_event: threading.Event): req_id = self.request_counter self.request_counter += 1 - # Assign QoS level based on request characteristics + # Assign QoS level based on request characteristics (from config) + interactive_prob = cfg('qos_distribution', 'interactive_probability', default=0.15) + responsive_threshold = cfg('qos_distribution', 'responsive_threshold', default=0.50) + rand = random.random() - if rand < 0.15: + if rand < interactive_prob: qos_level, priority = QoSLevel.INTERACTIVE, 3 - elif rand < 0.50: + elif rand < responsive_threshold: qos_level, priority = QoSLevel.RESPONSIVE, 2 else: qos_level, priority = QoSLevel.BATCH, 1 @@ -2961,13 +3367,15 @@ def process_requests(self, stop_event: threading.Event): with self.results_lock: self.results['prefill_latencies'].append(write_latency) # 4. Simulate a RAG operation by reading random chunk caches. 
- # NOTE: Check that documents exist to avoid race condition with RAG ingestion thread - if self.rag_manager and self.rag_manager.documents and random.random() < 0.1: # 10% of requests are RAG queries - doc_id = random.choice(list(self.rag_manager.documents.keys())) - chunks = self.rag_manager.retrieve_chunks(doc_id) - for chunk in chunks: # Read the KV cache for each retrieved chunk. - _, read_lat = self.cache.access_cache(chunk.kv_cache_key, InferencePhase.DECODE) - storage_latency += read_lat + # NOTE: Capture document keys atomically to avoid race condition with RAG ingestion thread + if self.rag_manager and random.random() < 0.1: # 10% of requests are RAG queries + doc_keys = list(self.rag_manager.documents.keys()) if self.rag_manager.documents else [] + if doc_keys: + doc_id = random.choice(doc_keys) + chunks = self.rag_manager.retrieve_chunks(doc_id) + for chunk in chunks: # Read the KV cache for each retrieved chunk. + _, read_lat = self.cache.access_cache(chunk.kv_cache_key, InferencePhase.DECODE) + storage_latency += read_lat # 5. Perform the DECODE operation (a cache READ). if request.phase == InferencePhase.DECODE or request.phase == InferencePhase.PREFILL_DECODE: @@ -2982,7 +3390,7 @@ def process_requests(self, stop_event: threading.Event): storage_latency += write_latency else: # Simulate realistic decode I/O: reads are batched, not per-token. 
- decode_batch_size = 32 + decode_batch_size = cfg('decode', 'batch_size', default=32) num_batched_reads = max(1, (request.generate_tokens + decode_batch_size - 1) // decode_batch_size) for _ in range(num_batched_reads): _, batch_read_latency = self.cache.access_cache(request.cache_key, InferencePhase.DECODE, cache_type) @@ -3062,9 +3470,9 @@ def monitor_stats(self, stop_event: threading.Event): 'throughput_tokens_per_sec': throughput } self.autoscaler.scaling_history.append(log_entry) - print(f"Autoscaler {action} -> {self.num_users} users (saturation: {saturation_level:.2f})") + logger.info(f"Autoscaler {action} -> {self.num_users} users (saturation: {saturation_level:.2f})") elif action == 'stop': - print("Autoscaler requested stop after reaching capacity peak.") + logger.info("Autoscaler requested stop after reaching capacity peak.") stop_event.set() log_entry = { 'timestamp': datetime.now().isoformat(), @@ -3083,7 +3491,7 @@ def monitor_stats(self, stop_event: threading.Event): if now - last_log_time >= 10: self._calculate_stats() queue_depth = self.request_queue.qsize() - print(f"Time: {int(elapsed)}s, Users: {self.num_users}, Queue: {queue_depth}, " + logger.info(f"Time: {int(elapsed)}s, Users: {self.num_users}, Queue: {queue_depth}, " f"Throughput: {throughput:.2f} tok/s") last_log_time = now @@ -3177,7 +3585,7 @@ def run(self) -> Dict: def _calculate_stats(self, actual_duration: float = None): """Calculate final statistics with all feature breakdowns""" if not self.results['end_to_end_latencies']: - print("\nNo requests completed during benchmark!") + logger.warning("No requests completed during benchmark!") return # Use actual duration if provided (for max_requests mode), else configured duration @@ -3218,18 +3626,24 @@ def _calculate_stats(self, actual_duration: float = None): 'p50': np.percentile(e2e, 50) * 1000, 'p95': np.percentile(e2e, 95) * 1000, 'p99': np.percentile(e2e, 99) * 1000, + 'p999': np.percentile(e2e, 99.9) * 1000, + 'p9999': 
np.percentile(e2e, 99.99) * 1000, }, 'storage_io_latency_ms': { 'mean': np.mean(storage) * 1000, 'p50': np.percentile(storage, 50) * 1000, 'p95': np.percentile(storage, 95) * 1000, 'p99': np.percentile(storage, 99) * 1000, + 'p999': np.percentile(storage, 99.9) * 1000, + 'p9999': np.percentile(storage, 99.99) * 1000, }, 'generation_latency_ms': { 'mean': np.mean(generation) * 1000, 'p50': np.percentile(generation, 50) * 1000, 'p95': np.percentile(generation, 95) * 1000, 'p99': np.percentile(generation, 99) * 1000, + 'p999': np.percentile(generation, 99.9) * 1000, + 'p9999': np.percentile(generation, 99.99) * 1000, }, 'cache_stats': cache_stats, 'qos_metrics': qos_metrics, @@ -3274,10 +3688,6 @@ def _print_summary(self, summary: Dict): - Phase-specific metrics (prefill/decode) - QoS compliance by service tier - Validation results if available - Note: - The symbols âœ" and ✗ are intended to be checkmark (✓) and cross (✗) - characters for pass/fail indicators but may display incorrectly due to - encoding issues. 
""" """Print comprehensive results summary""" print("\n" + "=" * 80) @@ -3285,15 +3695,19 @@ def _print_summary(self, summary: Dict): print(f"Generation Mode: {self.generation_mode.value} ({self.ms_per_token:.1f}ms/token)") print("=" * 80) + # Use ASCII-safe symbols for pass/fail indicators + PASS_SYMBOL = "[OK]" + FAIL_SYMBOL = "[X]" + cache_stats = summary['cache_stats'] if 'storage_health' in cache_stats: storage_health = cache_stats['storage_health'] status = storage_health['overall_status'] - status_symbol = '✓' if status == 'PASS' else '✗' + status_symbol = PASS_SYMBOL if status == 'PASS' else FAIL_SYMBOL print(f"\n### STORAGE PERFORMANCE ASSESSMENT: {status} {status_symbol} ###") print(f" Criteria Passed: {storage_health['passed_count']}/{storage_health['total_count']}") for criterion in storage_health['criteria']: - symbol = '✓' if criterion['passed'] else '✗' + symbol = PASS_SYMBOL if criterion['passed'] else FAIL_SYMBOL unit = criterion.get('unit', '') if unit == 'ratio': print(f" {symbol} {criterion['name']}: {criterion['actual']:.1%} (target: {criterion['target']:.1%})") @@ -3323,17 +3737,18 @@ def _print_summary(self, summary: Dict): print(f"Throughput (storage I/O): {summary['storage_throughput_tokens_per_sec']:.2f} tokens/sec") print(f"Requests/sec: {summary['requests_per_second']:.2f}") - print(f"\n### END-TO-END LATENCY (Storage I/O + Token Generation) ###") + print(f"\n### END-TO-END LATENCY (Queue Wait + Storage I/O + Generation) ###") print(f" Mean: {summary['end_to_end_latency_ms']['mean']:.2f} ms") print(f" P50: {summary['end_to_end_latency_ms']['p50']:.2f} ms") print(f" P95: {summary['end_to_end_latency_ms']['p95']:.2f} ms") print(f" P99: {summary['end_to_end_latency_ms']['p99']:.2f} ms") - print(f"\n### STORAGE I/O LATENCY (Primary Metric) ###") + print(f"\n### PER-REQUEST STORAGE LATENCY (All I/O ops for one request) ###") print(f" Mean: {summary['storage_io_latency_ms']['mean']:.2f} ms") print(f" P50: 
{summary['storage_io_latency_ms']['p50']:.2f} ms") print(f" P95: {summary['storage_io_latency_ms']['p95']:.2f} ms") print(f" P99: {summary['storage_io_latency_ms']['p99']:.2f} ms") + print(f" (= 1 prefill write + N decode reads per request)") if self.generation_mode != GenerationMode.NONE: print(f"\n### TOKEN GENERATION LATENCY (Simulated @ {self.ms_per_token:.1f}ms/token) ###") @@ -3352,20 +3767,46 @@ def _print_summary(self, summary: Dict): print(f"\n### CACHE TIER DISTRIBUTION ###") print(f" GPU Entries: {cache_stats['gpu_entries']} ({cache_stats['gpu_memory_used_gb']:.2f} GB)") print(f" CPU Entries: {cache_stats['cpu_entries']} ({cache_stats['cpu_memory_used_gb']:.2f} GB)") - print(f" NVMe Entries: {cache_stats['nvme_entries']}") - - print(f"\n### PHASE-SPECIFIC METRICS ###") - print(f" Prefill Writes: {cache_stats['prefill_writes']}") - print(f" Prefill Bytes Written: {cache_stats['prefill_bytes_written_gb']:.2f} GB") - print(f" Decode Reads: {cache_stats['decode_reads']}") - print(f" Decode Bytes Read: {cache_stats['decode_bytes_read_gb']:.2f} GB") - - print(f"\n### TIER-SPECIFIC LATENCIES ###") - for tier in ['gpu', 'cpu', 'nvme']: + print(f" Storage Entries: {cache_stats['storage_entries']}") + + print(f"\n### TIER-SPECIFIC KV BYTES ###") + # GPU tier + if cache_stats.get('tier_gpu_kv_bytes_written_gb', 0) > 0: + print(f" GPU KV Bytes Written: {cache_stats['tier_gpu_kv_bytes_written_gb']:.2f} GB") + if cache_stats.get('tier_gpu_kv_bytes_read_gb', 0) > 0: + print(f" GPU KV Bytes Read: {cache_stats['tier_gpu_kv_bytes_read_gb']:.2f} GB") + # CPU tier + if cache_stats.get('tier_cpu_kv_bytes_written_gb', 0) > 0: + print(f" CPU KV Bytes Written: {cache_stats['tier_cpu_kv_bytes_written_gb']:.2f} GB") + if cache_stats.get('tier_cpu_kv_bytes_read_gb', 0) > 0: + print(f" CPU KV Bytes Read: {cache_stats['tier_cpu_kv_bytes_read_gb']:.2f} GB") + # Storage tier + if cache_stats.get('tier_storage_kv_bytes_written_gb', 0) > 0: + print(f" Storage KV Bytes Written: 
{cache_stats['tier_storage_kv_bytes_written_gb']:.2f} GB") + if cache_stats.get('tier_storage_kv_bytes_read_gb', 0) > 0: + print(f" Storage KV Bytes Read: {cache_stats['tier_storage_kv_bytes_read_gb']:.2f} GB") + + print(f"\n### TIER-SPECIFIC LATENCIES (Total = Host + Device) ###") + for tier in ['gpu', 'cpu', 'storage']: for op in ['read', 'write']: p95_key = f'{tier}_{op}_p95_ms' if p95_key in cache_stats: - print(f" {tier.upper()} {op.title()} P95: {cache_stats[p95_key]:.2f} ms") + tier_label = 'Storage' if tier == 'storage' else tier.upper() + print(f" {tier_label} {op.title()} P95 (Total): {cache_stats[p95_key]:.2f} ms") + + # Storage tier Device vs Host latency breakdown (most important for storage benchmarks) + print(f"\n### STORAGE TIER LATENCY BREAKDOWN (Device = Disk I/O, Host = Serialization) ###") + for op in ['read', 'write']: + device_key = f'storage_{op}_device_p95_ms' + host_key = f'storage_{op}_host_p95_ms' + total_key = f'storage_{op}_p95_ms' + if device_key in cache_stats: + print(f" Storage {op.title()}:") + print(f" - Device P95 (Disk I/O): {cache_stats[device_key]:.2f} ms") + if host_key in cache_stats: + print(f" - Host P95 (Serialization): {cache_stats[host_key]:.2f} ms") + if total_key in cache_stats: + print(f" - Total P95: {cache_stats[total_key]:.2f} ms") print(f"\n### CACHE TYPE BREAKDOWNS ###") print(f" System Prompt Hits: {cache_stats['system_prompt_hits']}") @@ -3397,7 +3838,7 @@ def _print_summary(self, summary: Dict): print(f" Latency P95: {metrics['latency_ms']['p95']:.2f} ms") print(f" Latency P99: {metrics['latency_ms']['p99']:.2f} ms") if 'sla' in metrics: - sla_met = '✓' if metrics['sla']['met'] else '✗' + sla_met = '[OK]' if metrics['sla']['met'] else '[X]' print(f" SLA Met: {sla_met} (compliance: {metrics['sla']['compliance']:.1%})") if summary.get('autoscaling_stats'): @@ -3412,7 +3853,7 @@ def _print_summary(self, summary: Dict): if 'validation' in self.results: print(f"\n### VALIDATION ###") validation = 
self.results['validation'] - print(f" Validation: {'PASSED ✓' if validation['passed'] else 'FAILED ✗'}") + print(f" Validation: {'PASSED [OK]' if validation['passed'] else 'FAILED [X]'}") print(f" Average Error: {validation['avg_error_pct']:.2f}%") print("\n" + "=" * 80) @@ -3424,9 +3865,112 @@ def _print_summary(self, summary: Dict): print("=" * 80) +# ============================================================================ +# INPUT VALIDATION +# Validates command-line arguments before benchmark execution. +# ============================================================================ + +# Validation constants with documented rationale +MAX_USERS = 100000 # Reasonable upper limit for simulated users +MAX_DURATION_SECONDS = 86400 # 24 hours - prevents runaway benchmarks +MAX_GPU_MEMORY_GB = 1024 # 1TB - covers even the largest GPU clusters +MAX_CPU_MEMORY_GB = 16384 # 16TB - covers high-memory server configurations + +# System directories that should never be used as cache directories +FORBIDDEN_CACHE_PREFIXES = frozenset([ + '/etc', '/bin', '/sbin', '/usr/bin', '/usr/sbin', + '/boot', '/sys', '/proc', '/dev', '/root' +]) + + +def validate_args(args: argparse.Namespace) -> argparse.Namespace: + """ + Validate command-line arguments to catch invalid values early. 
+ + Args: + args: Parsed argparse namespace + + Returns: + The validated args namespace + + Raises: + ValueError: If any validation check fails + """ + errors = [] + + # Validate positive integers + if args.num_users <= 0: + errors.append(f"--num-users must be positive, got {args.num_users}") + if args.num_users > MAX_USERS: + errors.append(f"--num-users exceeds limit ({MAX_USERS}), got {args.num_users}") + + if args.duration <= 0: + errors.append(f"--duration must be positive, got {args.duration}") + if args.duration > MAX_DURATION_SECONDS: + errors.append(f"--duration exceeds 24 hours ({MAX_DURATION_SECONDS}s), got {args.duration}") + + # Validate memory sizes + if args.gpu_mem_gb < 0: + errors.append(f"--gpu-mem-gb cannot be negative, got {args.gpu_mem_gb}") + if args.gpu_mem_gb > MAX_GPU_MEMORY_GB: + errors.append(f"--gpu-mem-gb exceeds limit ({MAX_GPU_MEMORY_GB}GB), got {args.gpu_mem_gb}") + + if args.cpu_mem_gb < 0: + errors.append(f"--cpu-mem-gb cannot be negative, got {args.cpu_mem_gb}") + if args.cpu_mem_gb > MAX_CPU_MEMORY_GB: + errors.append(f"--cpu-mem-gb exceeds limit ({MAX_CPU_MEMORY_GB}GB), got {args.cpu_mem_gb}") + + # Validate optional integers + if args.rag_num_docs < 0: + errors.append(f"--rag-num-docs cannot be negative, got {args.rag_num_docs}") + + if args.max_conversations <= 0: + errors.append(f"--max-conversations must be positive, got {args.max_conversations}") + + if args.max_concurrent_allocs < 0: + errors.append(f"--max-concurrent-allocs cannot be negative, got {args.max_concurrent_allocs}") + + if args.request_rate < 0: + errors.append(f"--request-rate cannot be negative, got {args.request_rate}") + + if args.max_requests < 0: + errors.append(f"--max-requests cannot be negative, got {args.max_requests}") + + # Validate target_saturation range + if not (0.0 <= args.target_saturation <= 1.0): + errors.append(f"--target-saturation must be between 0.0 and 1.0, got {args.target_saturation}") + + # Validate cache directory if provided + if 
args.cache_dir: + # Resolve symlinks to prevent bypass attacks + cache_path = Path(args.cache_dir).resolve() + cache_path_str = str(cache_path) + + # Check for forbidden system directories + for prefix in FORBIDDEN_CACHE_PREFIXES: + if cache_path_str.startswith(prefix): + errors.append(f"--cache-dir cannot be a system directory: {cache_path}") + break + + # Check if parent directory is writable (if it exists) + parent = cache_path.parent + if parent.exists() and not os.access(parent, os.W_OK): + errors.append(f"--cache-dir parent is not writable: {parent}") + + if errors: + for error in errors: + logger.error(f"Validation error: {error}") + raise ValueError(f"Invalid arguments:\n " + "\n ".join(errors)) + + return args + + def main(): """Main entry point for running the benchmark from the command line.""" parser = argparse.ArgumentParser(description="Integrated Multi-User KV Cache Benchmark") + parser.add_argument('--log-level', type=str, default='INFO', + choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], + help='Set the logging level (default: INFO)') parser.add_argument('--model', type=str, default='llama3.1-8b', choices=MODEL_CONFIGS.keys(), help='The model configuration to use.') parser.add_argument('--num-users', type=int, default=100, @@ -3481,11 +4025,33 @@ def main(): parser.add_argument('--xlsx-output', type=str, default=None, help='Optional: Output Excel file path for summary results with run parameters. ' 'Requires pandas and openpyxl. Falls back to CSV if openpyxl not available.') + parser.add_argument('--config', type=str, default=None, + help='Path to YAML configuration file. 
Overrides hardcoded defaults.') args = parser.parse_args() + # Configure logging based on command-line argument + logging.basicConfig( + level=getattr(logging, args.log_level), + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', + datefmt='%Y-%m-%d %H:%M:%S' + ) + + # Validate command-line arguments + args = validate_args(args) + + # Load YAML config if provided + if args.config: + config = ConfigLoader(args.config) + set_config(config) + logger.info(f"Loaded configuration from {args.config}") + + # Refresh QOS_PROFILES with config values + global QOS_PROFILES + QOS_PROFILES = get_qos_profiles() + if args.seed is not None: - print(f"Using random seed: {args.seed}") + logger.info(f"Using random seed: {args.seed}") random.seed(args.seed) np.random.seed(args.seed) if TORCH_AVAILABLE: @@ -3540,7 +4106,7 @@ def convert_numpy(obj): with open(args.output, 'w') as f: json.dump(results, f, indent=4, default=convert_numpy) - print(f"\nResults saved to {args.output}") + logger.info(f"Results saved to {args.output}") # Export to XLSX if requested if args.xlsx_output: @@ -3558,12 +4124,12 @@ def export_results_to_xlsx(results: Dict, args, output_path: str): output_path: Path for the output Excel/CSV file """ if not PANDAS_AVAILABLE: - print(f"Warning: pandas not available, skipping XLSX export. Install with: pip install pandas") + logger.warning("pandas not available, skipping XLSX export. 
Install with: pip install pandas") return summary = results.get('summary', {}) if not summary: - print("Warning: No summary data available for XLSX export") + logger.warning("No summary data available for XLSX export") return # Helper to safely get nested keys @@ -3611,32 +4177,88 @@ def get_nested(d, keys, default=None): 'E2E Latency P50 (ms)': get_nested(summary, ['end_to_end_latency_ms', 'p50']), 'E2E Latency P95 (ms)': get_nested(summary, ['end_to_end_latency_ms', 'p95']), 'E2E Latency P99 (ms)': get_nested(summary, ['end_to_end_latency_ms', 'p99']), + 'E2E Latency P99.9 (ms)': get_nested(summary, ['end_to_end_latency_ms', 'p999']), + 'E2E Latency P99.99 (ms)': get_nested(summary, ['end_to_end_latency_ms', 'p9999']), # Storage IO Latency 'Storage Latency Mean (ms)': get_nested(summary, ['storage_io_latency_ms', 'mean']), 'Storage Latency P50 (ms)': get_nested(summary, ['storage_io_latency_ms', 'p50']), 'Storage Latency P95 (ms)': get_nested(summary, ['storage_io_latency_ms', 'p95']), 'Storage Latency P99 (ms)': get_nested(summary, ['storage_io_latency_ms', 'p99']), + 'Storage Latency P99.9 (ms)': get_nested(summary, ['storage_io_latency_ms', 'p999']), + 'Storage Latency P99.99 (ms)': get_nested(summary, ['storage_io_latency_ms', 'p9999']), - # Generation Latency + # Generation Latency (simulated GPU work) 'Gen Latency Mean (ms)': get_nested(summary, ['generation_latency_ms', 'mean']), 'Gen Latency P50 (ms)': get_nested(summary, ['generation_latency_ms', 'p50']), 'Gen Latency P95 (ms)': get_nested(summary, ['generation_latency_ms', 'p95']), 'Gen Latency P99 (ms)': get_nested(summary, ['generation_latency_ms', 'p99']), - + + # Storage Tier Total Latency (Host serialization + Device I/O) + 'Storage Tier Read Total P50 (ms)': get_nested(summary, ['cache_stats', 'storage_read_p50_ms']), + 'Storage Tier Read Total P95 (ms)': get_nested(summary, ['cache_stats', 'storage_read_p95_ms']), + 'Storage Tier Read Total P99 (ms)': get_nested(summary, ['cache_stats', 
'storage_read_p99_ms']), + 'Storage Tier Read Total P99.9 (ms)': get_nested(summary, ['cache_stats', 'storage_read_p999_ms']), + 'Storage Tier Read Total P99.99 (ms)': get_nested(summary, ['cache_stats', 'storage_read_p9999_ms']), + 'Storage Tier Write Total P50 (ms)': get_nested(summary, ['cache_stats', 'storage_write_p50_ms']), + 'Storage Tier Write Total P95 (ms)': get_nested(summary, ['cache_stats', 'storage_write_p95_ms']), + 'Storage Tier Write Total P99 (ms)': get_nested(summary, ['cache_stats', 'storage_write_p99_ms']), + 'Storage Tier Write Total P99.9 (ms)': get_nested(summary, ['cache_stats', 'storage_write_p999_ms']), + 'Storage Tier Write Total P99.99 (ms)': get_nested(summary, ['cache_stats', 'storage_write_p9999_ms']), + + # Storage Tier Device Latency (actual disk I/O - fsync for writes, np.load for reads) + 'Storage Tier Read Device P50 (ms)': get_nested(summary, ['cache_stats', 'storage_read_device_p50_ms']), + 'Storage Tier Read Device P95 (ms)': get_nested(summary, ['cache_stats', 'storage_read_device_p95_ms']), + 'Storage Tier Read Device P99 (ms)': get_nested(summary, ['cache_stats', 'storage_read_device_p99_ms']), + 'Storage Tier Read Device P99.9 (ms)': get_nested(summary, ['cache_stats', 'storage_read_device_p999_ms']), + 'Storage Tier Read Device P99.99 (ms)': get_nested(summary, ['cache_stats', 'storage_read_device_p9999_ms']), + 'Storage Tier Write Device P50 (ms)': get_nested(summary, ['cache_stats', 'storage_write_device_p50_ms']), + 'Storage Tier Write Device P95 (ms)': get_nested(summary, ['cache_stats', 'storage_write_device_p95_ms']), + 'Storage Tier Write Device P99 (ms)': get_nested(summary, ['cache_stats', 'storage_write_device_p99_ms']), + 'Storage Tier Write Device P99.9 (ms)': get_nested(summary, ['cache_stats', 'storage_write_device_p999_ms']), + 'Storage Tier Write Device P99.99 (ms)': get_nested(summary, ['cache_stats', 'storage_write_device_p9999_ms']), + + # Storage Tier Host Latency (serialization/deserialization - CPU 
work) + 'Storage Tier Read Host P50 (ms)': get_nested(summary, ['cache_stats', 'storage_read_host_p50_ms']), + 'Storage Tier Read Host P95 (ms)': get_nested(summary, ['cache_stats', 'storage_read_host_p95_ms']), + 'Storage Tier Read Host P99 (ms)': get_nested(summary, ['cache_stats', 'storage_read_host_p99_ms']), + 'Storage Tier Read Host P99.9 (ms)': get_nested(summary, ['cache_stats', 'storage_read_host_p999_ms']), + 'Storage Tier Read Host P99.99 (ms)': get_nested(summary, ['cache_stats', 'storage_read_host_p9999_ms']), + 'Storage Tier Write Host P50 (ms)': get_nested(summary, ['cache_stats', 'storage_write_host_p50_ms']), + 'Storage Tier Write Host P95 (ms)': get_nested(summary, ['cache_stats', 'storage_write_host_p95_ms']), + 'Storage Tier Write Host P99 (ms)': get_nested(summary, ['cache_stats', 'storage_write_host_p99_ms']), + 'Storage Tier Write Host P99.9 (ms)': get_nested(summary, ['cache_stats', 'storage_write_host_p999_ms']), + 'Storage Tier Write Host P99.99 (ms)': get_nested(summary, ['cache_stats', 'storage_write_host_p9999_ms']), + # Cache Stats 'Cache Hit Rate': get_nested(summary, ['cache_stats', 'cache_hit_rate']), 'Read/Write Ratio': get_nested(summary, ['cache_stats', 'read_write_ratio']), 'Total Read (GB)': get_nested(summary, ['cache_stats', 'total_read_gb']), 'Total Write (GB)': get_nested(summary, ['cache_stats', 'total_write_gb']), - 'Prefill Bytes Written (GB)': get_nested(summary, ['cache_stats', 'prefill_bytes_written_gb']), - 'Decode Bytes Read (GB)': get_nested(summary, ['cache_stats', 'decode_bytes_read_gb']), - + + # Per-Tier KV Cache Bytes Written (NEW NAMING - MLPerf v3.0) + 'Tier GPU KV Bytes Written (GB)': get_nested(summary, ['cache_stats', 'tier_gpu_kv_bytes_written_gb']), + 'Tier CPU KV Bytes Written (GB)': get_nested(summary, ['cache_stats', 'tier_cpu_kv_bytes_written_gb']), + 'Tier Storage KV Bytes Written (GB)': get_nested(summary, ['cache_stats', 'tier_storage_kv_bytes_written_gb']), + + # Per-Tier KV Cache Bytes Read 
(NEW NAMING - MLPerf v3.0) + 'Tier GPU KV Bytes Read (GB)': get_nested(summary, ['cache_stats', 'tier_gpu_kv_bytes_read_gb']), + 'Tier CPU KV Bytes Read (GB)': get_nested(summary, ['cache_stats', 'tier_cpu_kv_bytes_read_gb']), + 'Tier Storage KV Bytes Read (GB)': get_nested(summary, ['cache_stats', 'tier_storage_kv_bytes_read_gb']), + + # Per-Tier Bandwidth (GB/s) - MLPerf v3.0 scoring metric + 'Tier GPU Read Bandwidth (GB/s)': get_nested(summary, ['cache_stats', 'tier_gpu_read_bandwidth_gbps']), + 'Tier GPU Write Bandwidth (GB/s)': get_nested(summary, ['cache_stats', 'tier_gpu_write_bandwidth_gbps']), + 'Tier CPU Read Bandwidth (GB/s)': get_nested(summary, ['cache_stats', 'tier_cpu_read_bandwidth_gbps']), + 'Tier CPU Write Bandwidth (GB/s)': get_nested(summary, ['cache_stats', 'tier_cpu_write_bandwidth_gbps']), + 'Tier Storage Read Bandwidth (GB/s)': get_nested(summary, ['cache_stats', 'tier_storage_read_bandwidth_gbps']), + 'Tier Storage Write Bandwidth (GB/s)': get_nested(summary, ['cache_stats', 'tier_storage_write_bandwidth_gbps']), + # Tier distribution 'GPU Entries': get_nested(summary, ['cache_stats', 'gpu_entries']), 'CPU Entries': get_nested(summary, ['cache_stats', 'cpu_entries']), - 'NVMe Entries': get_nested(summary, ['cache_stats', 'nvme_entries']), - + 'Storage Entries': get_nested(summary, ['cache_stats', 'storage_entries']), + # Multi-turn stats 'Multi-turn Hit Rate': get_nested(summary, ['multi_turn_stats', 'hit_rate']), } @@ -3683,24 +4305,24 @@ def get_nested(d, keys, default=None): qos_df = pd.DataFrame(qos_rows) qos_df.to_excel(writer, sheet_name='QoS Metrics', index=False) - print(f"XLSX results saved to {output_path}") + logger.info(f"XLSX results saved to {output_path}") else: # Fall back to CSV csv_path = output_path.replace('.xlsx', '.csv') if output_path.endswith('.xlsx') else output_path if not csv_path.endswith('.csv'): csv_path += '.csv' df.to_csv(csv_path, index=False) - print(f"CSV results saved to {csv_path} (openpyxl not available 
for XLSX)") + logger.info(f"CSV results saved to {csv_path} (openpyxl not available for XLSX)") except Exception as e: - print(f"Error saving XLSX/CSV: {e}") + logger.error(f"Error saving XLSX/CSV: {e}") # Last resort: try CSV try: csv_path = output_path.replace('.xlsx', '.csv') df.to_csv(csv_path, index=False) - print(f"Fallback CSV saved to {csv_path}") + logger.info(f"Fallback CSV saved to {csv_path}") except Exception as e2: - print(f"Failed to save results: {e2}") + logger.error(f"Failed to save results: {e2}") if __name__ == "__main__": From d9715bce44676ff4aa3eb63aabee7715fd0f4785 Mon Sep 17 00:00:00 2001 From: Hazem Awadallah Date: Tue, 27 Jan 2026 15:43:17 -0800 Subject: [PATCH 05/43] feat(wrapper): config integration and workload automation - Add -c DIR option for custom config directory - Generate and pass config.yaml to Python script via --config flag - Add --xlsx-output support for Excel export - Update jq queries for new storage_* metric names - Add mlperf_submission workload with required trial parameters - Enhance system detection for thread counts and memory limits - Update metric parsing for storage_throughput primary metric --- kv_cache_benchmark/kv-cache-wrapper.sh | 70 +++++++++++++++++++------- 1 file changed, 52 insertions(+), 18 deletions(-) diff --git a/kv_cache_benchmark/kv-cache-wrapper.sh b/kv_cache_benchmark/kv-cache-wrapper.sh index 2b648d6a..59ba3d37 100644 --- a/kv_cache_benchmark/kv-cache-wrapper.sh +++ b/kv_cache_benchmark/kv-cache-wrapper.sh @@ -40,6 +40,7 @@ Usage: ./kv-cache-wrapper.sh [options] [model] Options: -m MODEL Model key to benchmark (tiny-1b, mistral-7b, llama3.1-8b, llama2-7b, llama3.1-70b-instruct) + -c DIR Cache directory path (default: auto-detect /mnt/nvme, /mnt/ssd, or /tmp) -t SECONDS Duration for tier comparison tests (default: 120) -s SECONDS Duration for storage saturation test (default: 180) -r SECONDS Duration for realistic production test (default: 180) @@ -57,6 +58,7 @@ EOF # Default configuration (can 
be overridden via getopts) model="" +cache_dir_override="" tier_duration=120 saturation_duration=180 realistic_duration=180 @@ -67,9 +69,10 @@ users_high_override="" rag_enabled=0 rag_docs_override="" -while getopts ":m:t:s:r:a:w:u:U:RD:h" opt; do +while getopts ":m:c:t:s:r:a:w:u:U:RD:h" opt; do case "$opt" in m) model="$OPTARG" ;; + c) cache_dir_override="$OPTARG" ;; t) tier_duration="$OPTARG" ;; s) saturation_duration="$OPTARG" ;; r) realistic_duration="$OPTARG" ;; @@ -275,15 +278,18 @@ else fi # System detection - Storage path -# Priority: /mnt/nvme > /mnt/ssd > /tmp -cache_dir="/tmp/kvcache_benchmark" -if [ -d "/mnt/nvme" ] && [ -w "/mnt/nvme" ]; then +# Priority: user override > /mnt/nvme > /mnt/ssd > /tmp +if [ -n "$cache_dir_override" ]; then + cache_dir="$cache_dir_override" + echo "Cache directory (user override): $cache_dir" +elif [ -d "/mnt/nvme" ] && [ -w "/mnt/nvme" ]; then cache_dir="/mnt/nvme" echo "NVMe storage path: $cache_dir" elif [ -d "/mnt/ssd" ] && [ -w "/mnt/ssd" ]; then cache_dir="/mnt/ssd" echo "SSD storage path: $cache_dir" else + cache_dir="/tmp/kvcache_benchmark" echo "Warning: using temp storage at $cache_dir (consider mounting NVMe to /mnt/nvme)" fi @@ -367,6 +373,7 @@ if should_run 'capacity-autoscale'; then capacity_model="llama3.1-70b-instruct" python3 kv-cache.py \ + --config config.yaml \ --model "$capacity_model" \ --num-users "$capacity_start_users" \ --duration "$autoscale_duration" \ @@ -377,7 +384,8 @@ if should_run 'capacity-autoscale'; then --generation-mode none \ --cache-dir "$cache_dir" \ --seed 42 \ - --output results_autoscaling_capacity.json + --output results_autoscaling_capacity.json \ + --xlsx-output results_autoscaling_capacity.xlsx echo "" echo "Capacity discovery complete. Check results_autoscaling_capacity.json for peak throughput." 
@@ -423,6 +431,7 @@ if should_run 'mlperf_submission'; then echo " PRIMARY METRICS: Decode Bytes Read, Wall-Clock Throughput" echo " WARNING: Storage Throughput unreliable at cpu_mem=0GB" python3 kv-cache.py \ + --config config.yaml \ --model llama3.1-8b \ --num-users 200 \ --duration 300 \ @@ -432,7 +441,8 @@ if should_run 'mlperf_submission'; then --generation-mode none \ --cache-dir "$cache_dir" \ --seed 42 \ - --output mlperf_v3_stress_8b.json + --output mlperf_v3_stress_8b.json \ + --xlsx-output mlperf_v3_stress_8b.xlsx echo "Maximum storage stress test (8B) complete." echo "" @@ -443,6 +453,7 @@ if should_run 'mlperf_submission'; then echo "[MLPerf 2/4] Storage Throughput Test: llama3.1-8b, cpu_mem=4GB, 100 users..." echo " PRIMARY METRIC: Storage Throughput (tok/s)" python3 kv-cache.py \ + --config config.yaml \ --model llama3.1-8b \ --num-users 100 \ --duration 300 \ @@ -452,7 +463,8 @@ if should_run 'mlperf_submission'; then --generation-mode none \ --cache-dir "$cache_dir" \ --seed 42 \ - --output mlperf_v3_throughput_8b.json + --output mlperf_v3_throughput_8b.json \ + --xlsx-output mlperf_v3_throughput_8b.xlsx echo "Storage throughput test (8B) complete." echo "" @@ -463,6 +475,7 @@ if should_run 'mlperf_submission'; then echo "[MLPerf 3/4] Large Model Stress: llama3.1-70b-instruct, cpu_mem=0GB, 70 users..." echo " PRIMARY METRICS: Decode Bytes Read, Wall-Clock Throughput" python3 kv-cache.py \ + --config config.yaml \ --model llama3.1-70b-instruct \ --num-users 70 \ --duration 300 \ @@ -472,7 +485,8 @@ if should_run 'mlperf_submission'; then --generation-mode none \ --cache-dir "$cache_dir" \ --seed 42 \ - --output mlperf_v3_stress_70b.json + --output mlperf_v3_stress_70b.json \ + --xlsx-output mlperf_v3_stress_70b.xlsx echo "Large model storage stress test (70B) complete." echo "" @@ -482,6 +496,7 @@ if should_run 'mlperf_submission'; then echo "[MLPerf 4/4] Large Model Throughput: llama3.1-70b-instruct, cpu_mem=4GB, 50 users..." 
echo " PRIMARY METRIC: Storage Throughput (tok/s)" python3 kv-cache.py \ + --config config.yaml \ --model llama3.1-70b-instruct \ --num-users 50 \ --duration 300 \ @@ -491,7 +506,8 @@ if should_run 'mlperf_submission'; then --generation-mode none \ --cache-dir "$cache_dir" \ --seed 42 \ - --output mlperf_v3_throughput_70b.json + --output mlperf_v3_throughput_70b.json \ + --xlsx-output mlperf_v3_throughput_70b.xlsx echo "Large model throughput test (70B) complete." echo "" @@ -523,6 +539,7 @@ if should_run 'gpu-only'; then if [ "$gpu_available" -eq 1 ]; then echo "[1/10] GPU Only - All cache in VRAM..." python3 kv-cache.py \ + --config config.yaml \ --model $model \ --num-users $users_baseline \ --duration "$tier_duration" \ @@ -531,7 +548,8 @@ if should_run 'gpu-only'; then --generation-mode realistic \ "${rag_args[@]}" \ --seed 42 \ - --output results_tier_gpu_only.json + --output results_tier_gpu_only.json \ + --xlsx-output results_tier_gpu_only.xlsx echo "" echo "GPU test complete. Expect lowest latency but limited capacity." @@ -552,6 +570,7 @@ fi if should_run 'cpu-only'; then echo "[2/10] CPU Only - All cache in RAM..." python3 kv-cache.py \ + --config config.yaml \ --model $model \ --num-users $users_baseline \ --duration "$tier_duration" \ @@ -560,7 +579,8 @@ if should_run 'cpu-only'; then --generation-mode realistic \ "${rag_args[@]}" \ --seed 42 \ - --output results_tier_cpu_only.json + --output results_tier_cpu_only.json \ + --xlsx-output results_tier_cpu_only.xlsx echo "" echo "CPU test complete. This is the typical production configuration." @@ -589,6 +609,7 @@ fi if should_run 'storage-only'; then echo "[3/10] TIER TEST: Storage Only - Pure NVMe/SSD caching..." 
python3 kv-cache.py \ + --config config.yaml \ --model $model \ --num-users $users_baseline \ --duration "$tier_duration" \ @@ -598,7 +619,8 @@ if should_run 'storage-only'; then --cache-dir $cache_dir \ "${rag_args[@]}" \ --seed 42 \ - --output results_tier_storage_only.json + --output results_tier_storage_only.json \ + --xlsx-output results_tier_storage_only.xlsx echo "" echo "Expected: Highest latency, validates NVMe P95 < 200ms for reads" @@ -628,6 +650,7 @@ if should_run 'gpu-cpu'; then if [ "$gpu_available" -eq 1 ]; then echo "[4/10] TIER TEST: GPU + CPU - Two-tier hot/warm caching..." python3 kv-cache.py \ + --config config.yaml \ --model $model \ --num-users $users_baseline \ --duration "$tier_duration" \ @@ -636,7 +659,8 @@ if should_run 'gpu-cpu'; then --generation-mode realistic \ "${rag_args[@]}" \ --seed 42 \ - --output results_tier_gpu_cpu.json + --output results_tier_gpu_cpu.json \ + --xlsx-output results_tier_gpu_cpu.xlsx echo "" echo "Expected: Low latency with large capacity" @@ -670,6 +694,7 @@ fi if should_run 'cpu-storage'; then echo "[5/10] TIER TEST: CPU + Storage - RAM with NVMe spillover..." python3 kv-cache.py \ + --config config.yaml \ --model $model \ --num-users $users_high \ --duration "$tier_duration" \ @@ -679,7 +704,8 @@ if should_run 'cpu-storage'; then --cache-dir $cache_dir \ "${rag_args[@]}" \ --seed 42 \ - --output results_tier_cpu_storage.json + --output results_tier_cpu_storage.json \ + --xlsx-output results_tier_cpu_storage.xlsx echo "" echo "Expected: Moderate latency, forces storage spillover with ${users_high} users" @@ -710,6 +736,7 @@ if should_run 'gpu-cpu-storage'; then if [ "$gpu_available" -eq 1 ]; then echo "[6/10] TIER TEST: GPU + CPU + Storage - Full three-tier hierarchy..." 
python3 kv-cache.py \ + --config config.yaml \ --model $model \ --num-users $users_high \ --duration "$tier_duration" \ @@ -719,7 +746,8 @@ if should_run 'gpu-cpu-storage'; then --cache-dir $cache_dir \ "${rag_args[@]}" \ --seed 42 \ - --output results_tier_gpu_cpu_storage.json + --output results_tier_gpu_cpu_storage.json \ + --xlsx-output results_tier_gpu_cpu_storage.xlsx echo "" echo "Expected: Best overall - hot in GPU, warm in CPU, cold in storage" @@ -752,6 +780,7 @@ fi if should_run 'storage-saturation'; then echo "[7/10] STRESS TEST: Storage Saturation - Maximum NVMe load..." python3 kv-cache.py \ + --config config.yaml \ --model $model \ --num-users $users_high \ --duration "$saturation_duration" \ @@ -761,7 +790,8 @@ if should_run 'storage-saturation'; then --cache-dir $cache_dir \ "${rag_args[@]}" \ --seed 42 \ - --output results_stress_storage_saturation.json + --output results_stress_storage_saturation.json \ + --xlsx-output results_stress_storage_saturation.xlsx echo "" echo "Expected: High storage load, validates NVMe can handle ${users_high} users" @@ -796,6 +826,7 @@ fi if should_run 'production'; then echo "[8/10] REALISTIC TEST: Production Workload - Multi-tier with realistic load..." python3 kv-cache.py \ + --config config.yaml \ --model $model \ --num-users $users_baseline \ --duration "$realistic_duration" \ @@ -805,7 +836,8 @@ if should_run 'production'; then --cache-dir $cache_dir \ "${rag_args[@]}" \ --seed 42 \ - --output results_realistic_production.json + --output results_realistic_production.json \ + --xlsx-output results_realistic_production.xlsx echo "" echo "Expected: Balanced performance, realistic production scenario" @@ -839,6 +871,7 @@ fi if should_run 'autoscale'; then echo "[9/10] DISCOVERY TEST: Autoscaling - Find optimal user count..." 
python3 kv-cache.py \ + --config config.yaml \ --model $model \ --num-users 20 \ --duration "$autoscale_duration" \ @@ -850,7 +883,8 @@ if should_run 'autoscale'; then --cache-dir $cache_dir \ "${rag_args[@]}" \ --seed 42 \ - --output results_autoscaling_discovery.json + --output results_autoscaling_discovery.json \ + --xlsx-output results_autoscaling_discovery.xlsx echo "" echo "Expected: Progressive scaling to find hardware limits" From 001fd3bdda7e87acb4d6b5fea17f700508ccdbe1 Mon Sep 17 00:00:00 2001 From: Hazem Awadallah Date: Tue, 27 Jan 2026 15:43:46 -0800 Subject: [PATCH 06/43] test(kv-cache): comprehensive pytest suite for v3.0 features - Add 170+ tests covering all new functionality - Add ConfigLoader tests: schema validation, defaults, file loading - Add cfg() helper tests for config-driven parameters - Add validate_args() tests for path safety and input validation - Add extended QoS tests for P99.9 and P99.99 percentiles - Add GPU eviction callback tests for metadata sync - Add per-tier bandwidth and KV bytes metric tests - Add storage_* metric naming tests for MLPerf compliance - Add waterfall eviction tests with high/low watermarks - Add storage_health PASS/FAIL criteria tests --- kv_cache_benchmark/tests/test_kv_cache.py | 754 +++++++++++++++++++++- 1 file changed, 753 insertions(+), 1 deletion(-) diff --git a/kv_cache_benchmark/tests/test_kv_cache.py b/kv_cache_benchmark/tests/test_kv_cache.py index cfa42f56..e99ba4d3 100644 --- a/kv_cache_benchmark/tests/test_kv_cache.py +++ b/kv_cache_benchmark/tests/test_kv_cache.py @@ -11,10 +11,17 @@ These tests verify core functionality without running the full benchmark. 
Typical execution time: < 5 seconds + +This version tests kv-cache.py which includes: +- ConfigLoader with YAML support and strict validation +- Extended QoS SLA with p999 and p9999 percentiles +- Config-driven parameters via cfg() helper +- Renamed nvme_* to storage_* in stats """ import os import sys +import argparse import tempfile import pytest import numpy as np @@ -22,8 +29,26 @@ from pathlib import Path # Import from kv-cache.py (handle the hyphen in filename) +# Try multiple locations: same directory, parent directory import importlib.util -spec = importlib.util.spec_from_file_location("kv_cache", os.path.join(os.path.dirname(__file__), "kv-cache.py")) + +_kv_cache_path = None +_possible_paths = [ + os.path.join(os.path.dirname(__file__), "kv-cache.py"), # Same directory + os.path.join(os.path.dirname(__file__), "..", "kv-cache.py"), # Parent directory +] +for _path in _possible_paths: + if os.path.exists(_path): + _kv_cache_path = _path + break + +if _kv_cache_path is None: + raise FileNotFoundError( + f"Could not find kv-cache.py. 
Searched in:\n" + + "\n".join(f" - {os.path.abspath(p)}" for p in _possible_paths) + ) + +spec = importlib.util.spec_from_file_location("kv_cache", _kv_cache_path) kv_cache = importlib.util.module_from_spec(spec) spec.loader.exec_module(kv_cache) @@ -44,6 +69,24 @@ MultiTierCache = kv_cache.MultiTierCache export_results_to_xlsx = kv_cache.export_results_to_xlsx PANDAS_AVAILABLE = kv_cache.PANDAS_AVAILABLE + +# New imports for 01-26-2026 version +ConfigLoader = kv_cache.ConfigLoader +cfg = kv_cache.cfg +get_config = kv_cache.get_config +set_config = kv_cache.set_config +get_qos_profiles = kv_cache.get_qos_profiles +QoSSLA = kv_cache.QoSSLA +YAML_AVAILABLE = kv_cache.YAML_AVAILABLE + +# Input validation imports +validate_args = kv_cache.validate_args +MAX_USERS = kv_cache.MAX_USERS +MAX_DURATION_SECONDS = kv_cache.MAX_DURATION_SECONDS +MAX_GPU_MEMORY_GB = kv_cache.MAX_GPU_MEMORY_GB +MAX_CPU_MEMORY_GB = kv_cache.MAX_CPU_MEMORY_GB +FORBIDDEN_CACHE_PREFIXES = kv_cache.FORBIDDEN_CACHE_PREFIXES + if PANDAS_AVAILABLE: import pandas as pd @@ -190,6 +233,171 @@ class MockArgs: return MockArgs() +@pytest.fixture +def sample_config_yaml(tmp_path): + """Create a sample config.yaml for testing.""" + config_content = ''' +user_templates: + chatbot: + context_range: [256, 1024] + generation_range: [50, 150] + think_time_range: [0.1, 0.5] + coding: + context_range: [1024, 4096] + generation_range: [100, 500] + think_time_range: [0.2, 1.0] + document: + context_range: [2048, 8192] + generation_range: [200, 800] + think_time_range: [0.3, 1.5] + +qos_profiles: + interactive: + target_latency_p95_ms: 50 + target_latency_p99_ms: 100 + target_latency_p999_ms: 150 + target_latency_p9999_ms: 200 + priority: 3 + responsive: + target_latency_p95_ms: 100 + target_latency_p99_ms: 200 + target_latency_p999_ms: 350 + target_latency_p9999_ms: 500 + priority: 2 + batch: + target_latency_p95_ms: 1000 + target_latency_p99_ms: 5000 + target_latency_p999_ms: 7500 + target_latency_p9999_ms: 10000 + 
priority: 1 + +qos_distribution: + interactive_probability: 0.15 + responsive_threshold: 0.50 + +eviction: + max_recursion_depth: 10 + target_usage_ratio: 0.8 + large_entry_limit_ratio: 0.95 + max_evictions_hard_cap: 5000 + max_evictions_min: 1000 + +decode: + batch_size: 32 + +conversation: + max_conversations: 1000 + max_turns_per_conv: 50 + end_conversation_probability: 0.2 +''' + config_file = tmp_path / "test_config.yaml" + config_file.write_text(config_content) + return str(config_file) + + +# ============================================================================= +# Test 0: ConfigLoader (New in 01-26-2026) +# ============================================================================= + +@pytest.mark.skipif(not YAML_AVAILABLE, reason="PyYAML not installed") +class TestConfigLoader: + """Tests for ConfigLoader and cfg() helper function.""" + + def test_config_loader_without_file(self): + """ConfigLoader should work without a config file.""" + loader = ConfigLoader(config_path=None) + assert loader is not None + assert loader.config == {} + + def test_config_loader_loads_yaml(self, sample_config_yaml): + """ConfigLoader should load and parse YAML file.""" + loader = ConfigLoader(config_path=sample_config_yaml) + assert loader.config is not None + assert 'qos_profiles' in loader.config + + def test_config_loader_get_nested_value(self, sample_config_yaml): + """ConfigLoader.get() should retrieve nested values.""" + loader = ConfigLoader(config_path=sample_config_yaml) + priority = loader.get('qos_profiles', 'interactive', 'priority') + assert priority == 3 + + def test_config_loader_get_with_default(self, sample_config_yaml): + """ConfigLoader.get() should return default for missing keys.""" + loader = ConfigLoader(config_path=sample_config_yaml) + value = loader.get('nonexistent', 'key', default=42) + assert value == 42 + + def test_cfg_without_global_config(self): + """cfg() should return default when no global config is set.""" + # Ensure no global 
config + set_config(None) + value = cfg('qos_profiles', 'interactive', 'priority', default=99) + assert value == 99 + + def test_cfg_with_global_config(self, sample_config_yaml): + """cfg() should retrieve values from global config.""" + loader = ConfigLoader(config_path=sample_config_yaml) + set_config(loader) + try: + value = cfg('qos_profiles', 'interactive', 'priority', default=99) + assert value == 3 + finally: + set_config(None) # Clean up + + def test_config_loader_validates_schema(self, tmp_path): + """ConfigLoader should reject unknown keys.""" + bad_config = tmp_path / "bad_config.yaml" + bad_config.write_text(''' +unknown_section: + bad_key: true +''') + with pytest.raises(ValueError, match="Unknown configuration key"): + ConfigLoader(config_path=str(bad_config)) + + def test_get_config_returns_none_initially(self): + """get_config() should return None before set_config() is called.""" + set_config(None) + assert get_config() is None + + def test_set_config_stores_loader(self, sample_config_yaml): + """set_config() should store the ConfigLoader globally.""" + loader = ConfigLoader(config_path=sample_config_yaml) + set_config(loader) + try: + assert get_config() is loader + finally: + set_config(None) + + +class TestCfgHelper: + """Tests for cfg() helper function in various contexts.""" + + def test_cfg_returns_default_for_none_config(self): + """cfg() returns default when config is None.""" + set_config(None) + assert cfg('any', 'path', default='fallback') == 'fallback' + + def test_cfg_returns_default_for_missing_key(self, sample_config_yaml): + """cfg() returns default for missing nested keys.""" + loader = ConfigLoader(config_path=sample_config_yaml) + set_config(loader) + try: + result = cfg('nonexistent', 'nested', 'key', default=123) + assert result == 123 + finally: + set_config(None) + + def test_cfg_retrieves_list_values(self, sample_config_yaml): + """cfg() can retrieve list values from config.""" + loader = 
ConfigLoader(config_path=sample_config_yaml) + set_config(loader) + try: + context_range = cfg('user_templates', 'chatbot', 'context_range') + assert context_range == [256, 1024] + finally: + set_config(None) + + # ============================================================================= # Test 1: ModelConfig # ============================================================================= @@ -318,6 +526,39 @@ def test_sla_compliance_starts_at_one(self): def test_interactive_target_latency(self): sla = QOS_PROFILES[QoSLevel.INTERACTIVE] assert sla.target_latency_p95_ms == 50 + + # New tests for extended QoS percentiles (01-26-2026 feature) + def test_interactive_has_p999_latency(self): + """Test that p999 percentile is defined for INTERACTIVE.""" + sla = QOS_PROFILES[QoSLevel.INTERACTIVE] + assert hasattr(sla, 'target_latency_p999_ms') + assert sla.target_latency_p999_ms > sla.target_latency_p99_ms + + def test_interactive_has_p9999_latency(self): + """Test that p9999 percentile is defined for INTERACTIVE.""" + sla = QOS_PROFILES[QoSLevel.INTERACTIVE] + assert hasattr(sla, 'target_latency_p9999_ms') + assert sla.target_latency_p9999_ms > sla.target_latency_p999_ms + + def test_all_qos_levels_have_extended_percentiles(self): + """Verify all QoS levels have p999 and p9999 defined.""" + for level in QoSLevel: + sla = QOS_PROFILES[level] + assert hasattr(sla, 'target_latency_p999_ms') + assert hasattr(sla, 'target_latency_p9999_ms') + + def test_get_qos_profiles_returns_dict(self): + """Test that get_qos_profiles() returns profiles dict.""" + profiles = get_qos_profiles() + assert isinstance(profiles, dict) + assert len(profiles) == 3 + + def test_get_qos_profiles_levels(self): + """Test that get_qos_profiles() has all QoS levels.""" + profiles = get_qos_profiles() + assert QoSLevel.INTERACTIVE in profiles + assert QoSLevel.RESPONSIVE in profiles + assert QoSLevel.BATCH in profiles # ============================================================================= @@ 
-877,6 +1118,515 @@ def test_initial_cpu_usage_zero(self, multi_tier_cache): assert cpu_usage == 0 +# ============================================================================= +# Test 13: Config-Driven Parameters (New in 01-26-2026) +# ============================================================================= + +class TestConfigDrivenConversationManager: + """Tests for ConversationManager with config-driven parameters.""" + + def test_default_max_conversations(self): + """Without config, should use hardcoded default of 1000.""" + set_config(None) + manager = ConversationManager() + assert manager.max_conversations == 1000 + + def test_default_max_turns(self): + """Without config, should use hardcoded default of 50.""" + set_config(None) + manager = ConversationManager() + assert manager.max_turns_per_conv == 50 + + def test_explicit_params_override_config(self, sample_config_yaml): + """Explicit constructor params should override config values.""" + loader = ConfigLoader(config_path=sample_config_yaml) + set_config(loader) + try: + manager = ConversationManager(max_conversations=42, max_turns_per_conv=7) + assert manager.max_conversations == 42 + assert manager.max_turns_per_conv == 7 + finally: + set_config(None) + + +@pytest.mark.skipif(not YAML_AVAILABLE, reason="PyYAML not installed") +class TestConfigDrivenUserSimulator: + """Tests for UserSimulator with config-driven parameters.""" + + def test_user_templates_from_config(self, sample_config_yaml): + """UserSimulator should read templates from config.""" + loader = ConfigLoader(config_path=sample_config_yaml) + set_config(loader) + try: + templates = UserSimulator._get_user_templates() + assert 'chatbot' in templates + assert 'coding' in templates + assert 'document' in templates + assert templates['chatbot']['context_range'] == (256, 1024) + finally: + set_config(None) + + def test_qos_distribution_from_config(self, sample_config_yaml): + """UserSimulator.generate_mixed_users should use config QoS 
distribution.""" + loader = ConfigLoader(config_path=sample_config_yaml) + set_config(loader) + try: + # Generate many users to test distribution + users = UserSimulator.generate_mixed_users(1000) + # With 15% interactive probability, expect ~150 interactive users + interactive_count = sum(1 for u in users if u.qos_level == QoSLevel.INTERACTIVE) + # Allow 50% variance for randomness + assert 75 <= interactive_count <= 225, f"Expected ~150 interactive, got {interactive_count}" + finally: + set_config(None) + + +# ============================================================================= +# Test 14: Stats Naming Convention (storage_* vs nvme_*) +# ============================================================================= + +class TestStatsNamingConvention: + """Tests that stats use 'storage_*' naming (not 'nvme_*') in 01-26-2026.""" + + def test_stats_use_storage_prefix(self, multi_tier_cache): + """Stats should use 'storage_' prefix instead of 'nvme_'.""" + multi_tier_cache.allocate_cache("test_entry", num_tokens=100) + multi_tier_cache.access_cache("test_entry", InferencePhase.DECODE) + stats = multi_tier_cache.get_stats(duration=1.0) + + # Check for storage_* naming + storage_keys = [k for k in stats.keys() if 'storage_' in k.lower()] + nvme_keys = [k for k in stats.keys() if 'nvme_' in k.lower()] + + # Should have storage_* keys + assert len(storage_keys) > 0, "Expected storage_* keys in stats" + + def test_tier_stats_key_format(self, multi_tier_cache): + """tier_storage_* keys should exist (renamed from tier_nvme_*).""" + multi_tier_cache.allocate_cache("test_entry", num_tokens=100) + stats = multi_tier_cache.get_stats(duration=1.0) + + # Check for tier_storage_* keys + tier_storage_keys = [k for k in stats.keys() if k.startswith('tier_storage_')] + assert len(tier_storage_keys) > 0, "Expected tier_storage_* keys in stats" + + +# ============================================================================= +# Test 15: GPUMemoryBackend Eviction Callback 
(New in 01-26-2026) +# ============================================================================= + +@pytest.mark.skipif(not CUDA_AVAILABLE, reason="CUDA not available") +class TestGPUMemoryBackendEvictionCallback: + """Tests for GPUMemoryBackend's on_eviction_callback feature.""" + + def test_gpu_backend_accepts_callback(self): + """GPUMemoryBackend should accept on_eviction_callback parameter.""" + evicted_keys = [] + def callback(key, tier, size): + evicted_keys.append((key, tier, size)) + + backend = GPUMemoryBackend(on_eviction_callback=callback) + assert backend.on_eviction_callback is callback + backend.clear() + + def test_gpu_backend_works_without_callback(self): + """GPUMemoryBackend should work without a callback (None).""" + backend = GPUMemoryBackend(on_eviction_callback=None) + assert backend.on_eviction_callback is None + backend.clear() + + +# ============================================================================= +# Test 16: Input Validation (validate_args) +# ============================================================================= + +class TestValidateArgs: + """Tests for the validate_args() input validation function.""" + + @pytest.fixture + def valid_args(self): + """Create a valid args namespace with all required attributes.""" + import argparse + args = argparse.Namespace( + num_users=100, + duration=60, + gpu_mem_gb=16, + cpu_mem_gb=32, + rag_num_docs=10, + max_conversations=500, + max_concurrent_allocs=0, + request_rate=0, + max_requests=0, + target_saturation=0.8, + cache_dir=None + ) + return args + + def test_valid_args_pass_through(self, valid_args): + """Valid arguments should pass validation and return unchanged.""" + result = validate_args(valid_args) + assert result is valid_args + assert result.num_users == 100 + assert result.duration == 60 + + def test_num_users_zero_rejected(self, valid_args): + """num_users=0 should raise ValueError.""" + valid_args.num_users = 0 + with pytest.raises(ValueError, match="num-users 
must be positive"): + validate_args(valid_args) + + def test_num_users_negative_rejected(self, valid_args): + """Negative num_users should raise ValueError.""" + valid_args.num_users = -5 + with pytest.raises(ValueError, match="num-users must be positive"): + validate_args(valid_args) + + def test_num_users_exceeds_limit(self, valid_args): + """num_users exceeding MAX_USERS should raise ValueError.""" + valid_args.num_users = MAX_USERS + 1 + with pytest.raises(ValueError, match="num-users exceeds limit"): + validate_args(valid_args) + + def test_duration_zero_rejected(self, valid_args): + """duration=0 should raise ValueError.""" + valid_args.duration = 0 + with pytest.raises(ValueError, match="duration must be positive"): + validate_args(valid_args) + + def test_duration_negative_rejected(self, valid_args): + """Negative duration should raise ValueError.""" + valid_args.duration = -10 + with pytest.raises(ValueError, match="duration must be positive"): + validate_args(valid_args) + + def test_duration_exceeds_limit(self, valid_args): + """duration exceeding 24 hours should raise ValueError.""" + valid_args.duration = MAX_DURATION_SECONDS + 1 + with pytest.raises(ValueError, match="duration exceeds 24 hours"): + validate_args(valid_args) + + def test_gpu_mem_negative_rejected(self, valid_args): + """Negative gpu_mem_gb should raise ValueError.""" + valid_args.gpu_mem_gb = -1 + with pytest.raises(ValueError, match="gpu-mem-gb cannot be negative"): + validate_args(valid_args) + + def test_gpu_mem_zero_allowed(self, valid_args): + """gpu_mem_gb=0 should be valid (disables GPU tier).""" + valid_args.gpu_mem_gb = 0 + result = validate_args(valid_args) + assert result.gpu_mem_gb == 0 + + def test_gpu_mem_exceeds_limit(self, valid_args): + """gpu_mem_gb exceeding limit should raise ValueError.""" + valid_args.gpu_mem_gb = MAX_GPU_MEMORY_GB + 1 + with pytest.raises(ValueError, match="gpu-mem-gb exceeds limit"): + validate_args(valid_args) + + def 
test_cpu_mem_negative_rejected(self, valid_args): + """Negative cpu_mem_gb should raise ValueError.""" + valid_args.cpu_mem_gb = -1 + with pytest.raises(ValueError, match="cpu-mem-gb cannot be negative"): + validate_args(valid_args) + + def test_cpu_mem_zero_allowed(self, valid_args): + """cpu_mem_gb=0 should be valid.""" + valid_args.cpu_mem_gb = 0 + result = validate_args(valid_args) + assert result.cpu_mem_gb == 0 + + def test_cpu_mem_exceeds_limit(self, valid_args): + """cpu_mem_gb exceeding limit should raise ValueError.""" + valid_args.cpu_mem_gb = MAX_CPU_MEMORY_GB + 1 + with pytest.raises(ValueError, match="cpu-mem-gb exceeds limit"): + validate_args(valid_args) + + def test_target_saturation_below_zero_rejected(self, valid_args): + """target_saturation < 0 should raise ValueError.""" + valid_args.target_saturation = -0.1 + with pytest.raises(ValueError, match="target-saturation must be between 0.0 and 1.0"): + validate_args(valid_args) + + def test_target_saturation_above_one_rejected(self, valid_args): + """target_saturation > 1 should raise ValueError.""" + valid_args.target_saturation = 1.5 + with pytest.raises(ValueError, match="target-saturation must be between 0.0 and 1.0"): + validate_args(valid_args) + + def test_target_saturation_boundaries_valid(self, valid_args): + """target_saturation at 0.0 and 1.0 should be valid.""" + valid_args.target_saturation = 0.0 + result = validate_args(valid_args) + assert result.target_saturation == 0.0 + + valid_args.target_saturation = 1.0 + result = validate_args(valid_args) + assert result.target_saturation == 1.0 + + def test_rag_num_docs_negative_rejected(self, valid_args): + """Negative rag_num_docs should raise ValueError.""" + valid_args.rag_num_docs = -1 + with pytest.raises(ValueError, match="rag-num-docs cannot be negative"): + validate_args(valid_args) + + def test_max_conversations_zero_rejected(self, valid_args): + """max_conversations=0 should raise ValueError.""" + valid_args.max_conversations = 0 + 
with pytest.raises(ValueError, match="max-conversations must be positive"): + validate_args(valid_args) + + def test_max_concurrent_allocs_negative_rejected(self, valid_args): + """Negative max_concurrent_allocs should raise ValueError.""" + valid_args.max_concurrent_allocs = -1 + with pytest.raises(ValueError, match="max-concurrent-allocs cannot be negative"): + validate_args(valid_args) + + def test_request_rate_negative_rejected(self, valid_args): + """Negative request_rate should raise ValueError.""" + valid_args.request_rate = -1 + with pytest.raises(ValueError, match="request-rate cannot be negative"): + validate_args(valid_args) + + def test_max_requests_negative_rejected(self, valid_args): + """Negative max_requests should raise ValueError.""" + valid_args.max_requests = -1 + with pytest.raises(ValueError, match="max-requests cannot be negative"): + validate_args(valid_args) + + @pytest.mark.skipif(sys.platform == 'win32', reason="Unix paths not valid on Windows") + def test_forbidden_cache_dir_rejected(self, valid_args): + """Cache directories in system paths should be rejected.""" + valid_args.cache_dir = '/etc/kv_cache' + with pytest.raises(ValueError, match="cannot be a system directory"): + validate_args(valid_args) + + def test_valid_cache_dir_allowed(self, valid_args, tmp_path): + """Valid cache directory should be accepted.""" + valid_args.cache_dir = str(tmp_path / "kv_cache_test") + result = validate_args(valid_args) + assert result.cache_dir == str(tmp_path / "kv_cache_test") + + def test_multiple_errors_collected(self, valid_args): + """Multiple validation errors should all be reported.""" + valid_args.num_users = -1 + valid_args.duration = -1 + valid_args.gpu_mem_gb = -1 + with pytest.raises(ValueError) as exc_info: + validate_args(valid_args) + # All three errors should be in the message + error_msg = str(exc_info.value) + assert "num-users" in error_msg + assert "duration" in error_msg + assert "gpu-mem-gb" in error_msg + + +# 
============================================================================= +# Test 17: Per-Tier Phase Metrics +# ============================================================================= + +class TestPerTierPhaseMetrics: + """Tests for per-tier KV bytes tracking (prefill/decode per tier).""" + + @pytest.fixture + def tiny_model_config(self): + """Return the tiny-1b model config for fast tests.""" + return MODEL_CONFIGS['tiny-1b'] + + @pytest.fixture + def multi_tier_cache_cpu_only(self, tiny_model_config): + """Return a MultiTierCache in CPU-only mode (GPU disabled).""" + return MultiTierCache( + model_config=tiny_model_config, + gpu_memory_gb=0, + cpu_memory_gb=0.1, # 100MB + seed=42 + ) + + def test_stats_have_tier_kv_bytes_written_keys(self, multi_tier_cache_cpu_only): + """Stats should include tier_*_kv_bytes_written keys.""" + multi_tier_cache_cpu_only.allocate_cache("test_entry", num_tokens=100) + stats = multi_tier_cache_cpu_only.get_stats(duration=1.0) + + # Check for per-tier write tracking + assert 'tier_gpu_kv_bytes_written_gb' in stats + assert 'tier_cpu_kv_bytes_written_gb' in stats + assert 'tier_storage_kv_bytes_written_gb' in stats + + def test_stats_have_tier_kv_bytes_read_keys(self, multi_tier_cache_cpu_only): + """Stats should include tier_*_kv_bytes_read keys.""" + multi_tier_cache_cpu_only.allocate_cache("test_entry", num_tokens=100) + multi_tier_cache_cpu_only.access_cache("test_entry", InferencePhase.DECODE) + stats = multi_tier_cache_cpu_only.get_stats(duration=1.0) + + # Check for per-tier read tracking + assert 'tier_gpu_kv_bytes_read_gb' in stats + assert 'tier_cpu_kv_bytes_read_gb' in stats + assert 'tier_storage_kv_bytes_read_gb' in stats + + def test_cpu_write_bytes_increment_on_allocate(self, multi_tier_cache_cpu_only): + """Allocating to CPU tier should increment tier_cpu_kv_bytes_written.""" + # Get initial stats + stats_before = multi_tier_cache_cpu_only.get_stats(duration=1.0) + cpu_written_before = 
stats_before.get('tier_cpu_kv_bytes_written_gb', 0) + + # Allocate cache entry (goes to CPU since GPU is disabled) + success, location, _ = multi_tier_cache_cpu_only.allocate_cache("test_entry", num_tokens=100) + assert success + assert location == 'cpu' + + # Check that CPU write bytes increased + stats_after = multi_tier_cache_cpu_only.get_stats(duration=1.0) + cpu_written_after = stats_after.get('tier_cpu_kv_bytes_written_gb', 0) + + assert cpu_written_after > cpu_written_before, \ + f"CPU write bytes should increase: {cpu_written_before} -> {cpu_written_after}" + + def test_cpu_read_bytes_increment_on_access(self, multi_tier_cache_cpu_only): + """Accessing from CPU tier should increment tier_cpu_kv_bytes_read.""" + # Allocate first + multi_tier_cache_cpu_only.allocate_cache("test_entry", num_tokens=100) + + # Get stats before access + stats_before = multi_tier_cache_cpu_only.get_stats(duration=1.0) + cpu_read_before = stats_before.get('tier_cpu_kv_bytes_read_gb', 0) + + # Access the cache entry + location, _ = multi_tier_cache_cpu_only.access_cache("test_entry", InferencePhase.DECODE) + assert location == 'cpu' + + # Check that CPU read bytes increased + stats_after = multi_tier_cache_cpu_only.get_stats(duration=1.0) + cpu_read_after = stats_after.get('tier_cpu_kv_bytes_read_gb', 0) + + assert cpu_read_after > cpu_read_before, \ + f"CPU read bytes should increase: {cpu_read_before} -> {cpu_read_after}" + + def test_gpu_bytes_zero_when_gpu_disabled(self, multi_tier_cache_cpu_only): + """With GPU disabled (0 GB), GPU tier bytes should remain zero.""" + # Do some allocations and accesses + for i in range(5): + multi_tier_cache_cpu_only.allocate_cache(f"entry_{i}", num_tokens=100) + for i in range(5): + multi_tier_cache_cpu_only.access_cache(f"entry_{i}", InferencePhase.DECODE) + + stats = multi_tier_cache_cpu_only.get_stats(duration=1.0) + + # GPU bytes should be zero since GPU tier is disabled + assert stats.get('tier_gpu_kv_bytes_written_gb', 0) == 0, \ + "GPU 
write bytes should be 0 when GPU disabled" + assert stats.get('tier_gpu_kv_bytes_read_gb', 0) == 0, \ + "GPU read bytes should be 0 when GPU disabled" + + def test_storage_tier_overflow(self, tiny_model_config): + """When CPU is full, allocations should overflow to storage tier.""" + # Create cache with very small CPU limit + cache = MultiTierCache( + model_config=tiny_model_config, + gpu_memory_gb=0, + cpu_memory_gb=0.001, # 1MB - very small + seed=42 + ) + + # Allocate enough to overflow CPU + for i in range(20): + cache.allocate_cache(f"entry_{i}", num_tokens=1000) + + stats = cache.get_stats(duration=1.0) + + # Storage tier should have received some data + storage_written = stats.get('tier_storage_kv_bytes_written_gb', 0) + assert storage_written > 0, \ + f"Storage tier should have data when CPU overflows: {storage_written}" + + def test_per_tier_bandwidth_calculated(self, multi_tier_cache_cpu_only): + """Per-tier bandwidth stats should be calculated.""" + # Do some I/O + for i in range(10): + multi_tier_cache_cpu_only.allocate_cache(f"entry_{i}", num_tokens=100) + for i in range(10): + multi_tier_cache_cpu_only.access_cache(f"entry_{i}", InferencePhase.DECODE) + + stats = multi_tier_cache_cpu_only.get_stats(duration=1.0) + + # Bandwidth stats should exist + assert 'tier_cpu_read_bandwidth_gbps' in stats + assert 'tier_cpu_write_bandwidth_gbps' in stats + assert 'tier_storage_read_bandwidth_gbps' in stats + assert 'tier_storage_write_bandwidth_gbps' in stats + + +@pytest.mark.skipif(not CUDA_AVAILABLE, reason="CUDA not available") +class TestPerTierPhaseMetricsWithGPU: + """Tests for per-tier metrics when GPU is enabled.""" + + @pytest.fixture + def tiny_model_config(self): + """Return the tiny-1b model config for fast tests.""" + return MODEL_CONFIGS['tiny-1b'] + + @pytest.fixture + def multi_tier_cache_with_gpu(self, tiny_model_config): + """Return a MultiTierCache with GPU enabled.""" + return MultiTierCache( + model_config=tiny_model_config, + 
gpu_memory_gb=1.0, # 1GB GPU + cpu_memory_gb=0.1, # 100MB CPU + seed=42 + ) + + def test_gpu_write_bytes_increment_on_allocate(self, multi_tier_cache_with_gpu): + """Allocating to GPU tier should increment tier_gpu_kv_bytes_written.""" + # Get initial stats + stats_before = multi_tier_cache_with_gpu.get_stats(duration=1.0) + gpu_written_before = stats_before.get('tier_gpu_kv_bytes_written_gb', 0) + + # Allocate cache entry (should go to GPU first) + success, location, _ = multi_tier_cache_with_gpu.allocate_cache("test_entry", num_tokens=100) + assert success + assert location == 'gpu' + + # Check that GPU write bytes increased + stats_after = multi_tier_cache_with_gpu.get_stats(duration=1.0) + gpu_written_after = stats_after.get('tier_gpu_kv_bytes_written_gb', 0) + + assert gpu_written_after > gpu_written_before, \ + f"GPU write bytes should increase: {gpu_written_before} -> {gpu_written_after}" + + def test_gpu_read_bytes_increment_on_access(self, multi_tier_cache_with_gpu): + """Accessing from GPU tier should increment tier_gpu_kv_bytes_read.""" + # Allocate first + multi_tier_cache_with_gpu.allocate_cache("test_entry", num_tokens=100) + + # Get stats before access + stats_before = multi_tier_cache_with_gpu.get_stats(duration=1.0) + gpu_read_before = stats_before.get('tier_gpu_kv_bytes_read_gb', 0) + + # Access the cache entry + location, _ = multi_tier_cache_with_gpu.access_cache("test_entry", InferencePhase.DECODE) + assert location == 'gpu' + + # Check that GPU read bytes increased + stats_after = multi_tier_cache_with_gpu.get_stats(duration=1.0) + gpu_read_after = stats_after.get('tier_gpu_kv_bytes_read_gb', 0) + + assert gpu_read_after > gpu_read_before, \ + f"GPU read bytes should increase: {gpu_read_before} -> {gpu_read_after}" + + def test_gpu_bandwidth_calculated(self, multi_tier_cache_with_gpu): + """GPU tier bandwidth stats should be calculated.""" + # Do some I/O + for i in range(5): + multi_tier_cache_with_gpu.allocate_cache(f"entry_{i}", 
num_tokens=100) + for i in range(5): + multi_tier_cache_with_gpu.access_cache(f"entry_{i}", InferencePhase.DECODE) + + stats = multi_tier_cache_with_gpu.get_stats(duration=1.0) + + # GPU bandwidth stats should exist + assert 'tier_gpu_read_bandwidth_gbps' in stats + assert 'tier_gpu_write_bandwidth_gbps' in stats + + # ============================================================================= # Main entry point for running without pytest # ============================================================================= @@ -885,8 +1635,10 @@ def pytest_configure(config): """Add metadata to pytest-html report.""" if hasattr(config, '_metadata'): config._metadata['Project'] = 'MLPerf v3 KV Cache Benchmark' + config._metadata['Source File'] = 'kv-cache.py' config._metadata['Models'] = 'tiny-1b, mistral-7b, llama2-7b, llama3.1-8b, llama3.1-70b-instruct' config._metadata['Test File'] = 'test_kv_cache.py' + config._metadata['New Features Tested'] = 'ConfigLoader, Extended QoS (p999/p9999), cfg() helper, storage_* naming' def pytest_html_report_title(report): From 29562889768fe75e312c55e8b822503fb478848d Mon Sep 17 00:00:00 2001 From: Hazem Awadallah Date: Tue, 27 Jan 2026 15:44:14 -0800 Subject: [PATCH 07/43] docs(readme): comprehensive documentation for v3.0 - Add Configuration section with YAML parameter reference - Add MLPerf Submission Guidelines with validated commands - Add Excel metrics reference table with all output columns - Add installation instructions including pyyaml dependency - Add CLI arguments vs config file precedence documentation - Add workload definitions and tier configuration examples - Add troubleshooting section for common issues --- kv_cache_benchmark/README.md | 2098 +++++++++++++++++++++------------- 1 file changed, 1332 insertions(+), 766 deletions(-) diff --git a/kv_cache_benchmark/README.md b/kv_cache_benchmark/README.md index 5f0637c1..b7599b28 100644 --- a/kv_cache_benchmark/README.md +++ b/kv_cache_benchmark/README.md @@ -1,766 +1,1332 
@@ -# MLPerf Storage KV Cache Benchmark - -A storage benchmarking tool for Large Language Model inference systems. This benchmark measures the performance of your storage subsystem under realistic KV cache offloading workloads, helping you answer critical questions about hardware capacity and configuration. - -**Author:** Hazem Awadallah, Kingston Digital -**License:** Apache 2.0 -**Version:** MLPerf Storage v3.0 (Enhanced) - ---- - -## Table of Contents - -1. [What This Benchmark Does](#what-this-benchmark-does) -2. [Architecture Overview](#architecture-overview) -3. [System Requirements](#system-requirements) -4. [Installation](#installation) -5. [Quick Start](#quick-start) -6. [Running the Benchmark](#running-the-benchmark) -7. [ShareGPT Replay Workloads](#sharegpt-replay-workloads) -8. [Using the Wrapper Script](#using-the-wrapper-script) -9. [Understanding Results](#understanding-results) -10. [Unit Testing](#unit-testing) -11. [Excel Export](#excel-export) -12. [MLPerf Submission Guidelines](#mlperf-submission-guidelines) -13. [Troubleshooting](#troubleshooting) - ---- - -## What This Benchmark Does - -During LLM inference, models store intermediate attention data in a structure called the KV (Key-Value) cache. This cache grows with conversation length and can consume enormous amounts of memory. Production systems offload this cache from expensive GPU VRAM to cheaper CPU RAM or NVMe storage. - -This benchmark simulates that offloading behavior. It generates realistic multi-user inference workloads and measures how your storage performs under pressure. It measures these components: - -- How many concurrent users your hardware can support -- Whether your NVMe drive is fast enough to handle cache spillover -- The real latency impact of each storage tier -- Where the bottleneck sits in your system - -This is not a pass/fail test. It is a diagnostic tool for system architects and performance engineers. 
- ---- - -## Architecture Overview - -The benchmark implements a three-tier memory hierarchy that mirrors production LLM serving systems. - -``` -┌─────────────────────────────────────────────────────────────────────────────┐ -│ KV Cache Benchmark Architecture │ -└─────────────────────────────────────────────────────────────────────────────┘ - - ┌──────────────────┐ - │ User Requests │ - │ (Multi-tenant) │ - └────────┬─────────┘ - │ - ▼ - ┌──────────────────────────────────────┐ - │ Request Queue │ - │ (Priority-based: QoS levels) │ - │ Interactive > Responsive > Batch │ - └──────────────────┬───────────────────┘ - │ - ▼ - ┌────────────────────────────────────────────────────────┐ - │ IntegratedBenchmark │ - │ ┌─────────────┐ ┌─────────────┐ ┌─────────────────┐ │ - │ │ Prefill │ │ Decode │ │ Conversation │ │ - │ │ (Write) │ │ (Read) │ │ Manager │ │ - │ └──────┬──────┘ └──────┬──────┘ └────────┬────────┘ │ - └─────────┼────────────────┼─────────────────┼───────────┘ - │ │ │ - └────────────────┼─────────────────┘ - │ - ▼ -┌─────────────────────────────────────────────────────────────────────────────┐ -│ MultiTierCache │ -│ (Waterfall LRU Eviction) │ -│ │ -│ New Data ─────► Always targets fastest available tier │ -│ If full, LRU entry cascades down │ -│ │ -│ ┌─────────────────────────────────────────────────────────────────────┐ │ -│ │ │ │ -│ │ ┌───────────────┐ ┌───────────────┐ ┌───────────────┐ │ │ -│ │ │ GPU VRAM │ │ CPU RAM │ │ NVMe │ │ │ -│ │ │ (Tier 1) │─────►│ (Tier 2) │─────►│ (Tier 3) │ │ │ -│ │ │ │ LRU │ │ LRU │ │ │ │ -│ │ │ Sub-ms │evict │ Tens of ms │evict │ Hundreds │ │ │ -│ │ │ latency │ │ latency │ │ of ms │ │ │ -│ │ │ │ │ │ │ │ │ │ -│ │ │ PyTorch/CuPy │ │ NumPy arrays │ │ .npy files │ │ │ -│ │ │ tensors │ │ in memory │ │ on disk │ │ │ -│ │ └───────────────┘ └───────────────┘ └───────────────┘ │ │ -│ │ │ │ -│ │ ◄──── HOT DATA ────────────────────────────── COLD DATA ────► │ │ -│ │ │ │ -│ 
└─────────────────────────────────────────────────────────────────────┘ │ -│ │ -└─────────────────────────────────────────────────────────────────────────────┘ -``` - -### Key Components - -**MultiTierCache**: The core engine. It decides where to place data based on available space and access patterns. New data always targets the fastest tier. When that tier fills up, the least recently used entry gets pushed down to the next tier. - -**Inference Phases**: The benchmark models two distinct I/O patterns: -- **Prefill**: Write-heavy. Processing the user prompt generates new KV cache entries. -- **Decode**: Read-heavy. Generating each output token requires reading the existing cache. - -**User Simulation**: Creates realistic traffic from multiple concurrent users with different behaviors (chatbot, coding assistant, document analysis) and priority levels. - -**Autoscaler**: Automatically adjusts user load to find either the maximum users your system can handle (QoS mode) or the peak throughput of your storage (capacity mode). - ---- - -## System Requirements - -### Minimum - -- CPU: 8+ cores (AMD EPYC, Intel Xeon) -- RAM: 32 GB -- Storage: 256 GB free space on SSD -- OS: Linux (Ubuntu 22.04, RHEL 9, or similar) -- Python: 3.8 or higher -- No GPU required (runs in CPU-only mode) - -### Recommended - -- CPU: 32+ cores -- RAM: 128 GB or more -- GPU: NVIDIA A100/H100 with 40+ GB VRAM (optional but enables full three-tier testing) -- Storage: 1 TB+ on NVMe (PCIe Gen4 or Gen5) -- Tools: `bc`, `jq` for the wrapper script - ---- - -## Installation - -1. Clone or download this repository. - -2. Install Python dependencies: - -```bash -pip install -r requirements.txt -``` - -Or install core dependencies manually: - -```bash -pip install numpy -``` - -3. For GPU support (optional): - -```bash -pip install torch # or cupy-cuda12x for CuPy -``` - -4. For ShareGPT replay workloads (optional): - -```bash -pip install tiktoken -``` - -5. 
For Excel export (optional): - -```bash -pip install pandas openpyxl -``` - -6. Verify the installation: - -```bash -python3 kv-cache.py --help -``` - ---- - -## Quick Start - -Run a basic storage test with 50 users for 2 minutes: - -```bash -python3 kv-cache.py \ - --model llama3.1-8b \ - --num-users 50 \ - --duration 120 \ - --gpu-mem-gb 0 \ - --cpu-mem-gb 4 \ - --generation-mode realistic \ - --cache-dir /mnt/nvme \ - --seed 42 \ - --output results.json -``` - -This forces all cache operations to hit your NVMe drive, giving you a baseline measurement of storage performance. - ---- - -## Running the Benchmark - -### Command Line Options - -``` -python3 kv-cache.py [options] - -Required Arguments: - --model MODEL Model configuration to use. Choices: - tiny-1b, mistral-7b, llama2-7b, llama3.1-8b, - llama3.1-70b-instruct - --num-users N Number of concurrent users to simulate - --duration SECONDS Duration of the benchmark in seconds - -Memory Configuration: - --gpu-mem-gb N GPU VRAM budget in GB (0 to disable GPU tier) - --cpu-mem-gb N CPU RAM budget in GB (0 to disable CPU tier) - --cache-dir PATH Directory for NVMe cache files (defaults to temp directory) - -Token Generation: - --generation-mode Token generation speed simulation. Choices: - - none: Pure storage test, no GPU simulation - - fast: 2ms per token (high-end GPU) - - realistic: 30ms per token (typical production) - -Caching Features: - --disable-multi-turn Disable multi-turn conversation caching - --disable-prefix-caching - Disable prefix caching (shared system prompts) - -Autoscaling: - --enable-autoscaling Enable workload autoscaling - --autoscaler-mode Autoscaling strategy. 
Choices: - - qos: Latency-based, finds max users at target saturation - - capacity: Throughput-based, finds peak storage performance - --target-saturation N Target storage saturation for QoS autoscaling (0.0-1.0, - default: 0.8) - -ShareGPT Replay (NEW): - --dataset-path PATH Path to ShareGPT JSON for realistic workload replay - --max-conversations N Max conversations to load from dataset (default: 500) - --request-rate RATE Target request arrival rate (requests/sec) - --max-requests N Stop after N requests (for fixed-length runs) - -RAG Workload: - --enable-rag Enable RAG workload simulation - --rag-num-docs N Number of RAG documents to ingest - -Performance and Output: - --performance-profile Profile for pass/fail criteria. Choices: - - latency: Default, evaluates P95 latency targets - - throughput: For MLPerf submission, evaluates tokens/sec - --output FILE Write results to JSON file - --xlsx-output FILE Export results to Excel/CSV file (NEW) - --seed N Seed for random number generators (required for MLPerf - reproducibility) - -Resource Limits: - --max-concurrent-allocs N - Limit concurrent cache allocations to bound RAM usage. - 0 = unlimited. Recommended: 8-16 for large models to - prevent memory explosion. -``` - -### Test Scenarios - -#### Scenario 1: Storage-Only Baseline - -Isolate your NVMe drive by setting GPU memory to zero. This tells you the raw performance of your storage. - -```bash -python3 kv-cache.py \ - --model llama3.1-8b \ - --num-users 50 \ - --duration 180 \ - --gpu-mem-gb 0 \ - --cpu-mem-gb 4 \ - --generation-mode realistic \ - --cache-dir /mnt/nvme \ - --seed 42 \ - --output results_storage_only.json -``` - -#### Scenario 2: Realistic Production Setup - -Test a balanced three-tier configuration that mirrors production deployment. 
- -```bash -python3 kv-cache.py \ - --model llama3.1-8b \ - --num-users 100 \ - --duration 300 \ - --gpu-mem-gb 16 \ - --cpu-mem-gb 32 \ - --generation-mode realistic \ - --cache-dir /mnt/nvme \ - --seed 42 \ - --output results_production.json -``` - -#### Scenario 3: Find Maximum User Count (QoS Mode) - -Let the autoscaler discover how many users your system can handle while maintaining acceptable latency. - -```bash -python3 kv-cache.py \ - --model llama3.1-8b \ - --num-users 20 \ - --duration 300 \ - --gpu-mem-gb 16 \ - --cpu-mem-gb 32 \ - --enable-autoscaling \ - --autoscaler-mode qos \ - --generation-mode realistic \ - --cache-dir /mnt/nvme \ - --seed 42 \ - --output results_autoscale_qos.json -``` - -#### Scenario 4: Find Peak Storage Throughput (Capacity Mode) - -Discover the absolute maximum I/O your storage can deliver by ignoring latency constraints. - -```bash -python3 kv-cache.py \ - --model llama3.1-70b-instruct \ - --num-users 10 \ - --duration 180 \ - --gpu-mem-gb 0 \ - --cpu-mem-gb 4 \ - --enable-autoscaling \ - --autoscaler-mode capacity \ - --generation-mode none \ - --cache-dir /mnt/nvme \ - --seed 42 \ - --output results_capacity.json -``` - ---- - -## ShareGPT Replay Workloads - -While synthetic workloads are excellent for controlled stress testing, they may not capture the nuances of real human-AI interaction. The **ShareGPT Replay** feature addresses this by loading actual conversation data. - -### Why Use ShareGPT? 
- -Real conversations exhibit different patterns than synthetic workloads: -- **Higher cache locality**: Users ask follow-up questions, reusing context -- **Variable context sizes**: Real queries vary wildly (10-16,000 tokens) -- **Multi-turn structure**: Conversation flows are preserved - -### Downloading the ShareGPT Dataset - -Download the full dataset from Hugging Face (~1.2 GB): - -```bash -wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json -``` - -**Alternative: Smaller subset for quick testing (~40 MB):** - -```bash -wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split_no_imsorry.json -``` - -### Basic ShareGPT Invocation - -```bash -python3 kv-cache.py \ - --model llama3.1-8b \ - --dataset-path ShareGPT_V3_unfiltered_cleaned_split.json \ - --max-conversations 500 \ - --num-users 50 \ - --duration 300 \ - --gpu-mem-gb 0 \ - --cpu-mem-gb 4 \ - --generation-mode realistic \ - --cache-dir /mnt/nvme \ - --seed 42 \ - --output results_sharegpt.json -``` - -### ShareGPT with Rate Limiting - -Control the request arrival rate for steady-state testing: - -```bash -python3 kv-cache.py \ - --model llama3.1-70b-instruct \ - --dataset-path ShareGPT_V3_unfiltered_cleaned_split.json \ - --max-conversations 1000 \ - --request-rate 10.0 \ - --num-users 100 \ - --duration 600 \ - --gpu-mem-gb 0 \ - --cpu-mem-gb 8 \ - --generation-mode none \ - --cache-dir /mnt/nvme \ - --seed 42 \ - --output results_sharegpt_rate_limited.json -``` - -### ShareGPT with Fixed Request Count - -Run exactly N requests for reproducible benchmarks: - -```bash -python3 kv-cache.py \ - --model llama3.1-8b \ - --dataset-path ShareGPT_V3_unfiltered_cleaned_split.json \ - --max-requests 5000 \ - --num-users 50 \ - --gpu-mem-gb 0 \ - --cpu-mem-gb 4 \ - --generation-mode realistic \ - --cache-dir /mnt/nvme \ - --seed 42 \ - --output 
results_sharegpt_fixed.json -``` - -### Comparing Real vs Synthetic Workloads - -| Metric | ShareGPT (Real) | Synthetic (Random) | -| :--- | :--- | :--- | -| Mean Context Size | ~133 tokens | ~2,676 tokens | -| Cache Hit Rate | 85-97% | 50-70% | -| Multi-turn Locality | High | Medium | -| Throughput | Higher | Lower | -| NVMe Stress | Moderate | Extreme | - -**Use ShareGPT** when you want to model real chatbot/assistant usage. -**Use Synthetic** when you want worst-case stress testing or controlled experiments. - ---- - -## Using the Wrapper Script - -The `kv-cache-wrapper.sh` script automates a complete benchmark suite. It detects your hardware, calculates appropriate parameters, and runs multiple test scenarios. - -### Basic Usage - -```bash -./kv-cache-wrapper.sh -``` - -This runs all test scenarios with default settings. Expect roughly 30 minutes for the full suite. - -### Options - -``` -./kv-cache-wrapper.sh [options] - - -m MODEL Model to benchmark (default: llama3.1-8b) - -t SECONDS Duration for tier comparison tests (default: 120) - -s SECONDS Duration for storage saturation test (default: 180) - -r SECONDS Duration for production test (default: 180) - -a SECONDS Duration for autoscaling tests (default: 300) - -w LIST Comma-separated list of workloads to run - -u USERS Override baseline user count - -U USERS Override high-load user count - -R Enable RAG workload - -D DOCS Number of RAG documents (default: 10) - -h Show help -``` - -### Available Workloads - -```bash -# Run only the storage isolation test -./kv-cache-wrapper.sh -w storage-only - -# Run production and autoscaling tests -./kv-cache-wrapper.sh -w production,autoscale - -# Run MLPerf submission tests -./kv-cache-wrapper.sh -w mlperf_submission -``` - ---- - -## Understanding Results - -### Key Metrics - -**Throughput (tokens/sec)**: How many tokens the system processes per second. Higher is better. 
- -**Storage Throughput (tokens/sec)**: Raw I/O performance calculated from storage latency, not wall-clock time. This is the fairer metric for comparing storage tiers. - -**End-to-End Latency**: Total time from request submission to completion. This is what users experience. - -**Storage I/O Latency**: Time spent reading from and writing to storage tiers. This measures your hardware. - -**Queue Wait Time**: Time requests spend waiting before processing begins. If this dominates, your system is overloaded. - -**Cache Hit Rate**: Percentage of reads served from cache. Higher rates mean less storage pressure. - -### Reading the Output - -``` -### STORAGE PERFORMANCE ASSESSMENT: PASS ### - Criteria Passed: 4/4 - [PASS] NVMe Write P95 < 500ms: 45.20ms - [PASS] NVMe Read P95 < 200ms: 123.45ms - [PASS] CPU RAM P95 < 150ms: 12.30ms - [PASS] Cache Hit Rate > 30%: 67.5% - -### OVERALL PERFORMANCE ### - Total Requests: 2847 - Total Tokens Generated: 489,231 - Avg Throughput: 1,630.77 tok/s - Storage Throughput: 2,105.32 tok/s - -### LATENCY BREAKDOWN ### - End-to-End: mean 89.3ms, P50 45.2ms, P95 312.4ms - Storage I/O: mean 23.1ms, P50 12.4ms, P95 89.2ms -``` - ---- - -## Unit Testing - -This package includes a comprehensive pytest-based test suite to verify core functionality without running the full benchmark. 
- -### Running Tests - -```bash -# Run all tests with verbose output -pytest test_kv_cache.py -v - -# Run with shorter traceback -pytest test_kv_cache.py -v --tb=short - -# Run specific test class -pytest test_kv_cache.py -k "TestModelConfig" -v - -# Run only CPU tests (skip GPU tests if no CUDA) -pytest test_kv_cache.py -v -m "not skipif" -``` - -### Test Coverage - -The test suite covers 12 component categories: - -| Test Class | Coverage | -|------------|----------| -| `TestModelConfig` | Model configurations, KV cache size calculations | -| `TestInferenceRequest` | Request dataclass, cache key generation | -| `TestQoSProfiles` | QoS levels, SLA targets, priorities | -| `TestKVCacheGenerator` | Determinism, shapes, dtypes, precomputed buffers | -| `TestCPUMemoryBackend` | Write/read/delete/clear operations | -| `TestNVMeBackend` | File I/O, metadata, temp directories | -| `TestGPUMemoryBackend` | CUDA tensors, device placement (skipped without GPU) | -| `TestConversationManager` | Multi-turn tracking, eviction | -| `TestUserSimulator` | User generation, QoS distribution | -| `TestMultiTierCache` | CPU-only mode, allocation, access | -| `TestMultiTierCacheWithGPU` | GPU tier, waterfall eviction (skipped without GPU) | -| `TestXLSXExport` | CSV/Excel export (skipped without pandas) | - -### Expected Runtime - -- **Without GPU**: ~3-5 seconds -- **With GPU**: ~5-10 seconds - -GPU tests are automatically skipped if CUDA is not available. - ---- - -## Excel Export - -The benchmark can export results directly to Excel or CSV format for analysis. 
- -### Basic Usage - -```bash -python3 kv-cache.py \ - --model llama3.1-8b \ - --num-users 50 \ - --duration 120 \ - --gpu-mem-gb 0 \ - --cpu-mem-gb 4 \ - --seed 42 \ - --output results.json \ - --xlsx-output results.xlsx -``` - -### Output Format - -The Excel file contains a single row with all key metrics: - -| Column | Description | -|--------|-------------| -| Model | Model configuration used | -| Num Users | Concurrent user count | -| Duration (s) | Benchmark duration | -| GPU Mem (GB) | GPU memory budget | -| CPU Mem (GB) | CPU memory budget | -| Total Requests | Requests completed | -| Total Tokens | Tokens processed | -| Avg Throughput (tok/s) | Wall-clock throughput | -| Storage Throughput (tok/s) | Storage I/O throughput | -| Cache Hit Rate | Percentage of cache hits | -| E2E Latency P95 (ms) | End-to-end 95th percentile | -| Storage IO P95 (ms) | Storage I/O 95th percentile | - -### Fallback Behavior - -- **With openpyxl**: Exports to `.xlsx` format -- **Without openpyxl**: Falls back to `.csv` format -- **Without pandas**: Export is skipped with a warning - ---- - -## MLPerf Submission Guidelines - -For official MLPerf v3.0 storage submissions, use these standardized commands. **These invocations have been validated through extensive discovery testing** (1,411 Fast system tests, 268 Slow system tests comparing 14,000 MB/s vs 3,000 MB/s storage). - -### Discovery Test Key Findings - -| Finding | Impact | -|---------|--------| -| **Metric selection depends on cpu_mem** | Storage Throughput shows only 1.1x at cpu_mem=0GB but 2.2x at cpu_mem=4GB | -| **Best models for differentiation** | llama3.1-8b and mistral-7b show 2.31x ratio | -| **High variance observed** | CV 50-125%, requires 3-5 trials minimum | -| **100% win rate metrics** | Decode Bytes Read and Wall-Clock Throughput at cpu_mem=0GB | - -### Option 1: Maximum Storage Stress (cpu_mem=0GB) - -Use when you want to stress test NVMe and measure I/O volume differentiation. 
- -**Primary Metrics:** Decode Bytes Read (2.62x differentiation), Wall-Clock Throughput (2.43x differentiation) - -```bash -# MLPerf v3.0: Maximum Storage Stress Test (8B Model) -# Run 3-5 trials for statistical significance -for trial in 1 2 3 4 5; do - python3 kv-cache.py \ - --model llama3.1-8b \ - --num-users 200 \ - --duration 300 \ - --gpu-mem-gb 0 \ - --cpu-mem-gb 0 \ - --max-concurrent-allocs 16 \ - --generation-mode none \ - --cache-dir /mnt/nvme \ - --seed 42 \ - --output mlperf_v3_stress_8b_trial${trial}.json -done -``` - -**⚠️ Important:** At cpu_mem=0GB, do NOT use Storage Throughput as your primary metric—use Decode Bytes Read or Wall-Clock Throughput instead. - -### Option 2: Storage Throughput Focus (cpu_mem=4GB) - -Use when you want Storage Throughput (tok/s) as your primary metric. - -**Primary Metric:** Storage Throughput (2.2x differentiation, 97% win rate) - -```bash -# MLPerf v3.0: Storage Throughput Test (8B Model) -for trial in 1 2 3 4 5; do - python3 kv-cache.py \ - --model llama3.1-8b \ - --num-users 100 \ - --duration 300 \ - --gpu-mem-gb 0 \ - --cpu-mem-gb 4 \ - --max-concurrent-allocs 0 \ - --generation-mode none \ - --cache-dir /mnt/nvme \ - --seed 42 \ - --output mlperf_v3_throughput_8b_trial${trial}.json -done -``` - -### Option 3: Large Model Submission (70B) - -For maximum per-request storage stress (10x larger KV cache per token): - -```bash -# MLPerf v3.0: Large Model Storage Stress -for trial in 1 2 3; do - python3 kv-cache.py \ - --model llama3.1-70b-instruct \ - --num-users 70 \ - --duration 300 \ - --gpu-mem-gb 0 \ - --cpu-mem-gb 0 \ - --max-concurrent-allocs 4 \ - --generation-mode none \ - --cache-dir /mnt/nvme \ - --seed 42 \ - --output mlperf_v3_stress_70b_trial${trial}.json -done -``` - -### Critical Parameters (Discovery-Validated) - -| Parameter | Value | Rationale | -|-----------|-------|-----------| -| **seed 42** | Required | Reproducibility across systems | -| **gpu-mem-gb 0** | Required | Isolates storage 
performance | -| **cpu-mem-gb** | 0 or 4 | 0GB for max stress (use I/O volume metrics), 4GB for Storage Throughput metric | -| **max-concurrent-allocs** | 0, 4, or 16 | 0 for throughput, 16 for stress testing | -| **generation-mode** | none or realistic | none for pure I/O, realistic for production simulation | -| **num-users** | 100-200 | Differentiation stable across range; higher = more throughput | -| **duration** | 300-600 | 5-10 minutes for stable metrics | - -### Trial Requirements - -| User Count | Variance (CV) | Minimum Trials | -|------------|---------------|----------------| -| 10 users | ~52% | 3 | -| 50-100 users | ~115-125% | 3-5 | -| 200 users | ~110-120% | 3-5 | - -Report **median** rather than mean for publication-quality results. - ---- - -## Troubleshooting - -### Out of Memory Errors - -Reduce the number of concurrent users or limit parallel allocations: - -```bash -python3 kv-cache.py ... --max-concurrent-allocs 50 -``` - -### Benchmark Hangs - -The system may be thrashing. Reduce users or increase memory budgets. - -### Poor Cache Hit Rates - -Low hit rates indicate your working set exceeds available fast memory. Either: -- Increase GPU/CPU memory budgets -- Reduce user count -- Accept that cold data will hit storage - -### Results Vary Between Runs - -Use the `--seed` flag for reproducible results. - ---- - -## Files in This Package - -- `kv-cache.py`: Main benchmark implementation with ShareGPT support -- `test_kv_cache.py`: Pytest unit test suite -- `requirements.txt`: Python dependencies -- `README.md`: This documentation -- `MLperf v3 KV cache proposal.md`: Detailed technical documentation - ---- - -## License - -Apache License 2.0 - ---- - -## Contact - -For questions or feedback, open an issue on the repository or contact the MLPerf Storage Working Group. +# MLPerf Storage KV Cache Benchmark + +A storage benchmarking tool for Large Language Model inference systems. 
This benchmark measures the performance of your storage subsystem under realistic KV cache offloading workloads, helping you answer critical questions about hardware capacity and configuration. + +**Author:** Hazem Awadallah, Kingston Digital +**License:** Apache 2.0 +**Version:** MLPerf Storage v3.0 (Enhanced) +**Updated:** January 27, 2026 + +--- + +## Table of Contents + +1. [What This Benchmark Does](#what-this-benchmark-does) +2. [Architecture Overview](#architecture-overview) +3. [System Requirements](#system-requirements) +4. [Installation](#installation) +5. [Configuration](#configuration) +6. [Quick Start](#quick-start) +7. [Running the Benchmark](#running-the-benchmark) +8. [ShareGPT Replay Workloads](#sharegpt-replay-workloads) +9. [Using the Wrapper Script](#using-the-wrapper-script) +10. [Understanding Results](#understanding-results) +11. [Unit Testing](#unit-testing) +12. [Excel Export](#excel-export) +13. [MLPerf Submission Guidelines](#mlperf-submission-guidelines) +14. [Troubleshooting](#troubleshooting) + +--- + +## What This Benchmark Does + +During LLM inference, models store intermediate attention data in a structure called the KV (Key-Value) cache. This cache grows with conversation length and can consume enormous amounts of memory. Production systems offload this cache from expensive GPU VRAM to cheaper CPU RAM or NVMe storage. + +This benchmark simulates that offloading behavior. It generates realistic multi-user inference workloads and measures how your storage performs under pressure. It measures these components: + +- How many concurrent users your hardware can support +- Whether your NVMe drive is fast enough to handle cache spillover +- The real latency impact of each storage tier +- Where the bottleneck sits in your system + +This is not a pass/fail test. It is a diagnostic tool for system architects and performance engineers. 
+ +--- + +## Architecture Overview + +The benchmark implements a three-tier memory hierarchy that mirrors production LLM serving systems. + +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ KV Cache Benchmark Architecture │ +└─────────────────────────────────────────────────────────────────────────────┘ + + ┌──────────────────┐ + │ User Requests │ + │ (Multi-tenant) │ + └────────┬─────────┘ + │ + ▼ + ┌──────────────────────────────────────┐ + │ Request Queue │ + │ (Priority-based: QoS levels) │ + │ Interactive > Responsive > Batch │ + └──────────────────┬───────────────────┘ + │ + ▼ + ┌────────────────────────────────────────────────────────┐ + │ IntegratedBenchmark │ + │ ┌─────────────┐ ┌─────────────┐ ┌─────────────────┐ │ + │ │ Prefill │ │ Decode │ │ Conversation │ │ + │ │ (Write) │ │ (Read) │ │ Manager │ │ + │ └──────┬──────┘ └──────┬──────┘ └────────┬────────┘ │ + └─────────┼────────────────┼─────────────────┼───────────┘ + │ │ │ + └────────────────┼─────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────────────┐ +│ MultiTierCache │ +│ (Waterfall LRU Eviction) │ +│ │ +│ New Data ─────► Always targets fastest available tier │ +│ If full, LRU entry cascades down │ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────┐ │ +│ │ │ │ +│ │ ┌───────────────┐ ┌───────────────┐ ┌───────────────┐ │ │ +│ │ │ GPU VRAM │ │ CPU RAM │ │ NVMe │ │ │ +│ │ │ (Tier 1) │─────►│ (Tier 2) │─────►│ (Tier 3) │ │ │ +│ │ │ │ LRU │ │ LRU │ │ │ │ +│ │ │ Sub-ms │evict │ Tens of ms │evict │ Hundreds │ │ │ +│ │ │ latency │ │ latency │ │ of ms │ │ │ +│ │ │ │ │ │ │ │ │ │ +│ │ │ PyTorch/CuPy │ │ NumPy arrays │ │ .npy files │ │ │ +│ │ │ tensors │ │ in memory │ │ on disk │ │ │ +│ │ └───────────────┘ └───────────────┘ └───────────────┘ │ │ +│ │ │ │ +│ │ ◄──── HOT DATA ────────────────────────────── COLD DATA ────► │ │ +│ │ │ │ +│ 
└─────────────────────────────────────────────────────────────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────────────────────┘ +``` + +### Key Components + +**MultiTierCache**: The core engine. It decides where to place data based on available space and access patterns. New data always targets the fastest tier. When that tier fills up, the least recently used entry gets pushed down to the next tier. + +**Inference Phases**: The benchmark models two distinct I/O patterns: +- **Prefill**: Write-heavy. Processing the user prompt generates new KV cache entries. +- **Decode**: Read-heavy. Generating each output token requires reading the existing cache. + +**User Simulation**: Creates realistic traffic from multiple concurrent users with different behaviors (chatbot, coding assistant, document analysis) and priority levels. + +**Autoscaler**: Automatically adjusts user load to find either the maximum users your system can handle (QoS mode) or the peak throughput of your storage (capacity mode). + +--- + +## System Requirements + +### Minimum + +- CPU: 8+ cores (AMD EPYC, Intel Xeon) +- RAM: 32 GB +- Storage: 256 GB free space on SSD +- OS: Linux (Ubuntu 22.04, RHEL 9, or similar) or Windows +- Python: 3.8 or higher +- No GPU required (runs in CPU-only mode) + +### Recommended + +- CPU: 32+ cores +- RAM: 128 GB or more +- GPU: NVIDIA A100/H100 with 40+ GB VRAM (optional but enables full three-tier testing) +- Storage: 1 TB+ on NVMe (PCIe Gen4 or Gen5) +- Tools: `bc`, `jq` for the wrapper script (Linux) + +--- + +## Installation + +1. Clone or download this repository. + +2. Install Python dependencies: + +```bash +pip install -r requirements.txt +``` + +Or install core dependencies manually: + +```bash +pip install numpy pyyaml +``` + +3. For GPU support (optional): + +```bash +pip install torch # or cupy-cuda12x for CuPy +``` + +4. For ShareGPT replay workloads (optional): + +```bash +pip install tiktoken +``` + +5. 
For Excel export (optional): + +```bash +pip install pandas openpyxl +``` + +6. Verify the installation: + +```bash +python3 kv-cache.py --help +``` + +--- + +## Configuration + +The benchmark supports a YAML configuration file (`config.yaml`) for tuning internal parameters without modifying the source code. This is the **recommended approach** for MLPerf submissions to ensure reproducibility. + +### Using the Configuration File + +```bash +python3 kv-cache.py --config config.yaml [other CLI arguments] +``` + +**Note:** CLI arguments always take precedence over config file values for overlapping settings. + +### Configuration File Parameters (config.yaml) + +The configuration file controls internal benchmark behavior that affects workload realism and cache dynamics. These settings are **not** exposed as CLI arguments to prevent accidental misconfigurations in MLPerf submissions. + +> **Tip:** For most benchmarking scenarios, the defaults are carefully tuned. Only modify these if you understand the impact on your results. + +--- + +#### User Templates + +Controls the three simulated user personas. Each persona has distinct characteristics that model real-world usage patterns. + +| Persona | Behavior | Use Case | +|---------|----------|----------| +| **Chatbot** | Short prompts, quick responses, fast iteration | Customer service bots, casual conversation | +| **Coding** | Medium prompts with code context, moderate responses | IDE assistants, code completion | +| **Document** | Long prompts with full documents, lengthy analysis | Document summarization, legal/medical analysis | + +| Parameter | Type | Default | Impact | +|-----------|------|---------|--------| +| `user_templates.chatbot.context_range` | [min, max] | [256, 1024] | **KV cache write size per request.** Smaller values reduce storage pressure; larger values stress NVMe throughput. 
| +| `user_templates.chatbot.generation_range` | [min, max] | [50, 150] | **Decode phase duration.** More tokens = more cache reads per request. Affects read/write ratio. | +| `user_templates.chatbot.think_time_range` | [min, max] | [0.1, 0.5] | **Request inter-arrival time.** Shorter = higher request rate, more concurrent cache operations. | +| `user_templates.coding.context_range` | [min, max] | [1024, 4096] | Medium-length contexts typical of code completion scenarios. 4× larger than chatbot. | +| `user_templates.coding.generation_range` | [min, max] | [100, 500] | Code generation often produces longer outputs than conversational AI. | +| `user_templates.coding.think_time_range` | [min, max] | [0.2, 1.0] | Developers pause to review generated code before next request. | +| `user_templates.document.context_range` | [min, max] | [2048, 8192] | **Stress test scenarios.** 8K tokens creates ~1 GB of total KV cache data for 8B models (128 KB/token × 8,192 tokens). | +| `user_templates.document.generation_range` | [min, max] | [200, 800] | Long-form analysis outputs (summaries, reports). | +| `user_templates.document.think_time_range` | [min, max] | [0.3, 1.5] | Users read lengthy outputs before continuing. | + +--- + +#### Token Generation Timing + +Simulates GPU compute time per generated token. This controls the backpressure on the storage system. + +| Mode | Default (sec/token) | When to Use | +|------|---------------------|-------------| +| `none` | 0.0 | **Pure storage benchmarking.** 100% of measured latency is I/O. Use for MLPerf storage submissions. | +| `fast` | 0.002 (2ms) | Simulates high-end GPU (H100) with optimized inference. Creates light backpressure. | +| `realistic` | 0.030 (30ms) | Simulates typical production GPU throughput. Balances compute/storage for end-to-end analysis. | + +**Why it matters:** With `generation_mode=none`, the benchmark hammers storage as fast as possible. 
With `realistic`, storage has time to absorb writes between decode steps, showing how your system performs under sustained (not burst) load. + +--- + +#### QoS Profiles (Quality of Service) + +Defines SLA targets for multi-tenant request prioritization. The benchmark tracks violations against these thresholds. + +| Profile | Typical Use Case | Priority | +|---------|------------------|----------| +| **Interactive** | Live chat UIs, real-time assistants | Highest (3) | +| **Responsive** | API calls, near-real-time processing | Medium (2) | +| **Batch** | Overnight jobs, bulk processing | Lowest (1) | + +| Parameter | Default | Meaning | +|-----------|---------|---------| +| `qos_profiles.interactive.target_latency_p95_ms` | 50 | 95% of interactive requests must complete within 50ms. Aggressive target for premium users. | +| `qos_profiles.interactive.target_latency_p99_ms` | 100 | 99% within 100ms. Allows some slack for tail latency. | +| `qos_profiles.interactive.target_latency_p999_ms` | 150 | 99.9% (3 nines) within 150ms. Production SLOs often specify this level. | +| `qos_profiles.interactive.target_latency_p9999_ms` | 200 | 99.99% (4 nines) within 200ms. Critical for detecting storage-induced tail latency. | +| `qos_profiles.interactive.priority` | 3 | Highest priority. These requests are dequeued first. | +| `qos_profiles.responsive.target_latency_p95_ms` | 100 | 2× the interactive target. Acceptable for API consumers. | +| `qos_profiles.responsive.target_latency_p99_ms` | 200 | 99% within 200ms. | +| `qos_profiles.responsive.target_latency_p999_ms` | 350 | 99.9% within 350ms. | +| `qos_profiles.responsive.target_latency_p9999_ms` | 500 | 99.99% within 500ms. | +| `qos_profiles.responsive.priority` | 2 | Medium priority. | +| `qos_profiles.batch.target_latency_p95_ms` | 1000 | 1 second. Batch jobs are latency-tolerant. | +| `qos_profiles.batch.target_latency_p99_ms` | 5000 | 5 seconds. Acceptable for offline processing. 
|
+| `qos_profiles.batch.target_latency_p999_ms` | 7500 | 7.5 seconds. |
+| `qos_profiles.batch.target_latency_p9999_ms` | 10000 | 10 seconds. Even worst-case should complete eventually. |
+| `qos_profiles.batch.priority` | 1 | Lowest priority. Processed when interactive/responsive queues are empty. |
+
+> **Research Basis for QoS Targets** (see [sources.md](sources.md) for full citations):
+> - **Interactive (50ms P95, 100ms P99)**: Based on Nielsen Norman Group's 0.1s "instant" threshold, Google RAIL <100ms response target, and observed production LLM APIs (Anthropic Claude TTFT: 50–150ms).
+> - **Responsive (100ms P95, 200ms P99)**: Based on Google Core Web Vitals FID <100ms "good" threshold, INP ≤200ms target, and Vercel Edge Functions P99 <200ms.
+> - **Batch (1000ms P95, 5000ms P99)**: Based on AWS ALB healthy target <1s, and research showing batch workloads tolerate >1s latency ([Splitwise paper](https://arxiv.org/abs/2311.18677): 80% of production requests need <200ms).
+>
+> **Note:** MLPerf Inference v4.0–v5.0 defines Server/Offline scenarios but does **not** prescribe specific P95/P99 latency SLAs. These targets represent industry best practices, not MLPerf requirements.
+
+---
+
+#### QoS Distribution
+
+Controls the probability mix of request priorities in the simulated workload.
+
+| Parameter | Default | Effect |
+|-----------|---------|--------|
+| `interactive_probability` | 0.15 | 15% of requests are INTERACTIVE. Increase to stress-test low-latency paths. |
+| `responsive_threshold` | 0.50 | Cumulative cutoff: requests that are not INTERACTIVE but fall below 0.50 become RESPONSIVE — that is 35% of all requests (50% - 15%). The rest are BATCH. |
+
+**Example distribution with defaults:** 15% Interactive, 35% Responsive, 50% Batch.
+
+---
+
+#### Eviction Settings
+
+Controls the waterfall LRU eviction algorithm that moves cold data down the tier hierarchy (GPU → CPU → NVMe).
+ +| Parameter | Default | Purpose | +|-----------|---------|---------| +| `max_recursion_depth` | 10 | **Safety limit.** Prevents infinite cascading evictions. If you hit this limit, your tiers are severely undersized. | +| `target_usage_ratio` | 0.8 | **Tier headroom.** Keeps each tier at 80% capacity, leaving 20% buffer for burst writes. Lower values = more headroom, fewer evictions. | +| `large_entry_limit_ratio` | 0.95 | **Skip-tier threshold.** If a single entry exceeds 95% of tier capacity, skip directly to the next tier. Prevents tier thrashing with huge entries. | +| `max_evictions_hard_cap` | 5000 | **Absolute safety limit.** Stops eviction loop after 5000 entries regardless of space needs. Prevents runaway eviction under pathological conditions. | +| `max_evictions_min` | 1000 | **Minimum eviction budget.** Ensures the algorithm tries at least 1000 evictions before giving up. Helps with large-model scenarios where many small entries must be evicted. | + +**Tuning guidance:** If you see "Hit recursion limit" warnings, increase `max_recursion_depth`. If evictions dominate your latency, reduce `target_usage_ratio` to provide more headroom. + +--- + +#### GPU Backend Settings + +Controls GPU VRAM allocation and out-of-memory (OOM) recovery behavior. + +| Parameter | Default | Purpose | +|-----------|---------|---------| +| `memory_fraction` | 0.9 | **VRAM budget.** Uses 90% of GPU memory, reserving 10% for framework overhead and other processes. | +| `max_eviction_attempts` | 100 | **OOM recovery limit.** On CUDA OOM, attempts up to 100 evictions to free space before failing the write. | +| `free_memory_threshold` | 0.1 | **Proactive eviction trigger.** When free GPU memory drops below 10%, begin evicting to CPU before OOM occurs. | + +**Note:** These settings only apply when `--gpu-mem-gb > 0` and PyTorch/CuPy is available. + +--- + +#### Prefix Cache Settings + +Controls hierarchical prefix caching for system prompts (e.g., "You are a helpful assistant"). 
+ +| Parameter | Default | Purpose | +|-----------|---------|---------| +| `min_prefix_length` | 50 | **Minimum tokens for caching.** Prefixes shorter than 50 tokens aren't worth the overhead of caching. | +| `max_prefix_entries` | 1000 | **Prefix cache capacity.** LRU eviction kicks in when this limit is reached. Higher values consume more memory but improve hit rates. | +| `system_prompt_hit_probability` | 0.2 | **Simulation realism.** 20% of requests share a common system prompt. Increase to model deployments with standardized prompts (e.g., corporate assistants). | + +**Impact:** Higher `system_prompt_hit_probability` → higher cache hit rates → lower storage throughput (because prefixes are reused). Use 0.0 for pure storage stress testing. + +--- + +#### RAG Settings + +Controls Retrieval-Augmented Generation workload simulation, where external documents are injected into the context. + +| Parameter | Default | Purpose | +|-----------|---------|---------| +| `chunk_size_tokens` | 512 | **Document chunk granularity.** Each document is split into 512-token chunks for independent caching. Smaller chunks = more cache entries, higher metadata overhead. | +| `top_k_chunks` | 5 | **Retrieval depth.** Number of chunks retrieved per RAG query. More chunks = larger context window = more KV cache I/O. | +| `max_chunk_bytes` | 268435456 | **256 MB per chunk.** Safety limit to prevent single chunks from consuming entire tiers. Particularly important for 70B models where 512 tokens ≈ 160 MB of KV cache (320 KB/token). | + +**When to enable RAG:** Use `--enable-rag` when benchmarking systems designed for document-heavy workloads (legal, medical, enterprise search). + +--- + +#### Conversation Settings + +Controls multi-turn conversation simulation, modeling how chatbot context accumulates across turns. 
+ +| Parameter | Default | Purpose | +|-----------|---------|---------| +| `max_conversations` | 1000 | **Concurrent conversation limit.** LRU eviction removes oldest conversations when this limit is hit. Higher values = more memory for conversation metadata. | +| `max_turns_per_conv` | 50 | **Conversation depth limit.** After 50 turns, the conversation resets. Prevents unbounded context growth in long-running benchmarks. | +| `end_conversation_probability` | 0.2 | **Conversation turnover rate.** 20% chance each turn ends the conversation. Lower values = longer conversations = more cache reuse. | + +**Impact on metrics:** Higher `max_turns_per_conv` and lower `end_conversation_probability` increase cache hit rates (context reuse). Use low values for stress testing (force cache misses). + +--- + +#### Autoscaler Settings + +Controls the workload autoscaler that discovers system saturation points. + +| Parameter | Default | Purpose | +|-----------|---------|---------| +| `min_users` | 1 | **Lower bound.** Autoscaler won't go below 1 user. | +| `max_users` | 10000 | **Upper bound.** Autoscaler stops scaling up at 10,000 users. Prevents runaway resource consumption. | +| `scale_up_factor` | 1.2 | **Growth rate.** Increases users by 20% each scaling action (e.g., 100 → 120 → 144). | +| `scale_down_factor` | 0.8 | **Decay rate.** Decreases users by 20% when SLAs are violated (e.g., 100 → 80 → 64). | +| `consecutive_samples_required` | 2 | **Stability requirement.** Requires 2 consecutive samples agreeing on direction before scaling. Prevents oscillation from transient spikes. | + +**QoS mode vs Capacity mode:** In QoS mode, the autoscaler maximizes users while maintaining latency SLAs. In Capacity mode, it maximizes throughput regardless of latency. + +--- + +#### Decode Phase Settings + +Controls token generation batching during the decode (read-heavy) phase. 
+ +| Parameter | Default | Purpose | +|-----------|---------|---------| +| `batch_size` | 32 | **Decode batch granularity.** Reads 32 tokens worth of KV cache per decode operation. Larger batches amortize I/O overhead but require more memory. | + +--- + +#### ShareGPT Dataset Settings + +Controls loading and processing of real ShareGPT conversation data. + +| Parameter | Default | Purpose | +|-----------|---------|---------| +| `max_context_tokens` | 8192 | **Context truncation.** Conversations longer than 8192 tokens are truncated. Prevents OOM with very long conversations. | +| `max_generation_tokens` | 2048 | **Generation truncation.** Caps simulated generation at 2048 tokens per turn. | +| `chars_per_token_estimate` | 4 | **Tokenization heuristic.** Used when tiktoken is unavailable. 4 chars/token is typical for English text. | + +--- + +#### Saturation Detection Thresholds + +Controls when the StorageMonitor considers the storage subsystem saturated. + +| Parameter | Default | Purpose | +|-----------|---------|---------| +| `read_latency_p95_threshold_ms` | 100 | **Read saturation signal.** If P95 read latency exceeds 100ms, storage is considered stressed. | +| `write_latency_p95_threshold_ms` | 50 | **Write saturation signal.** Writes are more sensitive; 50ms threshold triggers concern earlier. | +| `queue_depth_threshold` | 100 | **Queue pressure signal.** More than 100 pending requests indicates backlog is building. | +| `history_window_size` | 10 | **Trend analysis window.** Uses last 10 samples to detect latency trends (increasing = saturation). | + +**Used by:** The autoscaler uses these thresholds to decide when to scale down (in QoS mode) or when peak throughput is reached (in capacity mode). + +--- + +#### Validation Limits + +Safety limits enforced by `validate_args()` to prevent accidental misconfigurations. 
+ +| Parameter | Default | Rationale | +|-----------|---------|-----------| +| `max_users` | 100000 | Reasonable upper bound for simulated users. Prevents accidental `--num-users 1000000`. | +| `max_duration_seconds` | 86400 | 24 hours maximum. Prevents runaway benchmarks that run forever. | +| `max_gpu_memory_gb` | 1024 | 1 TB. Covers even the largest GPU clusters (8× H100 80GB = 640GB). | +| `max_cpu_memory_gb` | 16384 | 16 TB. Covers high-memory server configurations. | + +--- + +## Quick Start + +Run a basic storage test with 50 users for 2 minutes: + +```bash +python3 kv-cache.py \ + --config config.yaml \ + --model llama3.1-8b \ + --num-users 50 \ + --duration 120 \ + --gpu-mem-gb 0 \ + --cpu-mem-gb 4 \ + --generation-mode realistic \ + --cache-dir /mnt/nvme \ + --seed 42 \ + --output results.json +``` + +This forces all cache operations to hit your NVMe drive, giving you a baseline measurement of storage performance. + +--- + +## Running the Benchmark + +### CLI-Only Arguments + +These arguments **must** be passed via command line (not configurable in config.yaml): + +| Argument | Type | Default | Required | Description | +|----------|------|---------|----------|-------------| +| `--config` | str | None | No | Path to YAML configuration file | +| `--log-level` | str | INFO | No | Logging level: DEBUG, INFO, WARNING, ERROR, CRITICAL | +| `--model` | str | llama3.1-8b | Yes | Model config: tiny-1b, mistral-7b, llama2-7b, llama3.1-8b, llama3.1-70b-instruct | +| `--num-users` | int | 100 | Yes | Number of concurrent users to simulate | +| `--duration` | int | 60 | Yes | Benchmark duration in seconds | +| `--gpu-mem-gb` | float | 16 | Yes | GPU VRAM budget in GB (0 to disable) | +| `--cpu-mem-gb` | float | 32 | Yes | CPU RAM budget in GB | +| `--cache-dir` | str | temp | No | Directory for NVMe cache files | +| `--generation-mode` | str | realistic | No | Token generation: none, fast, realistic | +| `--performance-profile` | str | latency | No | Pass/fail 
criteria: latency, throughput | +| `--disable-multi-turn` | flag | False | No | Disable multi-turn conversation caching | +| `--disable-prefix-caching` | flag | False | No | Disable prefix caching | +| `--enable-rag` | flag | False | No | Enable RAG workload simulation | +| `--rag-num-docs` | int | 10 | No | Number of RAG documents to ingest | +| `--enable-autoscaling` | flag | False | No | Enable workload autoscaling | +| `--autoscaler-mode` | str | qos | No | Autoscaling strategy: qos, capacity | +| `--target-saturation` | float | 0.8 | No | Target storage saturation (0.0-1.0) | +| `--use-burst-trace` | flag | False | No | Use BurstGPT trace for workload | +| `--burst-trace-path` | str | BurstGPT/... | No | Path to BurstGPT trace file | +| `--validation-trace` | str | None | No | Path to validation trace file | +| `--dataset-path` | str | None | No | Path to ShareGPT dataset JSON | +| `--max-conversations` | int | 500 | No | Max conversations from dataset | +| `--output` | str | auto | No | Output JSON file path | +| `--seed` | int | None | **MLPerf** | Random seed for reproducibility | +| `--max-concurrent-allocs` | int | 0 | No | Limit concurrent allocations (0=unlimited) | +| `--request-rate` | float | 0 | No | Target request rate (req/sec, 0=unlimited) | +| `--max-requests` | int | 0 | No | Stop after N requests (0=use duration) | +| `--xlsx-output` | str | None | No | Excel/CSV output file path | + +### Test Scenarios + +#### Scenario 1: Storage-Only Baseline + +Isolate your NVMe drive by setting GPU memory to zero. This tells you the raw performance of your storage. 
+ +```bash +python3 kv-cache.py \ + --config config.yaml \ + --model llama3.1-8b \ + --num-users 50 \ + --duration 180 \ + --gpu-mem-gb 0 \ + --cpu-mem-gb 4 \ + --generation-mode realistic \ + --cache-dir /mnt/nvme \ + --seed 42 \ + --output results_storage_only.json +``` + +#### Scenario 2: Realistic Production Setup + +Test a balanced three-tier configuration that mirrors production deployment. + +```bash +python3 kv-cache.py \ + --config config.yaml \ + --model llama3.1-8b \ + --num-users 100 \ + --duration 300 \ + --gpu-mem-gb 16 \ + --cpu-mem-gb 32 \ + --generation-mode realistic \ + --cache-dir /mnt/nvme \ + --seed 42 \ + --output results_production.json +``` + +#### Scenario 3: Find Maximum User Count (QoS Mode) + +Let the autoscaler discover how many users your system can handle while maintaining acceptable latency. + +```bash +python3 kv-cache.py \ + --config config.yaml \ + --model llama3.1-8b \ + --num-users 20 \ + --duration 300 \ + --gpu-mem-gb 16 \ + --cpu-mem-gb 32 \ + --enable-autoscaling \ + --autoscaler-mode qos \ + --generation-mode realistic \ + --cache-dir /mnt/nvme \ + --seed 42 \ + --output results_autoscale_qos.json +``` + +#### Scenario 4: Find Peak Storage Throughput (Capacity Mode) + +Discover the absolute maximum I/O your storage can deliver by ignoring latency constraints. + +```bash +python3 kv-cache.py \ + --config config.yaml \ + --model llama3.1-70b-instruct \ + --num-users 10 \ + --duration 180 \ + --gpu-mem-gb 0 \ + --cpu-mem-gb 4 \ + --enable-autoscaling \ + --autoscaler-mode capacity \ + --generation-mode none \ + --cache-dir /mnt/nvme \ + --seed 42 \ + --output results_capacity.json +``` + +#### Scenario 5: Low Cache Hit Rate (Maximum Storage Stress) + +Force cache misses to maximize NVMe I/O pressure. This is useful for stress testing storage subsystems and measuring worst-case performance. 
+ +**Key flags to lower cache hit rate:** +- `--disable-multi-turn`: Each request is independent (no conversation context reuse) +- `--disable-prefix-caching`: No system prompt caching (every request generates fresh KV cache) +- `--cpu-mem-gb 0`: No CPU tier buffer (all evictions go directly to NVMe) +- High user count with synthetic workload: More unique cache entries + +```bash +# Minimal caching - forces nearly all operations to hit NVMe +python3 kv-cache.py \ + --config config.yaml \ + --model llama3.1-8b \ + --num-users 200 \ + --duration 180 \ + --gpu-mem-gb 0 \ + --cpu-mem-gb 0 \ + --disable-multi-turn \ + --disable-prefix-caching \ + --generation-mode none \ + --cache-dir /mnt/nvme \ + --seed 42 \ + --output results_low_hit_rate.json +``` + +**Expected results:** Cache hit rate drops to 10-30% (vs 50-70% with defaults, or 85-97% with ShareGPT). + +For even more aggressive stress testing with the 70B model (2.5× larger KV cache per token): + +```bash +# Maximum NVMe stress - 70B model with no caching +python3 kv-cache.py \ + --config config.yaml \ + --model llama3.1-70b-instruct \ + --num-users 50 \ + --duration 180 \ + --gpu-mem-gb 0 \ + --cpu-mem-gb 0 \ + --disable-multi-turn \ + --disable-prefix-caching \ + --generation-mode none \ + --cache-dir /mnt/nvme \ + --seed 42 \ + --output results_70b_low_hit_rate.json +``` + +| Configuration | Typical Cache Hit Rate | Use Case | +|---------------|------------------------|----------| +| ShareGPT + defaults | 85-97% | Realistic production simulation | +| Synthetic + defaults | 50-70% | Balanced stress testing | +| `--disable-multi-turn` only | 30-50% | Moderate stress | +| `--disable-multi-turn --disable-prefix-caching` | 10-30% | Maximum NVMe stress | +| Above + `--cpu-mem-gb 0` | 5-15% | Worst-case storage scenario | + +--- + +## ShareGPT Replay Workloads + +While synthetic workloads are excellent for controlled stress testing, they may not capture the nuances of real human-AI interaction. 
The **ShareGPT Replay** feature addresses this by loading actual conversation data. + +### Why Use ShareGPT? + +Real conversations exhibit different patterns than synthetic workloads: +- **Higher cache locality**: Users ask follow-up questions, reusing context +- **Variable context sizes**: Real queries vary wildly (10-16,000 tokens) +- **Multi-turn structure**: Conversation flows are preserved + +### Downloading the ShareGPT Dataset + +Download the full dataset from Hugging Face (~1.2 GB): + +```bash +wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json +``` + +**Alternative: Smaller subset for quick testing (~40 MB):** + +```bash +wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split_no_imsorry.json +``` + +### Basic ShareGPT Invocation + +```bash +python3 kv-cache.py \ + --config config.yaml \ + --model llama3.1-8b \ + --dataset-path ShareGPT_V3_unfiltered_cleaned_split.json \ + --max-conversations 500 \ + --num-users 50 \ + --duration 300 \ + --gpu-mem-gb 0 \ + --cpu-mem-gb 4 \ + --generation-mode realistic \ + --cache-dir /mnt/nvme \ + --seed 42 \ + --output results_sharegpt.json +``` + +### ShareGPT with Rate Limiting + +Control the request arrival rate for steady-state testing: + +```bash +python3 kv-cache.py \ + --config config.yaml \ + --model llama3.1-70b-instruct \ + --dataset-path ShareGPT_V3_unfiltered_cleaned_split.json \ + --max-conversations 1000 \ + --request-rate 10.0 \ + --num-users 100 \ + --duration 600 \ + --gpu-mem-gb 0 \ + --cpu-mem-gb 8 \ + --generation-mode none \ + --cache-dir /mnt/nvme \ + --seed 42 \ + --output results_sharegpt_rate_limited.json +``` + +### ShareGPT with Fixed Request Count + +Run exactly N requests for reproducible benchmarks: + +```bash +python3 kv-cache.py \ + --config config.yaml \ + --model llama3.1-8b \ + --dataset-path ShareGPT_V3_unfiltered_cleaned_split.json 
\ + --max-requests 5000 \ + --num-users 50 \ + --gpu-mem-gb 0 \ + --cpu-mem-gb 4 \ + --generation-mode realistic \ + --cache-dir /mnt/nvme \ + --seed 42 \ + --output results_sharegpt_fixed.json +``` + +### Comparing Real vs Synthetic Workloads + +| Metric | ShareGPT (Real) | Synthetic (Random) | +| :--- | :--- | :--- | +| Mean Context Size | ~133 tokens | ~2,676 tokens | +| Cache Hit Rate | 85-97% | 50-70% | +| Multi-turn Locality | High | Medium | +| Throughput | Higher | Lower | +| NVMe Stress | Moderate | Extreme | + +**Use ShareGPT** when you want to model real chatbot/assistant usage. +**Use Synthetic** when you want worst-case stress testing or controlled experiments. + +--- + +## Using the Wrapper Script + +The `kv-cache-wrapper.sh` script automates a complete benchmark suite. It detects your hardware, calculates appropriate parameters, and runs multiple test scenarios. + +### Basic Usage + +```bash +./kv-cache-wrapper.sh +``` + +This runs all test scenarios with default settings. Expect roughly 30 minutes for the full suite. 
+ +### Options + +``` +./kv-cache-wrapper.sh [options] + + -m MODEL Model to benchmark (default: llama3.1-8b) + -t SECONDS Duration for tier comparison tests (default: 120) + -s SECONDS Duration for storage saturation test (default: 180) + -r SECONDS Duration for production test (default: 180) + -a SECONDS Duration for autoscaling tests (default: 300) + -w LIST Comma-separated list of workloads to run + -u USERS Override baseline user count + -U USERS Override high-load user count + -R Enable RAG workload + -D DOCS Number of RAG documents (default: 10) + -h Show help +``` + +### Available Workloads + +```bash +# Run only the storage isolation test +./kv-cache-wrapper.sh -w storage-only + +# Run production and autoscaling tests +./kv-cache-wrapper.sh -w production,autoscale + +# Run MLPerf submission tests +./kv-cache-wrapper.sh -w mlperf_submission +``` + +--- + +## Understanding Results + +### Key Metrics + +**Throughput (tokens/sec)**: How many tokens the system processes per second. Higher is better. + +**Storage Throughput (tokens/sec)**: Raw I/O performance calculated from storage latency, not wall-clock time. This is the fairer metric for comparing storage tiers. + +**End-to-End Latency**: Total time from request submission to completion. This is what users experience. + +**Storage I/O Latency**: Time spent reading from and writing to storage tiers. This measures your hardware. + +**Queue Wait Time**: Time requests spend waiting before processing begins. If this dominates, your system is overloaded. + +**Cache Hit Rate**: Percentage of reads served from cache. Higher rates mean less storage pressure. 
+ +### Reading the Output + +``` +### STORAGE PERFORMANCE ASSESSMENT: PASS ### + Criteria Passed: 4/4 + [PASS] NVMe Write P95 < 500ms: 45.20ms + [PASS] NVMe Read P95 < 200ms: 123.45ms + [PASS] CPU RAM P95 < 150ms: 12.30ms + [PASS] Cache Hit Rate > 30%: 67.5% + +### OVERALL PERFORMANCE ### + Total Requests: 2847 + Total Tokens Generated: 489,231 + Avg Throughput: 1,630.77 tok/s + Storage Throughput: 2,105.32 tok/s + +### LATENCY BREAKDOWN ### + End-to-End: mean 89.3ms, P50 45.2ms, P95 312.4ms + Storage I/O: mean 23.1ms, P50 12.4ms, P95 89.2ms +``` + +--- + +## Understanding Excel Performance Metrics + +The `--xlsx-output` option exports detailed performance metrics to Excel for analysis. This section provides a comprehensive reference for every metric in the export. + +### Run Parameters (Configuration) + +These columns record the benchmark configuration used for the run: + +| Column | Description | +|--------|-------------| +| **Timestamp** | When the benchmark was executed (YYYY-MM-DD HH:MM:SS) | +| **Model** | Model configuration key (e.g., `llama3.1-8b`, `llama3.1-70b-instruct`) | +| **Num Users** | Number of concurrent simulated users | +| **Duration (s)** | Benchmark duration in seconds | +| **GPU Memory (GB)** | GPU VRAM budget allocated | +| **CPU Memory (GB)** | CPU RAM budget allocated | +| **Generation Mode** | Token generation simulation: `none`, `fast`, or `realistic` | +| **Performance Profile** | Pass/fail criteria: `latency` or `throughput` | +| **Multi-turn** | Whether multi-turn conversation caching was enabled | +| **Prefix Caching** | Whether system prompt prefix caching was enabled | +| **RAG Enabled** | Whether RAG workload simulation was enabled | +| **Autoscaling** | Whether workload autoscaling was enabled | +| **Seed** | Random seed for reproducibility | +| **Max Concurrent Allocs** | Limit on parallel cache allocations (0 = unlimited) | +| **Request Rate** | Target request rate in req/sec (0 = unlimited) | +| **Max Requests** | Stop after 
N requests (0 = use duration) | +| **Dataset Path** | Path to ShareGPT dataset if used | +| **Cache Dir** | Directory used for NVMe cache files | + +--- + +### Throughput Metrics + +| Metric | Unit | What It Measures | Interpretation | +|--------|------|------------------|----------------| +| **Total Requests** | count | Total inference requests completed | Higher = more work done. Compare across runs with same duration. | +| **Total Tokens** | count | Total tokens generated across all requests | Primary workload volume indicator. | +| **Elapsed Time (s)** | seconds | Actual wall-clock benchmark duration | May differ slightly from configured duration. | +| **Avg Throughput (tok/s)** | tokens/sec | `Total Tokens / Elapsed Time` | **Wall-clock throughput.** Includes all overheads (queue wait, generation simulation). **Primary metric when `gpu_mem=0` and `cpu_mem=0`.** | +| **Storage Throughput (tok/s)** | tokens/sec | `Total Tokens / Total Storage I/O Time` | **Pure storage throughput.** Excludes generation simulation time. Useful when `cpu_mem > 0` to isolate storage I/O. | +| **Requests/sec** | req/sec | `Total Requests / Elapsed Time` | Request processing rate. Higher = system handling more concurrent users efficiently. | + +> **Which throughput metric to use?** +> - **When `gpu_mem=0` and `cpu_mem=0`**: Use **Avg Throughput (tok/s)** — all I/O hits the storage tier, so wall-clock throughput directly reflects storage performance. +> - **When `cpu_mem > 0`**: Use **Storage Throughput (tok/s)** to isolate storage I/O from CPU cache hits. +> - **For MLPerf submissions**: Use **Tier Storage Read/Write Bandwidth (GB/s)** as the primary comparison metric (see below). + +--- + +### End-to-End Latency Metrics + +End-to-end (E2E) latency measures the total time from request submission to completion, including queue wait, cache operations, and simulated generation time. 
**This is what users experience.** + +| Metric | What It Measures | +|--------|------------------| +| **E2E Latency Mean (ms)** | Average latency across all requests. Sensitive to outliers. | +| **E2E Latency P50 (ms)** | Median latency. 50% of requests complete within this time. | +| **E2E Latency P95 (ms)** | 95th percentile. 95% of requests complete within this time. **Standard SLA metric.** | +| **E2E Latency P99 (ms)** | 99th percentile. 99% of requests complete within this time. **Tail latency indicator.** | +| **E2E Latency P99.9 (ms)** | 99.9th percentile (3 nines). Captures rare slow requests. | +| **E2E Latency P99.99 (ms)** | 99.99th percentile (4 nines). Extreme tail latency for SLA compliance. | + +> **Interpreting percentiles:** +> - **P50** tells you the typical user experience. +> - **P95** is the standard for SLA definitions ("95% of requests under X ms"). +> - **P99–P99.99** reveal tail latency issues that affect a small but real fraction of users. +> - Large gaps between P95 and P99 indicate inconsistent performance (investigate queue buildup or storage saturation). + +--- + +### Storage I/O Latency Metrics + +Storage latency measures only the time spent on cache read/write operations, excluding queue wait and generation simulation. **This isolates storage subsystem performance.** + +| Metric | What It Measures | +|--------|------------------| +| **Storage Latency Mean (ms)** | Average storage I/O time across all operations. | +| **Storage Latency P50 (ms)** | Median storage I/O time. | +| **Storage Latency P95 (ms)** | 95th percentile storage I/O time. **Key metric for storage evaluation.** | +| **Storage Latency P99 (ms)** | 99th percentile storage I/O time. | +| **Storage Latency P99.9 (ms)** | 99.9th percentile storage I/O time. | +| **Storage Latency P99.99 (ms)** | 99.99th percentile storage I/O time. | + +--- + +### Generation Latency Metrics + +Generation latency measures the simulated GPU token generation time. 
Only meaningful when `--generation-mode` is `fast` or `realistic`. + +| Metric | What It Measures | +|--------|------------------| +| **Gen Latency Mean (ms)** | Average simulated generation time per request. | +| **Gen Latency P50 (ms)** | Median generation time. | +| **Gen Latency P95 (ms)** | 95th percentile generation time. | +| **Gen Latency P99 (ms)** | 99th percentile generation time. | + +> **Note:** With `--generation-mode none`, these values are all 0 (pure storage benchmark). + +--- + +### Storage Tier Latency Breakdown (PRIMARY METRICS) + +These metrics provide granular visibility into storage tier operations. The "storage" tier is device-agnostic—it could be NVMe, SATA SSD, CXL memory, or any block storage device. Each operation is decomposed into: + +- **Total**: Complete operation time (Host + Device) +- **Device**: Actual storage I/O time (`np.save`/`np.load` with fsync) — **PRIMARY LATENCY METRIC** +- **Host**: CPU serialization/deserialization time + +> **⭐ PRIMARY METRICS for MLPerf Storage Comparison:** +> - **Storage Tier Read Device P95 (ms)** — Raw storage read latency +> - **Storage Tier Write Device P95 (ms)** — Raw storage write latency +> - **Tier Storage Read Bandwidth (GB/s)** — Storage read throughput +> - **Tier Storage Write Bandwidth (GB/s)** — Storage write throughput +> +> **What Device Latency Measures:** +> ``` +> Device Latency = [ OS/FS Queue ] + [ Block Layer ] + [ Driver ] + [ Physical I/O ] +> ``` +> The **Storage Tier Read Device P95 (ms)** is the 95th percentile latency of reading one `.npy` file containing the KV cache data for a single cache entry (one request's token sequence). This captures tail latency—95% of reads complete faster than this value, so it reveals worst-case storage behavior under load. 
+ +#### Read Operations (Decode Phase) + +| Metric | Component | What It Measures | +|--------|-----------|------------------| +| **Storage Tier Read Total P50–P99.99 (ms)** | Total | Complete read time including deserialization | +| **Storage Tier Read Device P50–P99.99 (ms)** | Device | **⭐ Raw storage read time (`np.load`) — PRIMARY** | +| **Storage Tier Read Host P50–P99.99 (ms)** | Host | NumPy array deserialization CPU time | + +#### Write Operations (Prefill Phase) + +| Metric | Component | What It Measures | +|--------|-----------|------------------| +| **Storage Tier Write Total P50–P99.99 (ms)** | Total | Complete write time including serialization | +| **Storage Tier Write Device P50–P99.99 (ms)** | Device | **⭐ Raw storage write time (`np.save` + fsync) — PRIMARY** | +| **Storage Tier Write Host P50–P99.99 (ms)** | Host | NumPy array serialization CPU time | + +> **Diagnosing storage bottlenecks:** +> - If **Device >> Host**: Your storage device is the bottleneck. Consider faster storage (NVMe Gen5, CXL). +> - If **Host >> Device**: CPU serialization is the bottleneck. Consider faster CPU or memory bandwidth. +> - Typical ratio: Device should be 60-80% of Total for well-balanced systems. + +--- + +### Cache Statistics + +| Metric | Unit | What It Measures | Good Values | +|--------|------|------------------|-------------| +| **Cache Hit Rate** | ratio (0–1) | Fraction of reads served from cache vs. storage | Higher is better. 0.7+ with multi-turn enabled. | +| **Read/Write Ratio** | ratio | Total reads / Total writes | Higher indicates read-heavy workload (typical for decode phase). | +| **Total Read (GB)** | GB | Total data read from all tiers | Workload volume indicator. | +| **Total Write (GB)** | GB | Total data written to all tiers | Workload volume indicator. 
| + +--- + +### Per-Tier I/O Volume + +These metrics show data movement through each tier of the cache hierarchy: + +| Metric | What It Measures | +|--------|------------------| +| **Tier GPU KV Bytes Written (GB)** | Data written to GPU VRAM tier | +| **Tier GPU KV Bytes Read (GB)** | Data read from GPU VRAM tier | +| **Tier CPU KV Bytes Written (GB)** | Data written to CPU RAM tier | +| **Tier CPU KV Bytes Read (GB)** | Data read from CPU RAM tier | +| **Tier Storage KV Bytes Written (GB)** | Data written to storage tier (NVMe, SATA, CXL, etc.) | +| **Tier Storage KV Bytes Read (GB)** | Data read from storage tier (NVMe, SATA, CXL, etc.) | + +> **Analyzing tier distribution:** +> - High GPU/CPU reads with low storage reads = hot data fits in fast tiers (good!) +> - High storage reads = working set exceeds fast tier capacity (consider adding memory) +> - **Tier Storage KV Bytes Read** is a key MLPerf differentiation metric (100% win rate in discovery testing) + +--- + +### Per-Tier Bandwidth (PRIMARY METRICS) + +These metrics measure the actual throughput achieved on each tier. 
**Tier Storage Bandwidth is the primary metric for comparing storage devices.** + +| Metric | Unit | What It Measures | +|--------|------|------------------| +| **Tier GPU Read Bandwidth (GB/s)** | GB/s | GPU VRAM read throughput | +| **Tier GPU Write Bandwidth (GB/s)** | GB/s | GPU VRAM write throughput | +| **Tier CPU Read Bandwidth (GB/s)** | GB/s | CPU RAM read throughput | +| **Tier CPU Write Bandwidth (GB/s)** | GB/s | CPU RAM write throughput | +| **Tier Storage Read Bandwidth (GB/s)** | GB/s | **⭐ Storage tier read throughput — PRIMARY** | +| **Tier Storage Write Bandwidth (GB/s)** | GB/s | **⭐ Storage tier write throughput — PRIMARY** | + +> **Expected bandwidth ranges:** +> - **GPU**: 500–2000 GB/s (HBM2e/HBM3) +> - **CPU**: 50–200 GB/s (DDR4/DDR5) +> - **Storage (NVMe Gen4)**: 3–7 GB/s +> - **Storage (NVMe Gen5)**: 10–14 GB/s +> - **Storage (SATA SSD)**: 0.4–0.6 GB/s +> - **Storage (CXL Memory)**: 30–50 GB/s + +--- + +### Tier Entry Distribution + +| Metric | What It Measures | +|--------|------------------| +| **GPU Entries** | Number of KV cache entries currently in GPU VRAM | +| **CPU Entries** | Number of KV cache entries currently in CPU RAM | +| **Storage Entries** | Number of KV cache entries currently on storage tier | + +> **Interpreting entry counts:** +> - Most entries should be in the fastest available tier for optimal performance. +> - High **Storage Entries** with low **GPU/CPU Entries** indicates memory pressure. +> - When `gpu_mem=0` and `cpu_mem=0`, all entries will be in **Storage Entries**. + +--- + +### Multi-turn Statistics + +| Metric | What It Measures | +|--------|------------------| +| **Multi-turn Hit Rate** | Fraction of requests that reused context from previous conversation turns | + +> **Interpreting Multi-turn Hit Rate:** +> - **High (0.6+)**: Effective conversation context caching. Most requests are follow-ups that reuse existing KV cache entries, reducing redundant computation. Typical for chatbot/assistant workloads. 
+> - **Low (<0.3)**: Indicates one or more of the following: +> - `--disable-multi-turn` is enabled (expected: 0.0) +> - Workload has high conversation turnover (users start new conversations frequently) +> - Single-shot API usage pattern (each request is independent) +> - Memory pressure causing cache eviction before context reuse +> - Short benchmark duration (not enough time for multi-turn patterns to emerge) +> +> **Note:** A low multi-turn hit rate is **not inherently bad**—it depends on your use case. For storage stress testing, low hit rates force more I/O which is often the goal. + +--- + +### Using Excel Metrics for Analysis + +**⭐ Primary Metrics for MLPerf Storage Comparison:** + +| Metric | When to Use | Why | +|--------|-------------|-----| +| **Tier Storage Read Bandwidth (GB/s)** | Always | Direct measure of storage read throughput | +| **Tier Storage Write Bandwidth (GB/s)** | Always | Direct measure of storage write throughput | +| **Storage Tier Read Device P95 (ms)** | Always | Raw storage read latency (excludes CPU overhead) | +| **Storage Tier Write Device P95 (ms)** | Always | Raw storage write latency (excludes CPU overhead) | +| **Avg Throughput (tok/s)** | When `gpu_mem=0, cpu_mem=0` | Wall-clock throughput equals storage throughput | + +**Comparing storage devices:** +1. Run identical benchmarks on each device with `--gpu-mem-gb 0 --cpu-mem-gb 0` +2. Compare **primary metrics**: Tier Storage Read/Write Bandwidth, Storage Tier Device P95 latencies +3. Use **Avg Throughput (tok/s)** as the overall performance score + +**Diagnosing performance issues:** +1. Check **Storage Tier Device P95** vs **Storage Tier Host P95** +2. If Device >> Host: Storage device is the bottleneck +3. If Host >> Device: CPU serialization is the bottleneck + +**Validating cache configuration:** +1. Check **Cache Hit Rate** and **Multi-turn Hit Rate** +2. Low hit rates with enabled caching: Working set too large for memory budget +3. 
Compare **Tier Storage KV Bytes Read** across configurations
+
+---
+
+## Unit Testing
+
+This package includes a comprehensive pytest-based test suite to verify core functionality without running the full benchmark.
+
+### Running Tests
+
+```bash
+# Run all tests with verbose output
+pytest test_kv_cache.py -v
+
+# Run with shorter traceback
+pytest test_kv_cache.py -v --tb=short
+
+# Run specific test class
+pytest test_kv_cache.py -k "TestModelConfig" -v
+
+# Deselect GPU tests entirely (they are also auto-skipped when CUDA is unavailable)
+pytest test_kv_cache.py -v -m "not skipif"
+```
+
+### Test Coverage
+
+The test suite covers 23 component categories with more than 170 individual tests:
+
+| Test Class | Tests | Coverage |
+|------------|-------|----------|
+| `TestConfigLoader` | 5 | YAML loading, strict schema validation, error on unknown keys, nested key access |
+| `TestCfgHelper` | 4 | Global `cfg()` helper, defaults when config not loaded, list value extraction |
+| `TestModelConfig` | 4 | Model configurations, KV cache size per token calculations, dtype handling |
+| `TestInferenceRequest` | 5 | Request dataclass, automatic cache key generation, phase handling, QoS assignment |
+| `TestQoSProfiles` | 5 | QoS levels (interactive/responsive/batch), SLA targets, priority ordering, p999/p9999 extended metrics |
+| `TestKVCacheGenerator` | 4 | Reproducible generation with seeds, correct tensor shapes, dtype consistency, precomputed buffers |
+| `TestCPUMemoryBackend` | 4 | Write/read/delete/clear operations, timing metadata, data integrity |
+| `TestNVMeBackend` | 5 | File I/O operations, .npy format handling, metadata persistence, temp directory cleanup |
+| `TestGPUMemoryBackend` | 4 | CUDA tensor placement, device memory management (skipped without GPU) |
+| `TestConversationManager` | 4 | Multi-turn conversation tracking, cache key management, LRU eviction |
+| `TestUserSimulator` | 3 | User profile generation from templates, QoS distribution validation |
+| `TestMultiTierCache` | 5 | 
CPU-only allocation paths, cache access patterns, tier selection logic | +| `TestMultiTierCacheWithGPU` | 4 | GPU tier allocation, waterfall eviction GPU→CPU→NVMe (skipped without GPU) | +| `TestXLSXExport` | 4 | CSV fallback, Excel export, run parameters embedding (skipped without pandas) | +| `TestEnums` | 3 | InferencePhase, GenerationMode, QoSLevel enum values | +| `TestTierLogic` | 3 | Tier ordering (GPU→CPU→NVMe), usage tracking, limit validation | +| `TestConfigDrivenConversationManager` | 2 | ConversationManager respects config.yaml settings | +| `TestConfigDrivenUserSimulator` | 3 | UserSimulator reads user_templates from config | +| `TestStatsNamingConvention` | 2 | `storage_*` naming convention validation for metrics keys | +| `TestGPUMemoryBackendEvictionCallback` | 2 | GPU eviction callback invocation and data passing (skipped without GPU) | +| `TestValidateArgs` | 24 | CLI argument validation: positive integers, ranges, memory limits, cache directory safety, forbidden prefixes | +| `TestPerTierPhaseMetrics` | 7 | Per-tier (GPU/CPU/Storage) KV bytes read/written tracking during prefill/decode phases | +| `TestPerTierPhaseMetricsWithGPU` | 4 | GPU tier metrics tracking, phase-aware read/write separation (skipped without GPU) | + +### Expected Runtime + +- **Without GPU**: ~5-10 seconds +- **With GPU**: ~10-15 seconds + +GPU tests are automatically skipped if CUDA is not available. + +--- + +## Excel Export + +The benchmark can export results directly to Excel or CSV format for analysis. 
+ +### Basic Usage + +```bash +python3 kv-cache.py \ + --config config.yaml \ + --model llama3.1-8b \ + --num-users 50 \ + --duration 120 \ + --gpu-mem-gb 0 \ + --cpu-mem-gb 4 \ + --seed 42 \ + --output results.json \ + --xlsx-output results.xlsx +``` + +### Output Format + +The Excel file contains a single row with all key metrics: + +| Column | Description | +|--------|-------------| +| Model | Model configuration used | +| Num Users | Concurrent user count | +| Duration (s) | Benchmark duration | +| GPU Mem (GB) | GPU memory budget | +| CPU Mem (GB) | CPU memory budget | +| Total Requests | Requests completed | +| Total Tokens | Tokens processed | +| Avg Throughput (tok/s) | Wall-clock throughput | +| Storage Throughput (tok/s) | Storage I/O throughput | +| Cache Hit Rate | Percentage of cache hits | +| E2E Latency P95 (ms) | End-to-end 95th percentile | +| Storage IO P95 (ms) | Storage I/O 95th percentile | + +### Fallback Behavior + +- **With openpyxl**: Exports to `.xlsx` format +- **Without openpyxl**: Falls back to `.csv` format +- **Without pandas**: Export is skipped with a warning + +--- + +## MLPerf Submission Guidelines + +For official MLPerf v3.0 storage submissions, use these standardized commands. **These invocations have been validated through extensive discovery testing** (1,411 Fast system tests, 268 Slow system tests comparing 14,000 MB/s vs 3,000 MB/s storage). + +### Discovery Test Key Findings + +| Finding | Impact | +|---------|--------| +| **Metric selection depends on cpu_mem** | Storage Throughput shows only 1.1x at cpu_mem=0GB but 2.2x at cpu_mem=4GB | +| **Best models for differentiation** | llama3.1-8b and mistral-7b show 2.31x ratio | +| **High variance observed** | CV 50-125%, requires 3-5 trials minimum | +| **100% win rate metrics** | Decode Bytes Read and Wall-Clock Throughput at cpu_mem=0GB | + +### Option 1: Maximum Storage Stress (cpu_mem=0GB) + +Use when you want to stress test NVMe and measure I/O volume differentiation. 
+ +**Primary Metrics:** Decode Bytes Read (2.62x differentiation), Wall-Clock Throughput (2.43x differentiation) + +```bash +# MLPerf v3.0: Maximum Storage Stress Test (8B Model) +# Run 3-5 trials for statistical significance +for trial in 1 2 3 4 5; do + python3 kv-cache.py \ + --config config.yaml \ + --model llama3.1-8b \ + --num-users 200 \ + --duration 300 \ + --gpu-mem-gb 0 \ + --cpu-mem-gb 0 \ + --max-concurrent-allocs 16 \ + --generation-mode none \ + --cache-dir /mnt/nvme \ + --seed 42 \ + --output mlperf_v3_stress_8b_trial${trial}.json +done +``` + +**⚠️ Important:** At cpu_mem=0GB, do NOT use Storage Throughput as your primary metric—use Decode Bytes Read or Wall-Clock Throughput instead. + +### Option 2: Storage Throughput Focus (cpu_mem=4GB) + +Use when you want Storage Throughput (tok/s) as your primary metric. + +**Primary Metric:** Storage Throughput (2.2x differentiation, 97% win rate) + +```bash +# MLPerf v3.0: Storage Throughput Test (8B Model) +for trial in 1 2 3 4 5; do + python3 kv-cache.py \ + --config config.yaml \ + --model llama3.1-8b \ + --num-users 100 \ + --duration 300 \ + --gpu-mem-gb 0 \ + --cpu-mem-gb 4 \ + --max-concurrent-allocs 0 \ + --generation-mode none \ + --cache-dir /mnt/nvme \ + --seed 42 \ + --output mlperf_v3_throughput_8b_trial${trial}.json +done +``` + +### Option 3: Large Model Submission (70B) + +For maximum per-request storage stress (2.5× larger KV cache per token: 320 KB vs 128 KB): + +```bash +# MLPerf v3.0: Large Model Storage Stress +for trial in 1 2 3; do + python3 kv-cache.py \ + --config config.yaml \ + --model llama3.1-70b-instruct \ + --num-users 70 \ + --duration 300 \ + --gpu-mem-gb 0 \ + --cpu-mem-gb 0 \ + --max-concurrent-allocs 4 \ + --generation-mode none \ + --cache-dir /mnt/nvme \ + --seed 42 \ + --output mlperf_v3_stress_70b_trial${trial}.json +done +``` + +### Critical Parameters (Discovery-Validated) + +| Parameter | Value | Rationale | +|-----------|-------|-----------| +| **--config 
config.yaml** | Required | Ensures consistent internal settings | +| **--seed 42** | Required | Reproducibility across systems | +| **--gpu-mem-gb 0** | Required | Isolates storage performance | +| **--cpu-mem-gb** | 0 or 4 | 0GB for max stress (use I/O volume metrics), 4GB for Storage Throughput metric | +| **--max-concurrent-allocs** | 0, 4, or 16 | 0 for throughput, 16 for stress testing | +| **--generation-mode** | none or realistic | none for pure I/O, realistic for production simulation | +| **--num-users** | 100-200 | Differentiation stable across range; higher = more throughput | +| **--duration** | 300-600 | 5-10 minutes for stable metrics | + +### Trial Requirements + +| User Count | Variance (CV) | Minimum Trials | +|------------|---------------|----------------| +| 10 users | ~52% | 3 | +| 50-100 users | ~115-125% | 3-5 | +| 200 users | ~110-120% | 3-5 | + +Report **median** rather than mean for publication-quality results. + +--- + +## Troubleshooting + +### Out of Memory Errors + +Reduce the number of concurrent users or limit parallel allocations: + +```bash +python3 kv-cache.py --config config.yaml ... --max-concurrent-allocs 50 +``` + +### Benchmark Hangs + +The system may be thrashing. Reduce users or increase memory budgets. + +### Poor Cache Hit Rates + +Low hit rates indicate your working set exceeds available fast memory. Either: +- Increase GPU/CPU memory budgets +- Reduce user count +- Accept that cold data will hit storage + +### Results Vary Between Runs + +Use the `--seed` flag for reproducible results. + +### Configuration Validation Errors + +If you see "Unknown configuration key" errors, check your `config.yaml` for typos. The benchmark uses strict schema validation to prevent silent misconfigurations. 
+ +--- + +## Files in This Package + +- `kv-cache.py`: Main benchmark implementation with ShareGPT support +- `config.yaml`: YAML configuration file for internal parameters +- `test_kv_cache.py`: Pytest unit test suite +- `requirements.txt`: Python dependencies +- `README.md`: This documentation +- `MLperf v3 KV cache proposal.md`: Detailed technical documentation + +--- + +## License + +Apache License 2.0 + +--- + +## Contact + +For questions or feedback, open an issue on the repository or contact the MLPerf Storage Working Group. From 166f2b2312670cf23bc46789591069e538ea0793 Mon Sep 17 00:00:00 2001 From: Hazem Awadallah Date: Tue, 27 Jan 2026 15:44:50 -0800 Subject: [PATCH 08/43] test(results): add pytest HTML test report - Add kv-cache-test-report.html with full test execution results - All 170+ tests passing for v3.0 features - Create unit_test_results directory for test artifacts --- .../tests/unit_test_results/kv-cache-test-report.html | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kv_cache_benchmark/tests/unit_test_results/kv-cache-test-report.html b/kv_cache_benchmark/tests/unit_test_results/kv-cache-test-report.html index 1f4a7fa3..4dc72edf 100644 --- a/kv_cache_benchmark/tests/unit_test_results/kv-cache-test-report.html +++ b/kv_cache_benchmark/tests/unit_test_results/kv-cache-test-report.html @@ -328,7 +328,7 @@

kv-cache-test-report.html

-

Report generated on 12-Jan-2026 at 16:00:59 by pytest-html +

Report generated on 27-Jan-2026 at 11:38:56 by pytest-html v4.1.1

Environment

@@ -382,7 +382,7 @@

Environment

Summary

-

112 tests took 00:01:19.

+

172 tests took 00:01:18.

(Un)check the boxes to filter the results.