kruize · bharathappali · Jan 12, 2026 · Jan 12, 2026 · Jan 12, 2026 · Jan 12, 2026
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,25 @@
+FROM python:3.11-slim-bookworm
+# Fail a piped RUN when any stage fails (needed for the "curl | tar" step below).
+SHELL ["/bin/bash", "-o", "pipefail", "-c"]
+
+# Java 17 runtime (required by Spark) plus curl for the Spark download.
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends curl openjdk-17-jre-headless && \
+    rm -rf /var/lib/apt/lists/*
+
+# Spark release and Hadoop build profile; pyspark below is pinned to SPARK_VERSION.
+ENV SPARK_VERSION=3.5.1 HADOOP_VERSION=3
+
+# Use archive.apache.org: downloads.apache.org only keeps current releases.
+RUN curl -fsSL https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz \
+    | tar -xz -C /opt/ && \
+    mv /opt/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION} /opt/spark
+
+ENV SPARK_HOME=/opt/spark
+ENV PATH=$SPARK_HOME/bin:$PATH
+
+# Python bindings must match the Spark version installed in SPARK_HOME.
+RUN pip install --no-cache-dir "pyspark==${SPARK_VERSION}"
+
+WORKDIR /app
+ENTRYPOINT ["python"]
diff --git a/README.md b/README.md
@@ -1,2 +1,117 @@
 # data-gen
-Scripts to generate synthetic usage data to test kruize on kind or minikube
+
+# Synthetic Kubernetes Workload Usage Generator
+
+This project generates **synthetic Kubernetes-style metrics** and converts them into **TSDB blocks**.
+The goal is to create realistic, large-scale time-series data for testing ingestion pipelines, storage systems, and query performance without requiring real cluster metrics.
+
+The pipeline simulates infrastructure, produces time-series signals, converts them into OpenMetrics format, and finally builds TSDB blocks.
+
+---
+
+## Data Generation Flow
+
+The pipeline runs in four stages:
+
+### 1. Configuration generation
+
+A synthetic environment configuration is created first.
+
+* Defines clusters, namespaces, workloads, pods, and containers
+* Assigns resource limits and utilization behavior
+* Schedules workloads onto simulated nodes
+* Defines the time range and sampling interval
+
+The configuration is generated **from `mapping.json`**, which provides the base cluster → namespace → workload relationships.
+The output is a `meta.json` file that fully describes the simulated environment.
+
+---
+
+### 2. Synthetic metric generation
+
+Using the generated configuration:
+
+* Time-series metrics are emitted for CPU, memory, and GPU
+* Metrics are generated per container across the full time window
+* Cumulative counters evolve over time
+* Labels reflect infrastructure topology and runtime identity
+
+This produces structured metric data aligned with the environment model.
+
+---
+
+### 3. OpenMetrics conversion
+
+Generated metric data is converted into **OpenMetrics (OMF)** format using a converter script.
+
+This prepares the data for ingestion into Prometheus-compatible tooling.
+
+---
+
+### 4. TSDB block creation
+
+OpenMetrics data is converted into **TSDB blocks** using the Prometheus `promtool` utility.
+
+These blocks can be loaded directly into a TSDB for benchmarking or testing.
+
+---
+
+## Project Structure
+
+```
+.
+├── generate_config.py              # Creates synthetic environment config
+├── data-gen.py                     # Generates time-series metrics
+├── consts/                         # Resource limits and constants
+├── data/
+│   ├── metadata/
+│   │   └── mapping.json            # Base topology mapping
+│   └── configs/
+│       └── <config_name>/
+│           └── meta.json
+│
+├── converters/                     # Metric → OpenMetrics conversion (external)
+└── tsdb/                           # OMF → TSDB block creation (promtool)
+```
+
+---
+
+## How to Run
+
+### Step 1 — Generate configuration
+
+```
+python generate_config.py --config-name <config_name>
+```
+
+This creates:
+
+```
+data/configs/<config_name>/meta.json
+```
+
+The configuration defines the entire simulated environment.
+
+---
+
+### Step 2 — Generate synthetic metrics
+
+```
+python data-gen.py --config-name <config_name>
+```
+
+Metrics are generated using the configuration.
+
+---
+
+### Step 3 — Convert metrics to OpenMetrics
+
+Not yet implemented. The planned conversion is described under "3. OpenMetrics conversion" above.
+
+---
+
+### Step 4 — Generate TSDB blocks
+
+Not yet implemented. The planned `promtool`-based step is described under "4. TSDB block creation" above.
+
+
diff --git a/consts/__init__.py b/consts/__init__.py
diff --git a/consts/constants.py b/consts/constants.py
@@ -0,0 +1,74 @@
+class Constants:
+    """Constant values used by the data generator, grouped by purpose."""
+
+    KRUIZE_TEAM_NAMES = [
+        "dinakar", "rebecca", "rashmi", "bhakta", "kusuma",
+        "chandrakala", "pinky", "vinay", "saad", "bhanvi",
+        "shreya", "shekhar", "nick", "bharath",
+    ]
+
+    NAMESPACE = "namespace"
+
+    # Interval strings offered as choices; InputConsts.DEFAULT_INTERVAL is one of them.
+    INTERVAL_CHOICES = ["1s", "5s", "15s", "30s", "60s"]
+
+    class InputConsts:
+        """Default values for generator inputs."""
+
+        DEFAULT_NUM_NAMESPACES = 183
+        DEFAULT_MIN_DEPLOYMENTS = 1
+        DEFAULT_MAX_DEPLOYMENTS = 25
+        DEFAULT_MIN_REPLICAS = 1
+        DEFAULT_MAX_REPLICAS = 10
+        DEFAULT_INTERVAL = "30s"
+        DEFAULT_PRE_DAYS = 15
+        DEFAULT_POST_DAYS = 15
+        DEFAULT_CONFIG_NAME = "default"
+
+    class ResourceConstraints:
+        """Numeric bounds for CPU, memory and GPU values (units are not stated here)."""
+
+        ZERO_VAL = 0.0
+
+        # CPU: overall bounds, then bounds for the min / avg / max figures.
+        MIN_CPU = 0.01
+        MAX_CPU = 8.0
+
+        MIN_MIN_CPU = MIN_CPU
+        MAX_MIN_CPU = 0.1
+
+        MIN_AVG_CPU = 0.5
+        MAX_AVG_CPU = 3.5
+
+        MIN_MAX_CPU = 2.0
+        MAX_MAX_CPU = MAX_CPU
+
+        # Bounds for idle CPU values.
+        IDLE_CPU_MIN = 0.00001
+        IDLE_CPU_MAX = 0.0001
+
+        # Memory: overall bounds, then bounds for the min / avg / max figures.
+        MIN_MEMORY = 50.0
+        MAX_MEMORY = 4000.0
+
+        MIN_MIN_MEMORY = MIN_MEMORY
+        MAX_MIN_MEMORY = 150.0
+
+        MIN_AVG_MEMORY = 200.0
+        MAX_AVG_MEMORY = 2000.0
+
+        MIN_MAX_MEMORY = 1500.0
+        MAX_MAX_MEMORY = MAX_MEMORY
+
+        # GPU: overall bounds, then bounds for the min / avg / max figures.
+        MIN_GPU = 1.0
+        MAX_GPU = 100.0
+
+        MIN_MIN_GPU = MIN_GPU
+        MAX_MIN_GPU = 25.0
+
+        MIN_AVG_GPU = 25.0
+        MAX_AVG_GPU = 65.0
+
+        MIN_MAX_GPU = 65.0
+        MAX_MAX_GPU = MAX_GPU