From 18703fa757e858b8f361bfbe9c5f56a8f1eb70ba Mon Sep 17 00:00:00 2001 From: Bryan Deng Date: Sat, 16 May 2026 12:43:40 +0800 Subject: [PATCH] docs: update references for subprocess_worker rename --- .../scripts/faang-daily-close/0.1.0/builder.py | 2 +- dev-docs/SPEC-backend.md | 14 ++++++++------ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/builders/scripts/faang-daily-close/0.1.0/builder.py b/builders/scripts/faang-daily-close/0.1.0/builder.py index fdc408d..e9b4278 100644 --- a/builders/scripts/faang-daily-close/0.1.0/builder.py +++ b/builders/scripts/faang-daily-close/0.1.0/builder.py @@ -11,7 +11,7 @@ def build( dependencies: dict[str, dict[datetime, list[dict]]], timestamp: datetime ) -> list[dict[str, Any]]: """Fetch NYSE daily close prices for FAANG stocks from EODHD.""" - # APIClient is instantiated here, not at module level — isolated_worker.py + # APIClient is instantiated here, not at module level — subprocess_worker.py # loads the .env before calling build() but after importing the module, # so module-level env access would fail api = APIClient(os.environ["EODHD_API_KEY"]) diff --git a/dev-docs/SPEC-backend.md b/dev-docs/SPEC-backend.md index f9afdb0..523537e 100644 --- a/dev-docs/SPEC-backend.md +++ b/dev-docs/SPEC-backend.md @@ -99,7 +99,7 @@ Each entry in `rows` contains all data dicts for that timestamp (matching the DB - It dynamically imports builder scripts to run them (see below). - All builder scripts will be implemented by internal users, so we may trust that all code is safe. - But we must not trust that builder scripts will not crash, so each builder invocation runs in an isolated subprocess. - - Each builder runs in its own `subprocess.Popen` process, communicating via JSON over stdin/stdout. A standalone worker script (`isolated_worker.py`) handles deserialization, builder import, and result serialization. It uses only stdlib so it works in any venv. + - Each builder runs in its own `subprocess.Popen` process, communicating via JSON over stdin/stdout. A standalone worker script (`subprocess_worker.py`) handles deserialization, builder import, and result serialization. It uses only stdlib so it works in any venv. - If the subprocess crashes, the main process catches the failure cleanly without going down. - Builders **never** have direct access to the database -- all reads and writes are handled by the server. For now, this is enforced by convention. TODO: add a runtime guard to enforce this. - After builder scripts are run, we upload the data to the Postgres database (see below). @@ -218,15 +218,16 @@ builders/server/ ├── db/ # database connection management and queries │ ├── connection.py │ └── datasets.py -├── runtime/ # config loading, subprocess isolation, schema validation, venv management +├── runtime/ # config loading, schema validation, venv management │ ├── config.py │ ├── registry.py # startup preload + in-memory config registry -│ ├── isolated_worker.py # standalone worker script (stdlib-only, runs in builder subprocesses) │ ├── loader.py │ ├── runner.py │ ├── serialization.py # JSON serialization for subprocess IPC │ ├── validator.py │ └── venv_management.py # per-builder venv creation and caching +├── workers/ # subprocess entry points +│ └── subprocess_worker.py # standalone worker script (stdlib-only, runs in builder subprocesses) ├── utils/ # shared utilities │ ├── retry.py # generic retry with exponential backoff │ └── semver.py @@ -235,6 +236,7 @@ builders/server/ ├── service/ ├── db/ ├── runtime/ + ├── workers/ └── utils/ ``` @@ -474,7 +476,7 @@ Builder scripts may need secrets or config (API keys, credentials) passed via en **Runtime behavior**: - The `.env` file is validated at build time, not config load time. This means CI can load configs for datasets with `env-vars = true` without needing the actual `.env` file present. - If `env-vars` is `true` and the `.env` file is missing at build time, a `FileNotFoundError` is raised. -- The main server process never reads the `.env` values. The `.env` file is parsed inside the subprocess only (using a minimal stdlib-only parser in `isolated_worker.py`), so secrets never enter the parent process memory. +- The main server process never reads the `.env` values. The `.env` file is parsed inside the subprocess only (using a minimal stdlib-only parser in `subprocess_worker.py`), so secrets never enter the parent process memory. - Environment variables are scoped to the subprocess and do not leak to the parent. ### Per-builder virtual environments @@ -514,8 +516,8 @@ builders/scripts/// Builder subprocesses use `subprocess.Popen` with JSON over stdin/stdout for IPC: 1. The runner serializes builder inputs (dependencies, timestamp, paths, env file) to JSON via `serialization.py` -2. The subprocess runs `isolated_worker.py` using the builder's venv python (or system python) -3. `isolated_worker.py` is stdlib-only: it deserializes input, imports and runs the builder, serializes output +2. The subprocess runs `subprocess_worker.py` using the builder's venv python (or system python) +3. `subprocess_worker.py` is stdlib-only: it deserializes input, imports and runs the builder, serializes output 4. The runner deserializes the JSON response from stdout This model supports per-builder venvs since each subprocess uses its own Python interpreter. The worker script has no dependencies on server code.