From 9b5176b9fa5604df39eaaec883db5a7db3da37e3 Mon Sep 17 00:00:00 2001 From: Carst Vaartjes Date: Sat, 28 Mar 2026 11:35:09 +0100 Subject: [PATCH] feat: add configurable DuckDB memory limit via DUCKDB_MEMORY_LIMIT env var --- RELEASE_NOTES.md | 6 ++++++ parquery/__init__.py | 2 +- parquery/aggregate_duckdb.py | 11 ++++++++++- 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index 1aa7664..0dde145 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -1,5 +1,11 @@ # Release Notes for ParQuery +## Release 2.0.7 +- Add configurable DuckDB memory limit via `DUCKDB_MEMORY_LIMIT` environment variable +- Prevents DuckDB OOM on containers with limited memory (e.g. ECS tasks with multiple Gunicorn workers) +- When set (e.g. `DUCKDB_MEMORY_LIMIT=2GB`), DuckDB spills to temp storage instead of allocating unbounded memory +- No performance penalty — benchmark shows equal or faster execution with bounded memory + ## Release 2.0.6 - Add specific functions to public api for imports diff --git a/parquery/__init__.py b/parquery/__init__.py index 7f31edb..8529ada 100644 --- a/parquery/__init__.py +++ b/parquery/__init__.py @@ -27,7 +27,7 @@ from parquery.write import df_to_parquet pre_release_version = os.getenv("PRE_RELEASE_VERSION", "") -__version__: str = pre_release_version if pre_release_version else "2.0.6" +__version__: str = pre_release_version if pre_release_version else "2.0.7" __all__ = [ "aggregate_pq", diff --git a/parquery/aggregate_duckdb.py b/parquery/aggregate_duckdb.py index 19d0ff5..367894f 100644 --- a/parquery/aggregate_duckdb.py +++ b/parquery/aggregate_duckdb.py @@ -2,6 +2,7 @@ import gc import logging +import os from typing import Any try: @@ -17,6 +18,10 @@ logger = logging.getLogger(__name__) +# DuckDB memory limit per connection. Set via DUCKDB_MEMORY_LIMIT env var. +# Examples: "4GB", "2GB", "512MB". Default: no limit (DuckDB manages automatically).
+DUCKDB_MEMORY_LIMIT: str | None = os.environ.get("DUCKDB_MEMORY_LIMIT") + def aggregate_pq_duckdb( file_name: str, @@ -134,8 +139,12 @@ def call_duckdb(sql) -> Any: - Uses in-memory database (:memory:) for temporary processing. - Connection is automatically closed after query execution. - Results are streamed via RecordBatchReader and converted to Table. + - Set DUCKDB_MEMORY_LIMIT env var (e.g. "4GB") to cap memory per query. """ - conn = duckdb.connect(":memory:") + config = {} + if DUCKDB_MEMORY_LIMIT: + config["memory_limit"] = DUCKDB_MEMORY_LIMIT + conn = duckdb.connect(":memory:", config=config) # arrow() returns a RecordBatchReader, convert to Table reader = conn.execute(sql).arrow() result_arrow = reader.read_all()