From aab762eca1fba956f3e7489588b9b2a0eb468db0 Mon Sep 17 00:00:00 2001 From: Kaushik Iska Date: Wed, 1 Apr 2026 10:04:37 -0500 Subject: [PATCH 01/13] Stream query results to bound memory during scans MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both the HTTP and binary protocols previously buffered the entire ClickHouse response in memory before returning any rows to the Postgres executor. A query against a 1 GB table caused the backend to consume ~980 MB of RSS, making it easy to OOM a memory-constrained Postgres instance. Replace the buffered simple_query path with a streaming_query path for FDW scans. Both protocols now run the query inside a minicoro coroutine. The data callback (curl write_data for HTTP, OnDataCancelable for binary) yields to the caller when a batch of rows is ready. The executor resumes the coroutine to fetch the next batch. Memory is bounded to roughly fetch_size × row_size per batch regardless of total result set size. HTTP: curl_easy_perform runs inside a coroutine. write_data counts TSV rows and yields when fetch_size rows accumulate. On resume, the previous batch is compacted and curl continues receiving. Binary: Client::Select runs inside a coroutine. OnDataCancelable yields each block to the consumer. The consumer iterates rows within the block, then resumes for the next one. Supporting changes: - Activate the fetch_size FDW option (server and table level, default 50000). The existing plumbing from the postgres_fdw fork was present but never wired up. - Add pg_clickhouse.max_result_size GUC (integer, MB, default 0 meaning disabled) for a hard abort when a response exceeds a configured size. - Add a streaming_query method to the libclickhouse_methods vtable so FDW scans use streaming while non-scan operations (IMPORT FOREIGN SCHEMA, INSERT) keep the original buffered path. - Add clickhouseReScanForeignScan (previously aliased to End) so parameterized rescans properly reset streaming state. - Vendor minicoro (github.com/edubart/minicoro) as a git submodule for stackful coroutine support on all platforms. Measured with 1M rows (~1 GB of TSV data): Before: 980 MB peak RSS After: 66 MB peak RSS --- .gitmodules | 3 + Makefile | 2 +- src/binary.cpp | 270 +++++++++++++++++++++++++++++ src/fdw.c.in | 50 +++++- src/http.c | 390 ++++++++++++++++++++++++++++++++++++++++++ src/include/binary.hh | 14 ++ src/include/fdw.h | 8 + src/include/http.h | 15 ++ src/minicoro.c | 6 + src/option.c | 21 +++ src/pglink.c | 287 +++++++++++++++++++++++++++++-- vendor/minicoro | 1 + 12 files changed, 1051 insertions(+), 16 deletions(-) create mode 100644 src/minicoro.c create mode 160000 vendor/minicoro diff --git a/.gitmodules b/.gitmodules index 55b8de33..b8157233 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,6 @@ [submodule "clickhouse-cpp"] path = vendor/clickhouse-cpp url = https://github.com/ClickHouse/clickhouse-cpp.git +[submodule "vendor/minicoro"] + path = vendor/minicoro + url = https://github.com/edubart/minicoro.git diff --git a/Makefile b/Makefile index a2a57b82..ba317a5e 100644 --- a/Makefile +++ b/Makefile @@ -56,7 +56,7 @@ else endif # Add include directories. -PG_CPPFLAGS = -I./src/include -I$(CH_CPP_DIR) -I$(CH_CPP_DIR)/contrib/absl +PG_CPPFLAGS = -I./src/include -I$(CH_CPP_DIR) -I$(CH_CPP_DIR)/contrib/absl -Ivendor/minicoro # Include other libraries compiled into clickhouse-cpp. PG_LDFLAGS = -lstdc++ -lssl -lcrypto $(shell $(CURL_CONFIG) --libs) diff --git a/src/binary.cpp b/src/binary.cpp index e6882243..d6117071 100644 --- a/src/binary.cpp +++ b/src/binary.cpp @@ -1210,4 +1210,274 @@ extern "C" if (state->error) free(state->error); } +/* + * ============================================================ + * Streaming binary API + * + * Uses minicoro to run Client::Select inside a coroutine. + * The OnDataCancelable callback yields each block back to the + * consumer. No threads, no synchronization. + * ============================================================ + */ + +#include "minicoro.h" + +struct ch_binary_streaming_state +{ + mco_coro *co; + + /* Current block yielded by the coroutine */ + Block *current_block; /* points into coroutine stack; valid until next resume */ + size_t current_row; + bool have_block; /* true if current_block is valid */ + bool done; /* coroutine has finished */ + + /* Per-row scratch arrays (allocated once) */ + Oid *coltypes; + Datum *values; + bool *nulls; + size_t columns_count; + + /* Error / cancel */ + char *error; + bool canceled; + bool (*check_cancel)(void); + + /* Query details (owned, for coroutine to use) */ + Client *client; + std::string sql; + QuerySettings settings; + QueryParams params; +}; + +/* + * Coroutine function: runs Client::Select, yielding each block. + */ +static void +binary_streaming_coro(mco_coro *co) +{ + ch_binary_streaming_state *st = + (ch_binary_streaming_state *)mco_get_user_data(co); + + try + { + st->client->Select( + clickhouse::Query(st->sql) + .SetQuerySettings(st->settings) + .SetParams(st->params) + .OnDataCancelable( + [st, co](const Block & block) -> bool + { + if (st->check_cancel && st->check_cancel()) + { + st->error = strdup("query was canceled"); + return false; + } + + if (st->canceled) + return false; + + if (block.GetColumnCount() == 0) + return true; + + /* Make the block available to the consumer and yield */ + st->current_block = const_cast(&block); + st->columns_count = block.GetColumnCount(); + st->have_block = true; + st->current_row = 0; + + mco_yield(co); + + /* Resumed by consumer — check if canceled */ + return !st->canceled; + } + ) + ); + } + catch (const std::exception & e) + { + if (!st->error) + st->error = strdup(e.what()); + } + + st->have_block = false; + st->done = true; +} + +ch_binary_streaming_state * +ch_binary_begin_streaming(ch_binary_connection_t * conn, + const ch_query * query, + bool (*check_cancel)(void)) +{ + mco_desc desc; + mco_result res; + + ch_binary_streaming_state *st = new (std::nothrow) ch_binary_streaming_state(); + if (!st) + return NULL; + + st->co = NULL; + st->current_block = NULL; + st->current_row = 0; + st->have_block = false; + st->done = false; + st->coltypes = NULL; + st->values = NULL; + st->nulls = NULL; + st->columns_count = 0; + st->error = NULL; + st->canceled = false; + st->check_cancel = check_cancel; + st->client = (Client *)conn->client; + st->sql = query->sql; + st->settings = ch_binary_settings(query); + st->params = ch_binary_params(query); + + desc = mco_desc_init(binary_streaming_coro, 1024 * 1024); + desc.user_data = st; + res = mco_create(&st->co, &desc); + if (res != MCO_SUCCESS) + { + st->error = strdup(mco_result_description(res)); + st->done = true; + return st; + } + + /* Resume to start the query and get the first block */ + mco_resume(st->co); + + return st; +} + +/* + * Fetch the next block from the stream. + * Returns true if a new block is available. + */ +bool +ch_binary_fetch_block(ch_binary_streaming_state * st) +{ + if (!st || st->done) + return false; + + if (st->have_block) + return true; + + /* Resume coroutine to get next block */ + if (mco_status(st->co) == MCO_SUSPENDED) + mco_resume(st->co); + + return st->have_block; +} + +/* + * Read the next row from the current block. Returns false when the + * current block is exhausted (caller should call ch_binary_fetch_block). + */ +bool +ch_binary_streaming_read_row(ch_binary_streaming_state * st) +{ + if (!st || !st->have_block || !st->current_block) + return false; + + Block & block = *st->current_block; + size_t row_count = block.GetRowCount(); + + if (st->current_row >= row_count) + { + /* Block exhausted — mark it consumed so next fetch_block resumes */ + st->have_block = false; + return false; + } + + /* Allocate scratch arrays on first use */ + if (!st->coltypes && st->columns_count > 0) + { + st->coltypes = new Oid[st->columns_count]; + st->values = new Datum[st->columns_count]; + st->nulls = new bool[st->columns_count]; + } + + try + { + for (size_t i = 0; i < st->columns_count; i++) + { + st->values[i] = make_datum(block[i], st->current_row, + &st->coltypes[i], &st->nulls[i]); + } + } + catch (const std::exception & e) + { + if (!st->error) + st->error = strdup(e.what()); + return false; + } + + st->current_row++; + return true; +} + +size_t +ch_binary_streaming_columns(ch_binary_streaming_state * st) +{ + return st ? st->columns_count : 0; +} + +Datum +ch_binary_streaming_value(ch_binary_streaming_state * st, size_t col, + Oid * valtype, bool * is_null) +{ + if (!st || col >= st->columns_count) + { + *is_null = true; + *valtype = InvalidOid; + return (Datum)0; + } + + *valtype = st->coltypes[col]; + *is_null = st->nulls[col]; + return st->values[col]; +} + +char * +ch_binary_streaming_error(ch_binary_streaming_state * st) +{ + return st ? st->error : NULL; +} + +void +ch_binary_end_streaming(ch_binary_streaming_state * st) +{ + if (!st) + return; + + /* + * If the coroutine is still suspended (query not fully consumed), + * destroy it without resuming. mco_destroy handles suspended + * coroutines. Then reset the connection to cancel any in-flight + * ClickHouse query so the server doesn't keep processing. + */ + if (st->co) + { + bool was_suspended = (mco_status(st->co) == MCO_SUSPENDED); + + mco_destroy(st->co); + st->co = NULL; + + if (was_suspended) + st->client->ResetConnection(); + } + + if (st->coltypes) + { + delete[] st->coltypes; + delete[] st->values; + delete[] st->nulls; + } + + if (st->error) + free(st->error); + + delete st; +} + } diff --git a/src/fdw.c.in b/src/fdw.c.in index 1ffafd5f..569ec835 100644 --- a/src/fdw.c.in +++ b/src/fdw.c.in @@ -34,7 +34,9 @@ #endif /* extension includes. */ +#include "commands/defrem.h" #include "utils/builtins.h" +#include "utils/guc.h" #include "binary.hh" #include "internal.h" #include "fdw.h" @@ -205,6 +207,7 @@ static int double *totaldeadrows); static void clickhouseBeginForeignScan(ForeignScanState * node, int eflags); static TupleTableSlot * clickhouseIterateForeignScan(ForeignScanState * node); +static void clickhouseReScanForeignScan(ForeignScanState * node); static void clickhouseEndForeignScan(ForeignScanState * node); static List * clickhousePlanForeignModify(PlannerInfo * root, ModifyTable * plan, @@ -355,6 +358,25 @@ clickhouseGetForeignRelSize(PlannerInfo * root, fpinfo->fdw_startup_cost = DEFAULT_FDW_STARTUP_COST; fpinfo->fdw_tuple_cost = DEFAULT_FDW_TUPLE_COST; fpinfo->shippable_extensions = NIL; + fpinfo->fetch_size = 50000; + + /* Server-level fetch_size overrides default. */ + foreach(lc, fpinfo->server->options) + { + DefElem *def = (DefElem *) lfirst(lc); + + if (strcmp(def->defname, "fetch_size") == 0) + (void) parse_int(defGetString(def), &fpinfo->fetch_size, 0, NULL); + } + + /* Table-level fetch_size overrides server-level. */ + foreach(lc, fpinfo->table->options) + { + DefElem *def = (DefElem *) lfirst(lc); + + if (strcmp(def->defname, "fetch_size") == 0) + (void) parse_int(defGetString(def), &fpinfo->fetch_size, 0, NULL); + } chfdw_apply_custom_table_options(fpinfo, foreigntableid); @@ -949,8 +971,13 @@ clickhouseIterateForeignScan(ForeignScanState * node) values); } - fsstate->ch_cursor = fsstate->conn.methods->simple_query(fsstate->conn.conn, - &query); + if (fsstate->fetch_size > 0 && + fsstate->conn.methods->streaming_query != NULL) + fsstate->ch_cursor = fsstate->conn.methods->streaming_query( + fsstate->conn.conn, &query, fsstate->fetch_size); + else + fsstate->ch_cursor = fsstate->conn.methods->simple_query( + fsstate->conn.conn, &query); time_used += fsstate->ch_cursor->request_time; MemoryContextSwitchTo(old); @@ -979,6 +1006,23 @@ clickhouseIterateForeignScan(ForeignScanState * node) return slot; } +/* + * clickhouseReScanForeignScan + * Restart the scan from scratch by disposing the current cursor. + * The next IterateForeignScan call will re-issue the query. + */ +static void +clickhouseReScanForeignScan(ForeignScanState * node) +{ + ChFdwScanState *fsstate = (ChFdwScanState *) node->fdw_state; + + if (fsstate && fsstate->ch_cursor) + { + MemoryContextDelete(fsstate->ch_cursor->memcxt); + fsstate->ch_cursor = NULL; + } +} + /* * clickhouseEndForeignScan * Finish scanning foreign table and dispose objects used for this scan @@ -2927,7 +2971,7 @@ clickhouse_fdw_handler(PG_FUNCTION_ARGS) routine->GetForeignPlan = clickhouseGetForeignPlan; routine->BeginForeignScan = clickhouseBeginForeignScan; routine->IterateForeignScan = clickhouseIterateForeignScan; - routine->ReScanForeignScan = clickhouseEndForeignScan; + routine->ReScanForeignScan = clickhouseReScanForeignScan; routine->EndForeignScan = clickhouseEndForeignScan; /* Functions for updating foreign tables */ diff --git a/src/http.c b/src/http.c index 10ff0095..5121c778 100644 --- a/src/http.c +++ b/src/http.c @@ -343,3 +343,393 @@ ch_http_response_free(ch_http_response_t * resp) free(resp); } + +/* + * ============================================================ + * Streaming HTTP API + * + * Uses minicoro to run curl_easy_perform inside a coroutine. + * The write callback counts complete TSV rows; when a batch is + * ready it yields back to the caller. Resuming the coroutine + * continues the transfer. No threads, no curl_multi. + * ============================================================ + */ + +#include "minicoro.h" + +struct ch_http_streaming_state +{ + mco_coro *co; + ch_http_connection_t *conn; + + /* Owned resources that need cleanup */ + char *url; + struct curl_slist *headers; + curl_mime *form; + + bool transfer_done; /* coroutine has finished */ + + /* Rolling buffer: raw bytes from curl */ + char *buf; + size_t buf_alloc; + size_t buf_len; /* total bytes in buf */ + size_t batch_end; /* offset past last complete row in batch */ + int rows_ready; /* complete rows accumulated */ + int fetch_size; + + /* Error / status */ + long http_status; + char errbuffer[CURL_ERROR_SIZE]; + char *error; + double pretransfer_time; + double total_time; + char query_id[37]; + + /* Cumulative bytes received (for max_result_size check) */ + size_t total_bytes; +}; + +/* + * Streaming write callback. Appends data to the rolling buffer and counts + * complete TSV rows (terminated by '\n'). When fetch_size rows have been + * accumulated, yields the coroutine back to the caller. + */ +static size_t +write_data_streaming(void *contents, size_t size, size_t nmemb, void *userp) +{ + size_t realsize = size * nmemb; + ch_http_streaming_state *st = userp; + + /* Grow buffer if needed */ + if (st->buf_len + realsize + 1 > st->buf_alloc) + { + size_t newsize = st->buf_alloc * 2; + + if (newsize < st->buf_len + realsize + 1) + newsize = st->buf_len + realsize + 1; + + char *newbuf = realloc(st->buf, newsize); + + if (!newbuf) + return 0; /* signal error to curl */ + st->buf = newbuf; + st->buf_alloc = newsize; + } + + memcpy(st->buf + st->buf_len, contents, realsize); + st->buf_len += realsize; + st->buf[st->buf_len] = '\0'; + + /* Count new complete rows */ + for (size_t i = st->buf_len - realsize; i < st->buf_len; i++) + { + if (st->buf[i] == '\n') + { + st->rows_ready++; + st->batch_end = i + 1; + + if (st->rows_ready >= st->fetch_size) + { + /* Batch is ready — yield to consumer */ + mco_yield(st->co); + + /* + * Resumed: the consumer has consumed the batch and compacted + * the buffer. Continue receiving. + */ + return realsize; + } + } + } + + return realsize; +} + +/* + * Coroutine function: runs the entire curl_easy_perform, yielding + * whenever a batch is ready. + */ +static void +http_streaming_coro(mco_coro *co) +{ + ch_http_streaming_state *st = mco_get_user_data(co); + CURLcode errcode; + static char errbuffer[CURL_ERROR_SIZE]; + + errbuffer[0] = '\0'; + errcode = curl_easy_perform(st->conn->curl); + + if (errcode != CURLE_OK && errcode != CURLE_WRITE_ERROR) + { + if (!st->error) + st->error = strdup(st->errbuffer[0] ? + st->errbuffer : curl_easy_strerror(errcode)); + } + else if (errcode == CURLE_ABORTED_BY_CALLBACK) + { + st->http_status = 418; + } + + curl_easy_getinfo(st->conn->curl, CURLINFO_PRETRANSFER_TIME, + &st->pretransfer_time); + curl_easy_getinfo(st->conn->curl, CURLINFO_TOTAL_TIME, + &st->total_time); + curl_easy_getinfo(st->conn->curl, CURLINFO_RESPONSE_CODE, + &st->http_status); + + /* Mark any remaining partial data as the final batch */ + if (st->buf_len > st->batch_end) + st->batch_end = st->buf_len; + + st->transfer_done = true; + + /* The coroutine returns here; mco_status becomes MCO_DEAD. */ +} + +ch_http_streaming_state * +ch_http_begin_streaming(ch_http_connection_t * conn, const ch_query * query, + int fetch_size) +{ + CURLU *cu; + kv_iter iter; + char *buf = NULL; + int i; + mco_desc desc; + mco_result res; + + ch_http_streaming_state *st = calloc(sizeof(ch_http_streaming_state), 1); + + if (!st) + return NULL; + + st->conn = conn; + st->fetch_size = fetch_size; + + /* Initial buffer: 64 KB */ + st->buf_alloc = 64 * 1024; + st->buf = malloc(st->buf_alloc); + if (!st->buf) + { + free(st); + return NULL; + } + + /* Generate query ID */ + { + uuid_t id; + + uuid_generate(id); + uuid_unparse(id, st->query_id); + } + + assert(conn && conn->curl); + + /* Build URL (same logic as ch_http_simple_query) */ + cu = curl_url(); + curl_url_set(cu, CURLUPART_URL, conn->base_url, 0); + buf = psprintf("query_id=%s", st->query_id); + curl_url_set(cu, CURLUPART_QUERY, buf, CURLU_APPENDQUERY | CURLU_URLENCODE); + pfree(buf); + + for (iter = new_kv_iter(query->settings); !kv_iter_done(&iter); kv_iter_next(&iter)) + { + /* Skip settings that would break parsing and type conversion. */ + if ( + strcmp(iter.name, "date_time_output_format") == 0 || + strcmp(iter.name, "format_tsv_null_representation") == 0 || + strcmp(iter.name, "output_format_tsv_crlf_end_of_line") == 0 + ) + continue; + buf = psprintf("%s=%s", iter.name, iter.value); + curl_url_set(cu, CURLUPART_QUERY, buf, CURLU_APPENDQUERY | CURLU_URLENCODE); + pfree(buf); + } + + /* Always use ISO date format, \N for NULL, \n for EOL. */ + curl_url_set(cu, CURLUPART_QUERY, "date_time_output_format=iso", CURLU_APPENDQUERY | CURLU_URLENCODE); + curl_url_set(cu, CURLUPART_QUERY, "format_tsv_null_representation=\\N", CURLU_APPENDQUERY | CURLU_URLENCODE); + curl_url_set(cu, CURLUPART_QUERY, "output_format_tsv_crlf_end_of_line=0", CURLU_APPENDQUERY | CURLU_URLENCODE); + curl_url_get(cu, CURLUPART_URL, &st->url, 0); + curl_url_cleanup(cu); + + /* Configure easy handle */ + st->errbuffer[0] = '\0'; + curl_easy_reset(conn->curl); + curl_easy_setopt(conn->curl, CURLOPT_WRITEFUNCTION, write_data_streaming); + curl_easy_setopt(conn->curl, CURLOPT_WRITEDATA, st); + curl_easy_setopt(conn->curl, CURLOPT_ERRORBUFFER, st->errbuffer); + curl_easy_setopt(conn->curl, CURLOPT_PATH_AS_IS, 1L); + curl_easy_setopt(conn->curl, CURLOPT_URL, st->url); + curl_easy_setopt(conn->curl, CURLOPT_NOSIGNAL, 1L); + curl_easy_setopt(conn->curl, CURLOPT_VERBOSE, curl_verbose); + curl_easy_setopt(conn->curl, CURLOPT_NOPROGRESS, 1L); + + if (conn->dbname) + { + st->headers = curl_slist_append(NULL, + psprintf("%s: %s", DATABASE_HEADER, conn->dbname)); + curl_easy_setopt(conn->curl, CURLOPT_HTTPHEADER, st->headers); + } + + /* Set POST data */ + if (query->num_params == 0) + { + curl_easy_setopt(conn->curl, CURLOPT_POSTFIELDS, query->sql); + } + else + { + curl_mimepart *part; + + st->form = curl_mime_init(conn->curl); + part = curl_mime_addpart(st->form); + curl_mime_name(part, "query"); + curl_mime_data(part, query->sql, CURL_ZERO_TERMINATED); + for (i = 0; i < query->num_params; i++) + { + part = curl_mime_addpart(st->form); + curl_mime_name(part, psprintf("param_p%d", i + 1)); + curl_mime_data(part, query->param_values[i], CURL_ZERO_TERMINATED); + } + curl_easy_setopt(conn->curl, CURLOPT_MIMEPOST, st->form); + } + + /* Create coroutine */ + desc = mco_desc_init(http_streaming_coro, 1024 * 1024); + desc.user_data = st; + res = mco_create(&st->co, &desc); + if (res != MCO_SUCCESS) + { + st->error = strdup(mco_result_description(res)); + st->transfer_done = true; + return st; + } + + /* Resume to start the transfer and get the first batch */ + mco_resume(st->co); + + return st; +} + +/* + * Fetch the next batch of rows from the stream. + * + * Returns true if a batch is available (access via ch_http_streaming_batch_data). + * Returns false when there is no more data. + */ +bool +ch_http_fetch_batch(ch_http_streaming_state * st) +{ + size_t consumed; + size_t remaining; + + if (!st) + return false; + + /* + * Consume the previous batch: compact the buffer. + */ + consumed = st->batch_end; + st->total_bytes += consumed; + + remaining = st->buf_len - consumed; + if (remaining > 0) + memmove(st->buf, st->buf + consumed, remaining); + st->buf_len = remaining; + st->buf[st->buf_len] = '\0'; + st->batch_end = 0; + st->rows_ready = 0; + + /* If the coroutine is already done, return whatever remains */ + if (st->transfer_done || mco_status(st->co) == MCO_DEAD) + { + st->transfer_done = true; + if (st->buf_len > 0) + { + st->batch_end = st->buf_len; + return true; + } + return false; + } + + /* Resume the coroutine to continue receiving data */ + mco_resume(st->co); + + /* Check if we got a batch or reached EOF */ + if (st->batch_end > 0) + return true; + + if (st->transfer_done && st->buf_len > 0) + { + st->batch_end = st->buf_len; + return true; + } + + return false; +} + +char * +ch_http_streaming_error(ch_http_streaming_state * st) +{ + return st ? st->error : NULL; +} + +long +ch_http_streaming_status(ch_http_streaming_state * st) +{ + return st ? st->http_status : 0; +} + +double +ch_http_streaming_request_time(ch_http_streaming_state * st) +{ + return st ? st->pretransfer_time : 0; +} + +double +ch_http_streaming_total_time(ch_http_streaming_state * st) +{ + return st ? st->total_time : 0; +} + +char * +ch_http_streaming_batch_data(ch_http_streaming_state * st) +{ + return st ? st->buf : NULL; +} + +size_t +ch_http_streaming_batch_size(ch_http_streaming_state * st) +{ + return st ? st->batch_end : 0; +} + +void +ch_http_end_streaming(ch_http_streaming_state * st) +{ + if (!st) + return; + + /* + * Destroy the coroutine without resuming. This abandons + * curl_easy_perform mid-transfer, so reset the easy handle + * afterwards to ensure curl cleans up its internal state. + */ + if (st->co) + mco_destroy(st->co); + + curl_easy_reset(st->conn->curl); + + /* Clean up owned resources */ + if (st->url) + curl_free(st->url); + if (st->headers) + curl_slist_free_all(st->headers); + if (st->form) + curl_mime_free(st->form); + if (st->buf) + free(st->buf); + if (st->error) + free(st->error); + + free(st); +} diff --git a/src/include/binary.hh b/src/include/binary.hh index b856a7bb..bfa14119 100644 --- a/src/include/binary.hh +++ b/src/include/binary.hh @@ -74,6 +74,20 @@ extern "C" const ch_query * query, bool (*check_cancel) (void)); extern void ch_binary_response_free(ch_binary_response_t * resp); +/* Streaming binary API: background thread with bounded block queue */ + typedef struct ch_binary_streaming_state ch_binary_streaming_state; + + extern ch_binary_streaming_state * ch_binary_begin_streaming( + ch_binary_connection_t * conn, const ch_query * query, + bool (*check_cancel) (void)); + extern bool ch_binary_fetch_block(ch_binary_streaming_state * state); + extern bool ch_binary_streaming_read_row(ch_binary_streaming_state * state); + extern size_t ch_binary_streaming_columns(ch_binary_streaming_state * state); + extern Datum ch_binary_streaming_value(ch_binary_streaming_state * state, size_t col, + Oid * valtype, bool * is_null); + extern char * ch_binary_streaming_error(ch_binary_streaming_state * state); + extern void ch_binary_end_streaming(ch_binary_streaming_state * state); + /* reading */ void ch_binary_read_state_init(ch_binary_read_state_t * state, ch_binary_response_t * resp); void ch_binary_read_state_free(ch_binary_read_state_t * state); diff --git a/src/include/fdw.h b/src/include/fdw.h index 6adb9b8c..4ce4abd8 100644 --- a/src/include/fdw.h +++ b/src/include/fdw.h @@ -54,6 +54,11 @@ typedef struct ch_cursor double total_time; size_t columns_count; uintptr_t *conversion_states; /* for binary */ + + /* Streaming support */ + void *streaming_state; /* opaque: ch_http_streaming_state or + * ch_binary_streaming_state */ + bool is_streaming; } ch_cursor; typedef struct ChFdwScanRowContext @@ -69,6 +74,7 @@ typedef struct ChFdwScanRowContext typedef void (*disconnect_method) (void *conn); typedef void (*check_conn_method) (const char *password, UserMapping * user); typedef ch_cursor * (*simple_query_method) (void *conn, const ch_query * query); +typedef ch_cursor * (*streaming_query_method) (void *conn, const ch_query * query, int fetch_size); typedef void (*simple_insert_method) (void *conn, const ch_query * query); typedef Datum * (*cursor_fetch_row_method) (ChFdwScanRowContext * ctx); typedef void *(*prepare_insert_method) (void *conn, ResultRelInfo *, List *, @@ -79,6 +85,7 @@ typedef struct { disconnect_method disconnect; simple_query_method simple_query; + streaming_query_method streaming_query; cursor_fetch_row_method fetch_row; prepare_insert_method prepare_insert; insert_tuple_method insert_tuple; @@ -229,6 +236,7 @@ extern void chfdw_report_error(int elevel, ch_connection conn, /* in option.c */ extern kv_list * chfdw_get_session_settings(void); +extern int chfdw_get_max_result_size(void); extern void chfdw_extract_options(List * defelems, char **driver, char **host, int *port, diff --git a/src/include/http.h b/src/include/http.h index ff771c41..ce48f094 100644 --- a/src/include/http.h +++ b/src/include/http.h @@ -53,4 +53,19 @@ void ch_http_read_state_init(ch_http_read_state * state, char *data, size_t dat int ch_http_read_next(ch_http_read_state * state); void ch_http_response_free(ch_http_response_t * resp); +/* Streaming API: single HTTP request with pause/unpause batching */ +typedef struct ch_http_streaming_state ch_http_streaming_state; + +ch_http_streaming_state *ch_http_begin_streaming(ch_http_connection_t * conn, + const ch_query * query, + int fetch_size); +bool ch_http_fetch_batch(ch_http_streaming_state * state); +char *ch_http_streaming_error(ch_http_streaming_state * state); +long ch_http_streaming_status(ch_http_streaming_state * state); +double ch_http_streaming_request_time(ch_http_streaming_state * state); +double ch_http_streaming_total_time(ch_http_streaming_state * state); +char *ch_http_streaming_batch_data(ch_http_streaming_state * state); +size_t ch_http_streaming_batch_size(ch_http_streaming_state * state); +void ch_http_end_streaming(ch_http_streaming_state * state); + #endif /* CLICKHOUSE_HTTP_H */ diff --git a/src/minicoro.c b/src/minicoro.c new file mode 100644 index 00000000..68470c3a --- /dev/null +++ b/src/minicoro.c @@ -0,0 +1,6 @@ +/* + * minicoro.c + * Compile the minicoro implementation exactly once. + */ +#define MINICORO_IMPL +#include "minicoro.h" diff --git a/src/option.c b/src/option.c index a1e3c2cc..9239231e 100644 --- a/src/option.c +++ b/src/option.c @@ -70,6 +70,7 @@ static const ChFdwOption ch_options[] = */ static char *ch_session_settings = NULL; static kv_list * ch_session_settings_list = NULL; +static int ch_max_result_size = 0; /* MB, 0 = disabled */ /* * Helper functions @@ -149,6 +150,8 @@ InitChFdwOptions(void) {"table_name", ForeignTableRelationId, false}, {"engine", ForeignTableRelationId, false}, {"driver", ForeignServerRelationId, false}, + {"fetch_size", ForeignServerRelationId, false}, + {"fetch_size", ForeignTableRelationId, false}, {"aggregatefunction", AttributeRelationId, false}, {"simpleaggregatefunction", AttributeRelationId, false}, {"column_name", AttributeRelationId, false}, @@ -561,7 +564,25 @@ _PG_init(void) chfdw_settings_assign_hook, NULL); + DefineCustomIntVariable("pg_clickhouse.max_result_size", + "Maximum size (MB) for a ClickHouse response.", + "Aborts a query if the accumulated response data exceeds " + "this limit. 0 disables the check.", + &ch_max_result_size, + 0, /* default: disabled */ + 0, /* min */ + 65536, /* max: 64 GB */ + PGC_USERSET, + GUC_UNIT_MB, + NULL, NULL, NULL); + #if PG_VERSION_NUM >= 150000 MarkGUCPrefixReserved("pg_clickhouse"); #endif } + +int +chfdw_get_max_result_size(void) +{ + return ch_max_result_size; +} diff --git a/src/pglink.c b/src/pglink.c index 9693453e..254b5ca3 100644 --- a/src/pglink.c +++ b/src/pglink.c @@ -28,8 +28,10 @@ static bool initialized = false; static void http_disconnect(void *conn); static ch_cursor * http_simple_query(void *conn, const ch_query * query); +static ch_cursor * http_streaming_query(void *conn, const ch_query * query, int fetch_size); static void http_simple_insert(void *conn, const ch_query * query); static void http_cursor_free(void *); +static void http_streaming_cursor_free(void *); static Datum * http_fetch_row(ChFdwScanRowContext * ctx); static void *http_prepare_insert(void *, ResultRelInfo *, List *, const ch_query *, char *); static void http_insert_tuple(void *, TupleTableSlot *); @@ -38,15 +40,18 @@ static void char_to_datum(ChFdwScanRowContext * ctx, int attnum, char *data, siz static libclickhouse_methods http_methods = { .disconnect = http_disconnect, - .simple_query = http_simple_query, - .fetch_row = http_fetch_row, - .prepare_insert = http_prepare_insert, - .insert_tuple = http_insert_tuple + .simple_query = http_simple_query, + .streaming_query = http_streaming_query, + .fetch_row = http_fetch_row, + .prepare_insert = http_prepare_insert, + .insert_tuple = http_insert_tuple }; static void binary_disconnect(void *conn); static ch_cursor * binary_simple_query(void *conn, const ch_query * query); +static ch_cursor * binary_streaming_query(void *conn, const ch_query * query, int fetch_size); static void binary_cursor_free(void *cursor); +static void binary_streaming_cursor_free(void *cursor); /* static void binary_simple_insert(void *conn, const char *query); */ static Datum * binary_fetch_row(ChFdwScanRowContext * ctx); @@ -61,10 +66,11 @@ extern const char *ch_quote_ident(const char *rawstr); static libclickhouse_methods binary_methods = { .disconnect = binary_disconnect, - .simple_query = binary_simple_query, - .fetch_row = binary_fetch_row, - .prepare_insert = binary_prepare_insert, - .insert_tuple = binary_insert_tuple + .simple_query = binary_simple_query, + .streaming_query = binary_streaming_query, + .fetch_row = binary_fetch_row, + .prepare_insert = binary_prepare_insert, + .insert_tuple = binary_insert_tuple }; static int @@ -261,6 +267,94 @@ http_simple_query(void *conn, const ch_query * query) return cursor; } +static void +http_streaming_cursor_free(void *c) +{ + ch_cursor *cursor = c; + + if (cursor->streaming_state) + ch_http_end_streaming(cursor->streaming_state); +} + +/* + * Start a streaming query over HTTP. Returns a cursor that yields rows + * in bounded-memory batches of fetch_size rows each. + */ +static ch_cursor * +http_streaming_query(void *conn, const ch_query * query, int fetch_size) +{ + MemoryContext tempcxt, + oldcxt; + ch_cursor *cursor; + ch_http_streaming_state *sstate; + + ch_http_set_progress_func(http_progress_callback); + + sstate = ch_http_begin_streaming(conn, query, fetch_size); + if (sstate == NULL) + elog(ERROR, "out of memory"); + + { + char *err = ch_http_streaming_error(sstate); + + if (err != NULL) + { + char *errcopy = pstrdup(err); + + ch_http_end_streaming(sstate); + ereport(ERROR, + (errcode(ERRCODE_SQL_ROUTINE_EXCEPTION), + errmsg("pg_clickhouse: %s", format_error(errcopy)))); + } + } + + { + long status = ch_http_streaming_status(sstate); + + if (status != 0 && status != 200) + { + char *data = ch_http_streaming_batch_data(sstate); + char *errcopy = data ? pstrdup(data) : pstrdup("unknown error"); + + ch_http_end_streaming(sstate); + ereport(ERROR, ( + errcode(ERRCODE_SQL_ROUTINE_EXCEPTION), + errmsg("pg_clickhouse: %s", format_error(errcopy)), + status < 404 ? 0 : errdetail_internal("Remote Query: %.64000s", query->sql), + errcontext("HTTP status code: %li", status) + )); + } + } + + tempcxt = AllocSetContextCreate(PortalContext, "pg_clickhouse streaming cursor", + ALLOCSET_DEFAULT_SIZES); + oldcxt = MemoryContextSwitchTo(tempcxt); + + cursor = palloc0(sizeof(ch_cursor)); + cursor->streaming_state = sstate; + cursor->is_streaming = true; + cursor->query = pstrdup(query->sql); + cursor->request_time = ch_http_streaming_request_time(sstate) * 1000; + cursor->total_time = ch_http_streaming_total_time(sstate) * 1000; + + /* + * Initialize read_state for the first batch. The batch data pointer + * comes from the streaming state's rolling buffer. + */ + cursor->read_state = palloc0(sizeof(ch_http_read_state)); + ch_http_read_state_init(cursor->read_state, + ch_http_streaming_batch_data(sstate), + ch_http_streaming_batch_size(sstate)); + + cursor->memcxt = tempcxt; + cursor->callback.func = http_streaming_cursor_free; + cursor->callback.arg = cursor; + MemoryContextRegisterResetCallback(tempcxt, &cursor->callback); + MemoryContextSwitchTo(oldcxt); + + return cursor; +} + static void http_simple_insert(void *conn, const ch_query * query) { @@ -326,9 +420,24 @@ http_fetch_row(ChFdwScanRowContext * ctx) Datum *values; ch_http_read_state *state = cursor->read_state; - /* All rows or empty table. */ + /* All rows or empty table — for streaming, try next batch. */ if (state->done || state->data == NULL) - return NULL; + { + if (!cursor->is_streaming) + return NULL; + + /* Attempt to fetch the next batch from the stream. */ + if (!ch_http_fetch_batch(cursor->streaming_state)) + return NULL; + + /* Re-initialize read state with new batch data. */ + ch_http_read_state_init(state, + ch_http_streaming_batch_data(cursor->streaming_state), + ch_http_streaming_batch_size(cursor->streaming_state)); + + if (state->done || state->data == NULL) + return NULL; + } /* Special case: SELECT NULL. */ if (attcount == 0) @@ -689,6 +798,67 @@ binary_simple_query(void *conn, const ch_query * query) return cursor; } +static void +binary_streaming_cursor_free(void *c) +{ + ch_cursor *cursor = c; + + if (cursor->streaming_state) + ch_binary_end_streaming(cursor->streaming_state); +} + +/* + * Start a streaming query over the binary protocol. Returns a cursor + * that yields rows block-by-block via coroutine. + */ +static ch_cursor * +binary_streaming_query(void *conn, const ch_query * query, int fetch_size) +{ + MemoryContext tempcxt, + oldcxt; + ch_cursor *cursor; + ch_binary_streaming_state *sstate; + + sstate = ch_binary_begin_streaming(conn, query, &is_canceled); + if (sstate == NULL) + elog(ERROR, "out of memory"); + + { + char *err = ch_binary_streaming_error(sstate); + + if (err != NULL) + { + char *errcopy = pstrdup(err); + + ch_binary_end_streaming(sstate); + ereport(ERROR, ( + errcode(ERRCODE_SQL_ROUTINE_EXCEPTION), + errmsg("pg_clickhouse: %s", errcopy), + errdetail_internal("Remote Query: %.64000s", query->sql) + )); + } + } + + tempcxt = AllocSetContextCreate(PortalContext, "pg_clickhouse streaming cursor", + ALLOCSET_DEFAULT_SIZES); + + oldcxt = MemoryContextSwitchTo(tempcxt); + cursor = palloc0(sizeof(ch_cursor)); + cursor->streaming_state = sstate; + cursor->is_streaming = true; + cursor->query = pstrdup(query->sql); + cursor->columns_count = ch_binary_streaming_columns(sstate); + cursor->conversion_states = palloc0(sizeof(uintptr_t) * Max(cursor->columns_count, 1)); + + cursor->memcxt = tempcxt; + cursor->callback.func = binary_streaming_cursor_free; + cursor->callback.arg = cursor; + MemoryContextRegisterResetCallback(tempcxt, &cursor->callback); + MemoryContextSwitchTo(oldcxt); + + return cursor; +} + /* * Fetch a row from the binary cursor and return its values. * @@ -704,6 +874,92 @@ binary_simple_query(void *conn, const ch_query * query) * text `Datum`s. This is the use case for `chfdw_construct_create_tables()`, * which only cares about text. */ +/* + * Streaming variant of binary_fetch_row. Reads rows from the coroutine- + * backed streaming state, fetching the next block when the current one + * is exhausted. + */ +static Datum * +binary_streaming_fetch_row(ChFdwScanRowContext * ctx) +{ + ListCell *lc; + ch_cursor *cursor = ctx->cursor; + ch_binary_streaming_state *sstate = cursor->streaming_state; + List *attrs = ctx->retrieved_attrs; + TupleDesc tupdesc = ctx->tupdesc; + Datum *values = ctx->values; + bool *nulls = ctx->nulls; + size_t attcount = list_length(attrs); + + /* Try to read a row; if block exhausted, fetch next block. */ + while (!ch_binary_streaming_read_row(sstate)) + { + char *err = ch_binary_streaming_error(sstate); + + if (err != NULL) + ereport(ERROR, + (errcode(ERRCODE_SQL_ROUTINE_EXCEPTION), + errmsg("pg_clickhouse: error while reading row: %s", err))); + + if (!ch_binary_fetch_block(sstate)) + return NULL; /* no more blocks — EOF */ + } + + if (tupdesc) + { + size_t j = 0; + + Assert(values && nulls); + foreach(lc, attrs) + { + int i = lfirst_int(lc); + Oid valtype; + bool isnull; + Datum val; + intptr_t convstate; + + val = ch_binary_streaming_value(sstate, j, &valtype, &isnull); + + if (isnull) + values[i - 1] = (Datum) 0; + else + { + again_s: + convstate = cursor->conversion_states[j]; + switch (convstate) + { + case 0: + { + MemoryContext old_mcxt; + Oid outtype = TupleDescAttr(tupdesc, i - 1)->atttypid; + void *s; + + old_mcxt = MemoryContextSwitchTo(cursor->memcxt); + s = ch_binary_init_convert_state(val, valtype, outtype); + MemoryContextSwitchTo(old_mcxt); + + if (s == NULL) + cursor->conversion_states[j] = 1; + else + cursor->conversion_states[j] = (uintptr_t) s; + goto again_s; + } + case 1: + values[i - 1] = val; + break; + default: + values[i - 1] = ch_binary_convert_datum((void *) convstate, val); + } + } + + nulls[i - 1] = isnull; + j++; + } + } + + return values; +} + static Datum * binary_fetch_row(ChFdwScanRowContext * ctx) { @@ -713,10 +969,17 @@ binary_fetch_row(ChFdwScanRowContext * ctx) TupleDesc tupdesc = ctx->tupdesc; Datum *values = ctx->values; bool *nulls = ctx->nulls; - ch_binary_read_state_t *state = cursor->read_state; - bool have_data = ch_binary_read_row(state); + ch_binary_read_state_t *state; + bool have_data; size_t attcount = list_length(attrs); + /* Streaming cursors use a separate code path. */ + if (cursor->is_streaming) + return binary_streaming_fetch_row(ctx); + + state = cursor->read_state; + have_data = ch_binary_read_row(state); + if (state->error) ereport(ERROR, (errcode(ERRCODE_SQL_ROUTINE_EXCEPTION), diff --git a/vendor/minicoro b/vendor/minicoro new file mode 160000 index 00000000..02dad0f8 --- /dev/null +++ b/vendor/minicoro @@ -0,0 +1 @@ +Subproject commit 02dad0f8b7cbb12fe6e216ae7a76db15ca55cd7b From f0b878852d238857bfd9c7934b05ffe15870e436 Mon Sep 17 00:00:00 2001 From: Kaushik Iska Date: Wed, 1 Apr 2026 10:12:06 -0500 Subject: [PATCH 02/13] Fix stale comments referencing old streaming design The streaming API comments still referenced the earlier design (background threads for binary, curl pause/unpause for HTTP). Update them to reflect the actual minicoro coroutine implementation. --- src/include/binary.hh | 2 +- src/include/http.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/include/binary.hh b/src/include/binary.hh index bfa14119..4eab9bf5 100644 --- a/src/include/binary.hh +++ b/src/include/binary.hh @@ -74,7 +74,7 @@ extern "C" const ch_query * query, bool (*check_cancel) (void)); extern void ch_binary_response_free(ch_binary_response_t * resp); -/* Streaming binary API: background thread with bounded block queue */ +/* Streaming binary API: coroutine-based block-at-a-time iteration */ typedef struct ch_binary_streaming_state ch_binary_streaming_state; extern ch_binary_streaming_state * ch_binary_begin_streaming( diff --git a/src/include/http.h b/src/include/http.h index ce48f094..fd91a47f 100644 --- a/src/include/http.h +++ b/src/include/http.h @@ -53,7 +53,7 @@ void ch_http_read_state_init(ch_http_read_state * state, char *data, size_t dat int ch_http_read_next(ch_http_read_state * state); void ch_http_response_free(ch_http_response_t * resp); -/* Streaming API: single HTTP request with pause/unpause batching */ +/* Streaming API: single HTTP request with coroutine-based batching */ typedef struct ch_http_streaming_state ch_http_streaming_state; ch_http_streaming_state *ch_http_begin_streaming(ch_http_connection_t * conn, From b452daba3822576d2aecec360148425c355a5171 Mon Sep 17 00:00:00 2001 From: Kaushik Iska Date: Wed, 1 Apr 2026 10:16:47 -0500 Subject: [PATCH 03/13] Fix CI: remove unused variable, apply pgindent formatting Remove unused local `errbuffer` in `http_streaming_coro` that caused -Werror=unused-but-set-variable on GCC. Apply pg_bsd_indent formatting to all files modified by the streaming PR. --- src/fdw.c.in | 4 ++-- src/http.c | 14 ++++++-------- src/include/binary.hh | 8 ++++---- src/option.c | 6 +++--- src/pglink.c | 20 ++++++++++---------- 5 files changed, 25 insertions(+), 27 deletions(-) diff --git a/src/fdw.c.in b/src/fdw.c.in index 569ec835..f0b6cd1e 100644 --- a/src/fdw.c.in +++ b/src/fdw.c.in @@ -974,10 +974,10 @@ clickhouseIterateForeignScan(ForeignScanState * node) if (fsstate->fetch_size > 0 && fsstate->conn.methods->streaming_query != NULL) fsstate->ch_cursor = fsstate->conn.methods->streaming_query( - fsstate->conn.conn, &query, fsstate->fetch_size); + fsstate->conn.conn, &query, fsstate->fetch_size); else fsstate->ch_cursor = fsstate->conn.methods->simple_query( - fsstate->conn.conn, &query); + fsstate->conn.conn, &query); time_used += fsstate->ch_cursor->request_time; MemoryContextSwitchTo(old); diff --git a/src/http.c b/src/http.c index 5121c778..f9c2d982 100644 --- a/src/http.c +++ b/src/http.c @@ -450,13 +450,11 @@ write_data_streaming(void *contents, size_t size, size_t nmemb, void *userp) * whenever a batch is ready. */ static void -http_streaming_coro(mco_coro *co) +http_streaming_coro(mco_coro * co) { ch_http_streaming_state *st = mco_get_user_data(co); CURLcode errcode; - static char errbuffer[CURL_ERROR_SIZE]; - errbuffer[0] = '\0'; errcode = curl_easy_perform(st->conn->curl); if (errcode != CURLE_OK && errcode != CURLE_WRITE_ERROR) @@ -667,7 +665,7 @@ ch_http_fetch_batch(ch_http_streaming_state * st) return false; } -char * +char * ch_http_streaming_error(ch_http_streaming_state * st) { return st ? st->error : NULL; @@ -691,7 +689,7 @@ ch_http_streaming_total_time(ch_http_streaming_state * st) return st ? st->total_time : 0; } -char * +char * ch_http_streaming_batch_data(ch_http_streaming_state * st) { return st ? st->buf : NULL; @@ -710,9 +708,9 @@ ch_http_end_streaming(ch_http_streaming_state * st) return; /* - * Destroy the coroutine without resuming. This abandons - * curl_easy_perform mid-transfer, so reset the easy handle - * afterwards to ensure curl cleans up its internal state. + * Destroy the coroutine without resuming. This abandons curl_easy_perform + * mid-transfer, so reset the easy handle afterwards to ensure curl cleans + * up its internal state. */ if (st->co) mco_destroy(st->co); diff --git a/src/include/binary.hh b/src/include/binary.hh index 4eab9bf5..de91ff79 100644 --- a/src/include/binary.hh +++ b/src/include/binary.hh @@ -78,14 +78,14 @@ extern "C" typedef struct ch_binary_streaming_state ch_binary_streaming_state; extern ch_binary_streaming_state * ch_binary_begin_streaming( - ch_binary_connection_t * conn, const ch_query * query, - bool (*check_cancel) (void)); + ch_binary_connection_t * conn, const ch_query * query, + bool (*check_cancel) (void)); extern bool ch_binary_fetch_block(ch_binary_streaming_state * state); extern bool ch_binary_streaming_read_row(ch_binary_streaming_state * state); extern size_t ch_binary_streaming_columns(ch_binary_streaming_state * state); extern Datum ch_binary_streaming_value(ch_binary_streaming_state * state, size_t col, - Oid * valtype, bool * is_null); - extern char * ch_binary_streaming_error(ch_binary_streaming_state * state); + Oid * valtype, bool *is_null); + extern char *ch_binary_streaming_error(ch_binary_streaming_state * state); extern void ch_binary_end_streaming(ch_binary_streaming_state * state); /* reading */ diff --git a/src/option.c b/src/option.c index 9239231e..2c28bec8 100644 --- a/src/option.c +++ b/src/option.c @@ -70,7 +70,7 @@ static const ChFdwOption ch_options[] = */ static char *ch_session_settings = NULL; static kv_list * ch_session_settings_list = NULL; -static int ch_max_result_size = 0; /* MB, 0 = disabled */ +static int ch_max_result_size = 0; /* MB, 0 = disabled */ /* * Helper functions @@ -569,8 +569,8 @@ _PG_init(void) "Aborts a query if the accumulated response data exceeds " "this limit. 0 disables the check.", &ch_max_result_size, - 0, /* default: disabled */ - 0, /* min */ + 0, /* default: disabled */ + 0, /* min */ 65536, /* max: 64 GB */ PGC_USERSET, GUC_UNIT_MB, diff --git a/src/pglink.c b/src/pglink.c index 254b5ca3..135e5d98 100644 --- a/src/pglink.c +++ b/src/pglink.c @@ -40,11 +40,11 @@ static void char_to_datum(ChFdwScanRowContext * ctx, int attnum, char *data, siz static libclickhouse_methods http_methods = { .disconnect = http_disconnect, - .simple_query = http_simple_query, - .streaming_query = http_streaming_query, - .fetch_row = http_fetch_row, - .prepare_insert = http_prepare_insert, - .insert_tuple = http_insert_tuple + .simple_query = http_simple_query, + .streaming_query = http_streaming_query, + .fetch_row = http_fetch_row, + .prepare_insert = http_prepare_insert, + .insert_tuple = http_insert_tuple }; static void binary_disconnect(void *conn); @@ -66,11 +66,11 @@ extern const char *ch_quote_ident(const char *rawstr); static libclickhouse_methods binary_methods = { .disconnect = binary_disconnect, - .simple_query = binary_simple_query, - .streaming_query = binary_streaming_query, - .fetch_row = binary_fetch_row, - .prepare_insert = binary_prepare_insert, - .insert_tuple = binary_insert_tuple + .simple_query = binary_simple_query, + .streaming_query = binary_streaming_query, + .fetch_row = binary_fetch_row, + .prepare_insert = binary_prepare_insert, + .insert_tuple = binary_insert_tuple }; static int From f6442fb47868f20cd61d86fe20f34a9d31ed7133 Mon Sep 17 00:00:00 2001 From: Kaushik Iska Date: Wed, 1 Apr 2026 10:21:23 -0500 Subject: [PATCH 04/13] Remove unused variable in binary_streaming_fetch_row The attcount variable was declared but never referenced, causing -Werror=unused-variable failures on GCC in CI. --- src/pglink.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/pglink.c b/src/pglink.c index 135e5d98..17eeb7fd 100644 --- a/src/pglink.c +++ b/src/pglink.c @@ -889,7 +889,6 @@ binary_streaming_fetch_row(ChFdwScanRowContext * ctx) TupleDesc tupdesc = ctx->tupdesc; Datum *values = ctx->values; bool *nulls = ctx->nulls; - size_t attcount = list_length(attrs); /* Try to read a row; if block exhausted, fetch next block. */ while (!ch_binary_streaming_read_row(sstate)) From 7ce8bdb23857ba33e04c01b4e0f9d4576febd317 Mon Sep 17 00:00:00 2001 From: Kaushik Iska Date: Wed, 1 Apr 2026 10:30:38 -0500 Subject: [PATCH 05/13] Address review feedback on streaming scan PR - Remove confusing comment above the streaming re-fetch branch in http_fetch_row (the control flow speaks for itself) - Remove stray docstring that got wedged between binary_fetch_row's comment block and binary_streaming_fetch_row - Extract binary_convert_column helper to deduplicate the conversion state logic shared by binary_fetch_row and binary_streaming_fetch_row - Extract binary_cursor_create helper to share cursor allocation boilerplate between binary_simple_query and binary_streaming_query --- src/pglink.c | 244 ++++++++++++++++++++++++--------------------------- 1 file changed, 113 insertions(+), 131 deletions(-) diff --git a/src/pglink.c b/src/pglink.c index 17eeb7fd..740523f7 100644 --- a/src/pglink.c +++ b/src/pglink.c @@ -420,17 +420,14 @@ http_fetch_row(ChFdwScanRowContext * ctx) Datum *values; ch_http_read_state *state = cursor->read_state; - /* All rows or empty table — for streaming, try next batch. */ if (state->done || state->data == NULL) { if (!cursor->is_streaming) return NULL; - /* Attempt to fetch the next batch from the stream. */ if (!ch_http_fetch_batch(cursor->streaming_state)) return NULL; - /* Re-initialize read state with new batch data. */ ch_http_read_state_init(state, ch_http_streaming_batch_data(cursor->streaming_state), ch_http_streaming_batch_size(cursor->streaming_state)); @@ -746,12 +743,39 @@ binary_disconnect(void *conn) ch_binary_close((ch_binary_connection_t *) conn); } +/* + * Allocate a binary cursor in its own memory context with a cleanup callback. + */ static ch_cursor * -binary_simple_query(void *conn, const ch_query * query) +binary_cursor_create(const char *sql, size_t columns_count, + MemoryContextCallbackFunction cleanup_func) { MemoryContext tempcxt, oldcxt; ch_cursor *cursor; + + tempcxt = AllocSetContextCreate(PortalContext, "pg_clickhouse cursor", + ALLOCSET_DEFAULT_SIZES); + + oldcxt = MemoryContextSwitchTo(tempcxt); + cursor = palloc0(sizeof(ch_cursor)); + cursor->query = pstrdup(sql); + cursor->columns_count = columns_count; + cursor->conversion_states = palloc0(sizeof(uintptr_t) * Max(columns_count, 1)); + + cursor->memcxt = tempcxt; + cursor->callback.func = cleanup_func; + cursor->callback.arg = cursor; + MemoryContextRegisterResetCallback(tempcxt, &cursor->callback); + MemoryContextSwitchTo(oldcxt); + + return cursor; +} + +static ch_cursor * +binary_simple_query(void *conn, const ch_query * query) +{ + ch_cursor *cursor; ch_binary_read_state_t *state; ch_binary_response_t *resp = ch_binary_simple_query(conn, query, &is_canceled); @@ -768,24 +792,18 @@ binary_simple_query(void *conn, const ch_query * query) )); } - tempcxt = AllocSetContextCreate(PortalContext, "pg_clickhouse cursor", - ALLOCSET_DEFAULT_SIZES); + cursor = binary_cursor_create(query->sql, resp->columns_count, + binary_cursor_free); - oldcxt = MemoryContextSwitchTo(tempcxt); - cursor = palloc0(sizeof(ch_cursor)); - cursor->query_response = resp; - state = (ch_binary_read_state_t *) palloc0(sizeof(ch_binary_read_state_t)); - cursor->query = pstrdup(query->sql); - cursor->read_state = state; - cursor->columns_count = resp->columns_count; - ch_binary_read_state_init(cursor->read_state, resp); - cursor->conversion_states = palloc0(sizeof(uintptr_t) * cursor->columns_count); + { + MemoryContext oldcxt = MemoryContextSwitchTo(cursor->memcxt); - cursor->memcxt = tempcxt; - cursor->callback.func = binary_cursor_free; - cursor->callback.arg = cursor; - MemoryContextRegisterResetCallback(tempcxt, &cursor->callback); - MemoryContextSwitchTo(oldcxt); + cursor->query_response = resp; + state = (ch_binary_read_state_t *) palloc0(sizeof(ch_binary_read_state_t)); + cursor->read_state = state; + ch_binary_read_state_init(state, resp); + MemoryContextSwitchTo(oldcxt); + } if (state->error) { @@ -814,9 +832,6 @@ binary_streaming_cursor_free(void *c) static ch_cursor * binary_streaming_query(void *conn, const ch_query * query, int fetch_size) { - MemoryContext tempcxt, - oldcxt; - ch_cursor *cursor; ch_binary_streaming_state *sstate; sstate = ch_binary_begin_streaming(conn, query, &is_canceled); @@ -839,41 +854,63 @@ binary_streaming_query(void *conn, const ch_query * query, int fetch_size) } } - tempcxt = AllocSetContextCreate(PortalContext, "pg_clickhouse streaming cursor", - ALLOCSET_DEFAULT_SIZES); + { + ch_cursor *cursor; + + cursor = binary_cursor_create(query->sql, + ch_binary_streaming_columns(sstate), + binary_streaming_cursor_free); + cursor->streaming_state = sstate; + cursor->is_streaming = true; + return cursor; + } +} - oldcxt = MemoryContextSwitchTo(tempcxt); - cursor = palloc0(sizeof(ch_cursor)); - cursor->streaming_state = sstate; - cursor->is_streaming = true; - cursor->query = pstrdup(query->sql); - cursor->columns_count = ch_binary_streaming_columns(sstate); - cursor->conversion_states = palloc0(sizeof(uintptr_t) * Max(cursor->columns_count, 1)); +/* + * Convert a single binary column value, initializing the conversion state + * on first use. Shared by both the buffered and streaming fetch_row paths. + */ +static Datum +binary_convert_column(ch_cursor * cursor, TupleDesc tupdesc, + int attindex, size_t colindex, + Datum val, Oid valtype, bool isnull) +{ + intptr_t convstate; - cursor->memcxt = tempcxt; - cursor->callback.func = binary_streaming_cursor_free; - cursor->callback.arg = cursor; - MemoryContextRegisterResetCallback(tempcxt, &cursor->callback); - MemoryContextSwitchTo(oldcxt); + if (isnull) + return (Datum) 0; - return cursor; +retry: + convstate = cursor->conversion_states[colindex]; + switch (convstate) + { + case 0: + { + MemoryContext old_mcxt; + Oid outtype = TupleDescAttr(tupdesc, attindex)->atttypid; + void *s; + + /* + * Conversion states must outlive the per-tuple memory + * context, so allocate them in the cursor's memory context. + */ + old_mcxt = MemoryContextSwitchTo(cursor->memcxt); + s = ch_binary_init_convert_state(val, valtype, outtype); + MemoryContextSwitchTo(old_mcxt); + + if (s == NULL) + cursor->conversion_states[colindex] = 1; + else + cursor->conversion_states[colindex] = (uintptr_t) s; + goto retry; + } + case 1: + return val; + default: + return ch_binary_convert_datum((void *) convstate, val); + } } -/* - * Fetch a row from the binary cursor and return its values. - * - * If ctx->tupdesc is set, ctx->attinmeta must also be set, and ctx->values - * and ctx->nulls must already be palloc'd with space for ctx->tupdesc->natts - * values. - * - * Use ctx->tupdesc and ctx->attinmeta to convert the values to the - * appropriate Datums, and store them and the indication of their NULLness in - * ctx->values and ctx->nulls, respectively, then return ctx->values. - * - * If ctx->tupdesc is not set, treat all values as text and return them as - * text `Datum`s. This is the use case for `chfdw_construct_create_tables()`, - * which only cares about text. - */ /* * Streaming variant of binary_fetch_row. Reads rows from the coroutine- * backed streaming state, fetching the next block when the current one @@ -915,42 +952,11 @@ binary_streaming_fetch_row(ChFdwScanRowContext * ctx) Oid valtype; bool isnull; Datum val; - intptr_t convstate; val = ch_binary_streaming_value(sstate, j, &valtype, &isnull); - - if (isnull) - values[i - 1] = (Datum) 0; - else - { - again_s: - convstate = cursor->conversion_states[j]; - switch (convstate) - { - case 0: - { - MemoryContext old_mcxt; - Oid outtype = TupleDescAttr(tupdesc, i - 1)->atttypid; - void *s; - - old_mcxt = MemoryContextSwitchTo(cursor->memcxt); - s = ch_binary_init_convert_state(val, valtype, outtype); - MemoryContextSwitchTo(old_mcxt); - - if (s == NULL) - cursor->conversion_states[j] = 1; - else - cursor->conversion_states[j] = (uintptr_t) s; - goto again_s; - } - case 1: - values[i - 1] = val; - break; - default: - values[i - 1] = ch_binary_convert_datum((void *) convstate, val); - } - } - + values[i - 1] = binary_convert_column(cursor, tupdesc, + i - 1, j, + val, valtype, isnull); nulls[i - 1] = isnull; j++; } @@ -959,6 +965,21 @@ binary_streaming_fetch_row(ChFdwScanRowContext * ctx) return values; } +/* + * Fetch a row from the binary cursor and return its values. + * + * If ctx->tupdesc is set, ctx->attinmeta must also be set, and ctx->values + * and ctx->nulls must already be palloc'd with space for ctx->tupdesc->natts + * values. + * + * Use ctx->tupdesc and ctx->attinmeta to convert the values to the + * appropriate Datums, and store them and the indication of their NULLness in + * ctx->values and ctx->nulls, respectively, then return ctx->values. + * + * If ctx->tupdesc is not set, treat all values as text and return them as + * text `Datum`s. This is the use case for `chfdw_construct_create_tables()`, + * which only cares about text. + */ static Datum * binary_fetch_row(ChFdwScanRowContext * ctx) { @@ -1019,51 +1040,12 @@ binary_fetch_row(ChFdwScanRowContext * ctx) { int i = lfirst_int(lc); bool isnull = state->nulls[j]; - intptr_t convstate; - - - if (isnull) - values[i - 1] = (Datum) 0; - else - { - again: - convstate = cursor->conversion_states[j]; - switch (convstate) - { - case 0: - { - MemoryContext old_mcxt; - - Oid outtype = TupleDescAttr(tupdesc, i - 1)->atttypid; - void *s; - - /* - * now we're should be in temporary memory - * context, so make sure conversion states outlive - * it. - */ - old_mcxt = MemoryContextSwitchTo(cursor->memcxt); - s = ch_binary_init_convert_state(state->values[j], - state->coltypes[j], outtype); - MemoryContextSwitchTo(old_mcxt); - - if (s == NULL) - /* no conversion but state is initialized */ - cursor->conversion_states[j] = 1; - else - cursor->conversion_states[j] = (uintptr_t) s; - goto again; - } - case 1: - /* no conversion */ - values[i - 1] = state->values[j]; - break; - default: - values[i - 1] = ch_binary_convert_datum((void *) convstate, - state->values[j]); - } - } + values[i - 1] = binary_convert_column(cursor, tupdesc, + i - 1, j, + state->values[j], + state->coltypes[j], + isnull); nulls[i - 1] = isnull; j++; } From 698c470102e03fcbd9279a7448f56880a7b7114c Mon Sep 17 00:00:00 2001 From: Kaushik Iska Date: Wed, 1 Apr 2026 10:35:03 -0500 Subject: [PATCH 06/13] Extract shared cursor_create helper for HTTP and binary paths Generalize binary_cursor_create into cursor_create and use it from all four query functions (http_simple_query, http_streaming_query, binary_simple_query, binary_streaming_query). This removes the duplicated AllocSetContextCreate / palloc0 / MemoryContextRegister boilerplate from each caller. --- src/pglink.c | 131 +++++++++++++++++++++------------------------------ 1 file changed, 55 insertions(+), 76 deletions(-) diff --git a/src/pglink.c b/src/pglink.c index 740523f7..553fccf0 100644 --- a/src/pglink.c +++ b/src/pglink.c @@ -188,13 +188,42 @@ kill_query(void *conn, const char *query_id) ch_http_response_free(resp); } +/* + * Allocate a cursor in its own memory context with a cleanup callback. + * Used by both HTTP and binary query paths. + */ static ch_cursor * -http_simple_query(void *conn, const ch_query * query) +cursor_create(const char *sql, size_t columns_count, + MemoryContextCallbackFunction cleanup_func) { - int attempts = 0; MemoryContext tempcxt, oldcxt; ch_cursor *cursor; + + tempcxt = AllocSetContextCreate(PortalContext, "pg_clickhouse cursor", + ALLOCSET_DEFAULT_SIZES); + + oldcxt = MemoryContextSwitchTo(tempcxt); + cursor = palloc0(sizeof(ch_cursor)); + cursor->query = pstrdup(sql); + cursor->columns_count = columns_count; + if (columns_count > 0) + cursor->conversion_states = palloc0(sizeof(uintptr_t) * columns_count); + + cursor->memcxt = tempcxt; + cursor->callback.func = cleanup_func; + cursor->callback.arg = cursor; + MemoryContextRegisterResetCallback(tempcxt, &cursor->callback); + MemoryContextSwitchTo(oldcxt); + + return cursor; +} + +static ch_cursor * +http_simple_query(void *conn, const ch_query * query) +{ + int attempts = 0; + ch_cursor *cursor; ch_http_response_t *resp; ch_http_set_progress_func(http_progress_callback); @@ -242,27 +271,18 @@ http_simple_query(void *conn, const ch_query * query) )); } - /* - * we could not control properly deallocation of libclickhouse memory, so - * we use memory context callbacks for that - */ - tempcxt = AllocSetContextCreate(PortalContext, "pg_clickhouse cursor", - ALLOCSET_DEFAULT_SIZES); - oldcxt = MemoryContextSwitchTo(tempcxt); + cursor = cursor_create(query->sql, 0, http_cursor_free); - cursor = palloc0(sizeof(ch_cursor)); - cursor->query_response = resp; - cursor->read_state = palloc0(sizeof(ch_http_read_state)); - cursor->query = pstrdup(query->sql); - cursor->request_time = resp->pretransfer_time * 1000; - cursor->total_time = resp->total_time * 1000; - ch_http_read_state_init(cursor->read_state, resp->data, resp->datasize); + { + MemoryContext oldcxt = MemoryContextSwitchTo(cursor->memcxt); - cursor->memcxt = tempcxt; - cursor->callback.func = http_cursor_free; - cursor->callback.arg = cursor; - MemoryContextRegisterResetCallback(tempcxt, &cursor->callback); - MemoryContextSwitchTo(oldcxt); + cursor->query_response = resp; + cursor->read_state = palloc0(sizeof(ch_http_read_state)); + cursor->request_time = resp->pretransfer_time * 1000; + cursor->total_time = resp->total_time * 1000; + ch_http_read_state_init(cursor->read_state, resp->data, resp->datasize); + MemoryContextSwitchTo(oldcxt); + } return cursor; } @@ -283,8 +303,6 @@ http_streaming_cursor_free(void *c) static ch_cursor * http_streaming_query(void *conn, const ch_query * query, int fetch_size) { - MemoryContext tempcxt, - oldcxt; ch_cursor *cursor; ch_http_streaming_state *sstate; @@ -326,31 +344,21 @@ http_streaming_query(void *conn, const ch_query * query, int fetch_size) } } - tempcxt = AllocSetContextCreate(PortalContext, "pg_clickhouse streaming cursor", - ALLOCSET_DEFAULT_SIZES); - oldcxt = MemoryContextSwitchTo(tempcxt); - - cursor = palloc0(sizeof(ch_cursor)); + cursor = cursor_create(query->sql, 0, http_streaming_cursor_free); cursor->streaming_state = sstate; cursor->is_streaming = true; - cursor->query = pstrdup(query->sql); cursor->request_time = ch_http_streaming_request_time(sstate) * 1000; cursor->total_time = ch_http_streaming_total_time(sstate) * 1000; - /* - * Initialize read_state for the first batch. The batch data pointer - * comes from the streaming state's rolling buffer. - */ - cursor->read_state = palloc0(sizeof(ch_http_read_state)); - ch_http_read_state_init(cursor->read_state, - ch_http_streaming_batch_data(sstate), - ch_http_streaming_batch_size(sstate)); + { + MemoryContext oldcxt = MemoryContextSwitchTo(cursor->memcxt); - cursor->memcxt = tempcxt; - cursor->callback.func = http_streaming_cursor_free; - cursor->callback.arg = cursor; - MemoryContextRegisterResetCallback(tempcxt, &cursor->callback); - MemoryContextSwitchTo(oldcxt); + cursor->read_state = palloc0(sizeof(ch_http_read_state)); + ch_http_read_state_init(cursor->read_state, + ch_http_streaming_batch_data(sstate), + ch_http_streaming_batch_size(sstate)); + MemoryContextSwitchTo(oldcxt); + } return cursor; } @@ -743,35 +751,6 @@ binary_disconnect(void *conn) ch_binary_close((ch_binary_connection_t *) conn); } -/* - * Allocate a binary cursor in its own memory context with a cleanup callback. - */ -static ch_cursor * -binary_cursor_create(const char *sql, size_t columns_count, - MemoryContextCallbackFunction cleanup_func) -{ - MemoryContext tempcxt, - oldcxt; - ch_cursor *cursor; - - tempcxt = AllocSetContextCreate(PortalContext, "pg_clickhouse cursor", - ALLOCSET_DEFAULT_SIZES); - - oldcxt = MemoryContextSwitchTo(tempcxt); - cursor = palloc0(sizeof(ch_cursor)); - cursor->query = pstrdup(sql); - cursor->columns_count = columns_count; - cursor->conversion_states = palloc0(sizeof(uintptr_t) * Max(columns_count, 1)); - - cursor->memcxt = tempcxt; - cursor->callback.func = cleanup_func; - cursor->callback.arg = cursor; - MemoryContextRegisterResetCallback(tempcxt, &cursor->callback); - MemoryContextSwitchTo(oldcxt); - - return cursor; -} - static ch_cursor * binary_simple_query(void *conn, const ch_query * query) { @@ -792,8 +771,8 @@ binary_simple_query(void *conn, const ch_query * query) )); } - cursor = binary_cursor_create(query->sql, resp->columns_count, - binary_cursor_free); + cursor = cursor_create(query->sql, resp->columns_count, + binary_cursor_free); { MemoryContext oldcxt = MemoryContextSwitchTo(cursor->memcxt); @@ -857,9 +836,9 @@ binary_streaming_query(void *conn, const ch_query * query, int fetch_size) { ch_cursor *cursor; - cursor = binary_cursor_create(query->sql, - ch_binary_streaming_columns(sstate), - binary_streaming_cursor_free); + cursor = cursor_create(query->sql, + ch_binary_streaming_columns(sstate), + binary_streaming_cursor_free); cursor->streaming_state = sstate; cursor->is_streaming = true; return cursor; From 1a75e36fa4017e060fef9903bc70c4b6ef711812 Mon Sep 17 00:00:00 2001 From: Kaushik Iska Date: Wed, 1 Apr 2026 10:41:27 -0500 Subject: [PATCH 07/13] Remove unnecessary aside from streaming HTTP comment --- src/http.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/http.c b/src/http.c index f9c2d982..bd84ba0d 100644 --- a/src/http.c +++ b/src/http.c @@ -351,7 +351,7 @@ ch_http_response_free(ch_http_response_t * resp) * Uses minicoro to run curl_easy_perform inside a coroutine. * The write callback counts complete TSV rows; when a batch is * ready it yields back to the caller. Resuming the coroutine - * continues the transfer. No threads, no curl_multi. + * continues the transfer. * ============================================================ */ From 3900f18428e0be508562fda925adfd7912fb5084 Mon Sep 17 00:00:00 2001 From: "David E. Wheeler" Date: Wed, 1 Apr 2026 13:27:37 -0400 Subject: [PATCH 08/13] Teach Makefile to automatically clone minicoro --- Makefile | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/Makefile b/Makefile index ba317a5e..400f99c1 100644 --- a/Makefile +++ b/Makefile @@ -33,8 +33,9 @@ ifndef CH_BUILD endif # clickhouse-cpp source and build directories. -CH_CPP_DIR = vendor/clickhouse-cpp -CH_CPP_BUILD_DIR = vendor/_build/$(OS)-$(ARCH)-$(CH_BUILD)-$(shell git submodule status $(CH_CPP_DIR) | awk '{print substr($$1, 0, 7)}') +VENDOR_DIR = vendor +CH_CPP_DIR = $(VENDOR_DIR)/clickhouse-cpp +CH_CPP_BUILD_DIR = $(VENDOR_DIR)/_build/$(OS)-$(ARCH)-$(CH_BUILD)-$(shell git submodule status $(CH_CPP_DIR) | awk '{print substr($$1, 0, 7)}') # List the clickhouse-cpp libraries we require. CH_CPP_LIB = $(CH_CPP_BUILD_DIR)/clickhouse/libclickhouse-cpp-lib$(DLSUFFIX) @@ -56,7 +57,7 @@ else endif # Add include directories. -PG_CPPFLAGS = -I./src/include -I$(CH_CPP_DIR) -I$(CH_CPP_DIR)/contrib/absl -Ivendor/minicoro +PG_CPPFLAGS = -I./src/include -I$(CH_CPP_DIR) -I$(CH_CPP_DIR)/contrib/absl -I$(VENDOR_DIR)/minicoro # Include other libraries compiled into clickhouse-cpp. PG_LDFLAGS = -lstdc++ -lssl -lcrypto $(shell $(CURL_CONFIG) --libs) @@ -75,7 +76,7 @@ endif # Clean up the clickhouse-cpp build directory and generated files. EXTRA_CLEAN = sql/$(EXTENSION)--$(EXTVERSION).sql src/fdw.c compile_commands.json test/schedule $(EXTENSION)-$(DISTVERSION).zip ifndef NO_VENDOR_CLEAN - EXTRA_CLEAN += $(CH_CPP_BUILD_DIR) + EXTRA_CLEAN += $(VENDOR_DIR) endif # Import PGXS. @@ -101,8 +102,12 @@ $(shlib): $(CH_CPP_LIB) $(OBJS) $(CH_CPP_DIR)/CMakeLists.txt: git submodule update --init -# Require the vendored clickhouse-cpp. -$(OBJS): $(CH_CPP_DIR)/CMakeLists.txt +# Clone minicoro submodule. +$(VENDOR_DIR)/minicoro/minicoro.h: + git submodule update --init + +# Require the vendored libraries. +$(OBJS): $(CH_CPP_DIR)/CMakeLists.txt $(VENDOR_DIR)/minicoro/minicoro.h # Build clickhouse-cpp. $(CH_CPP_LIB): export CXXFLAGS=-fPIC @@ -112,7 +117,7 @@ $(CH_CPP_LIB): $(CH_CPP_DIR)/CMakeLists.txt # Sync with "Reset Vendor Timestamp" cmake --build $(CH_CPP_BUILD_DIR) --parallel $$(nproc) --target all # Require the versioned C source and SQL script. -all: sql/$(EXTENSION)--$(EXTVERSION).sql src/fdw.c +all: sql/$(EXTENSION)--$(EXTVERSION).sql src/fdw.c $(VENDOR_DIR)/minicoro/minicoro.h # Versioned SQL script. sql/$(EXTENSION)--$(EXTVERSION).sql: sql/$(EXTENSION).sql From 6b183b421c1a7372121352bc20fee8476c85f596 Mon Sep 17 00:00:00 2001 From: "David E. Wheeler" Date: Wed, 1 Apr 2026 13:46:17 -0400 Subject: [PATCH 09/13] Run `make indent` --- src/binary.cpp | 441 ++++++++++++++++++++++++------------------------- 1 file changed, 216 insertions(+), 225 deletions(-) diff --git a/src/binary.cpp b/src/binary.cpp index d6117071..20943804 100644 --- a/src/binary.cpp +++ b/src/binary.cpp @@ -1210,274 +1210,265 @@ extern "C" if (state->error) free(state->error); } -/* - * ============================================================ - * Streaming binary API - * - * Uses minicoro to run Client::Select inside a coroutine. - * The OnDataCancelable callback yields each block back to the - * consumer. No threads, no synchronization. - * ============================================================ - */ + /* + * ============================================================ + * Streaming binary API + * + * Uses minicoro to run Client::Select inside a coroutine. + * The OnDataCancelable callback yields each block back to the + * consumer. No threads, no synchronization. + * ============================================================ + */ #include "minicoro.h" -struct ch_binary_streaming_state -{ - mco_coro *co; - - /* Current block yielded by the coroutine */ - Block *current_block; /* points into coroutine stack; valid until next resume */ - size_t current_row; - bool have_block; /* true if current_block is valid */ - bool done; /* coroutine has finished */ - - /* Per-row scratch arrays (allocated once) */ - Oid *coltypes; - Datum *values; - bool *nulls; - size_t columns_count; - - /* Error / cancel */ - char *error; - bool canceled; - bool (*check_cancel)(void); - - /* Query details (owned, for coroutine to use) */ - Client *client; - std::string sql; - QuerySettings settings; - QueryParams params; -}; - -/* - * Coroutine function: runs Client::Select, yielding each block. - */ -static void -binary_streaming_coro(mco_coro *co) -{ - ch_binary_streaming_state *st = - (ch_binary_streaming_state *)mco_get_user_data(co); + struct ch_binary_streaming_state + { + mco_coro *co; + + /* Current block yielded by the coroutine */ + Block *current_block; /* points into coroutine stack; valid until next resume */ + size_t current_row; + bool have_block; /* true if current_block is valid */ + bool done; /* coroutine has finished */ + + /* Per-row scratch arrays (allocated once) */ + Oid *coltypes; + Datum *values; + bool *nulls; + size_t columns_count; + + /* Error / cancel */ + char *error; + bool canceled; + bool (*check_cancel)(void); + + /* Query details (owned, for coroutine to use) */ + Client *client; + std::string sql; + QuerySettings settings; + QueryParams params; + }; - try + /* + * Coroutine function: runs Client::Select, yielding each block. + */ + static void + binary_streaming_coro(mco_coro *co) { - st->client->Select( - clickhouse::Query(st->sql) - .SetQuerySettings(st->settings) - .SetParams(st->params) - .OnDataCancelable( - [st, co](const Block & block) -> bool - { - if (st->check_cancel && st->check_cancel()) - { - st->error = strdup("query was canceled"); - return false; - } + ch_binary_streaming_state *st = (ch_binary_streaming_state *)mco_get_user_data(co); - if (st->canceled) - return false; + try + { + st->client->Select(clickhouse::Query(st->sql) + .SetQuerySettings(st->settings) + .SetParams(st->params) + .OnDataCancelable( + [st, co](const Block &block) -> bool + { + if (st->check_cancel && st->check_cancel()) + { + st->error = strdup("query was canceled"); + return false; + } - if (block.GetColumnCount() == 0) - return true; + if (st->canceled) + return false; - /* Make the block available to the consumer and yield */ - st->current_block = const_cast(&block); - st->columns_count = block.GetColumnCount(); - st->have_block = true; - st->current_row = 0; + if (block.GetColumnCount() == 0) + return true; - mco_yield(co); + /* Make the block available to the consumer and yield */ + st->current_block = const_cast(&block); + st->columns_count = block.GetColumnCount(); + st->have_block = true; + st->current_row = 0; - /* Resumed by consumer — check if canceled */ - return !st->canceled; - } - ) - ); - } - catch (const std::exception & e) - { - if (!st->error) - st->error = strdup(e.what()); - } + mco_yield(co); - st->have_block = false; - st->done = true; -} + /* Resumed by consumer — check if canceled */ + return !st->canceled; + })); + } + catch (const std::exception &e) + { + if (!st->error) + st->error = strdup(e.what()); + } -ch_binary_streaming_state * -ch_binary_begin_streaming(ch_binary_connection_t * conn, - const ch_query * query, - bool (*check_cancel)(void)) -{ - mco_desc desc; - mco_result res; - - ch_binary_streaming_state *st = new (std::nothrow) ch_binary_streaming_state(); - if (!st) - return NULL; - - st->co = NULL; - st->current_block = NULL; - st->current_row = 0; - st->have_block = false; - st->done = false; - st->coltypes = NULL; - st->values = NULL; - st->nulls = NULL; - st->columns_count = 0; - st->error = NULL; - st->canceled = false; - st->check_cancel = check_cancel; - st->client = (Client *)conn->client; - st->sql = query->sql; - st->settings = ch_binary_settings(query); - st->params = ch_binary_params(query); - - desc = mco_desc_init(binary_streaming_coro, 1024 * 1024); - desc.user_data = st; - res = mco_create(&st->co, &desc); - if (res != MCO_SUCCESS) - { - st->error = strdup(mco_result_description(res)); + st->have_block = false; st->done = true; - return st; } - /* Resume to start the query and get the first block */ - mco_resume(st->co); - - return st; -} + ch_binary_streaming_state * + ch_binary_begin_streaming(ch_binary_connection_t *conn, const ch_query *query, bool (*check_cancel)(void)) + { + mco_desc desc; + mco_result res; -/* - * Fetch the next block from the stream. - * Returns true if a new block is available. - */ -bool -ch_binary_fetch_block(ch_binary_streaming_state * st) -{ - if (!st || st->done) - return false; + ch_binary_streaming_state *st = new (std::nothrow) ch_binary_streaming_state(); + if (!st) + return NULL; - if (st->have_block) - return true; + st->co = NULL; + st->current_block = NULL; + st->current_row = 0; + st->have_block = false; + st->done = false; + st->coltypes = NULL; + st->values = NULL; + st->nulls = NULL; + st->columns_count = 0; + st->error = NULL; + st->canceled = false; + st->check_cancel = check_cancel; + st->client = (Client *)conn->client; + st->sql = query->sql; + st->settings = ch_binary_settings(query); + st->params = ch_binary_params(query); + + desc = mco_desc_init(binary_streaming_coro, 1024 * 1024); + desc.user_data = st; + res = mco_create(&st->co, &desc); + if (res != MCO_SUCCESS) + { + st->error = strdup(mco_result_description(res)); + st->done = true; + return st; + } - /* Resume coroutine to get next block */ - if (mco_status(st->co) == MCO_SUSPENDED) + /* Resume to start the query and get the first block */ mco_resume(st->co); - return st->have_block; -} + return st; + } -/* - * Read the next row from the current block. Returns false when the - * current block is exhausted (caller should call ch_binary_fetch_block). - */ -bool -ch_binary_streaming_read_row(ch_binary_streaming_state * st) -{ - if (!st || !st->have_block || !st->current_block) - return false; + /* + * Fetch the next block from the stream. + * Returns true if a new block is available. + */ + bool + ch_binary_fetch_block(ch_binary_streaming_state *st) + { + if (!st || st->done) + return false; - Block & block = *st->current_block; - size_t row_count = block.GetRowCount(); + if (st->have_block) + return true; - if (st->current_row >= row_count) - { - /* Block exhausted — mark it consumed so next fetch_block resumes */ - st->have_block = false; - return false; - } + /* Resume coroutine to get next block */ + if (mco_status(st->co) == MCO_SUSPENDED) + mco_resume(st->co); - /* Allocate scratch arrays on first use */ - if (!st->coltypes && st->columns_count > 0) - { - st->coltypes = new Oid[st->columns_count]; - st->values = new Datum[st->columns_count]; - st->nulls = new bool[st->columns_count]; + return st->have_block; } - try + /* + * Read the next row from the current block. Returns false when the + * current block is exhausted (caller should call ch_binary_fetch_block). + */ + bool + ch_binary_streaming_read_row(ch_binary_streaming_state *st) { - for (size_t i = 0; i < st->columns_count; i++) + if (!st || !st->have_block || !st->current_block) + return false; + + Block &block = *st->current_block; + size_t row_count = block.GetRowCount(); + + if (st->current_row >= row_count) { - st->values[i] = make_datum(block[i], st->current_row, - &st->coltypes[i], &st->nulls[i]); + /* Block exhausted — mark it consumed so next fetch_block resumes */ + st->have_block = false; + return false; } - } - catch (const std::exception & e) - { - if (!st->error) - st->error = strdup(e.what()); - return false; - } - st->current_row++; - return true; -} + /* Allocate scratch arrays on first use */ + if (!st->coltypes && st->columns_count > 0) + { + st->coltypes = new Oid[st->columns_count]; + st->values = new Datum[st->columns_count]; + st->nulls = new bool[st->columns_count]; + } -size_t -ch_binary_streaming_columns(ch_binary_streaming_state * st) -{ - return st ? st->columns_count : 0; -} + try + { + for (size_t i = 0; i < st->columns_count; i++) + { + st->values[i] = make_datum(block[i], st->current_row, &st->coltypes[i], &st->nulls[i]); + } + } + catch (const std::exception &e) + { + if (!st->error) + st->error = strdup(e.what()); + return false; + } -Datum -ch_binary_streaming_value(ch_binary_streaming_state * st, size_t col, - Oid * valtype, bool * is_null) -{ - if (!st || col >= st->columns_count) + st->current_row++; + return true; + } + + size_t + ch_binary_streaming_columns(ch_binary_streaming_state *st) { - *is_null = true; - *valtype = InvalidOid; - return (Datum)0; + return st ? st->columns_count : 0; } - *valtype = st->coltypes[col]; - *is_null = st->nulls[col]; - return st->values[col]; -} + Datum + ch_binary_streaming_value(ch_binary_streaming_state *st, size_t col, Oid *valtype, bool *is_null) + { + if (!st || col >= st->columns_count) + { + *is_null = true; + *valtype = InvalidOid; + return (Datum)0; + } -char * -ch_binary_streaming_error(ch_binary_streaming_state * st) -{ - return st ? st->error : NULL; -} + *valtype = st->coltypes[col]; + *is_null = st->nulls[col]; + return st->values[col]; + } -void -ch_binary_end_streaming(ch_binary_streaming_state * st) -{ - if (!st) - return; + char * + ch_binary_streaming_error(ch_binary_streaming_state *st) + { + return st ? st->error : NULL; + } - /* - * If the coroutine is still suspended (query not fully consumed), - * destroy it without resuming. mco_destroy handles suspended - * coroutines. Then reset the connection to cancel any in-flight - * ClickHouse query so the server doesn't keep processing. - */ - if (st->co) + void + ch_binary_end_streaming(ch_binary_streaming_state *st) { - bool was_suspended = (mco_status(st->co) == MCO_SUSPENDED); + if (!st) + return; - mco_destroy(st->co); - st->co = NULL; + /* + * If the coroutine is still suspended (query not fully consumed), + * destroy it without resuming. mco_destroy handles suspended + * coroutines. Then reset the connection to cancel any in-flight + * ClickHouse query so the server doesn't keep processing. + */ + if (st->co) + { + bool was_suspended = (mco_status(st->co) == MCO_SUSPENDED); - if (was_suspended) - st->client->ResetConnection(); - } + mco_destroy(st->co); + st->co = NULL; - if (st->coltypes) - { - delete[] st->coltypes; - delete[] st->values; - delete[] st->nulls; - } + if (was_suspended) + st->client->ResetConnection(); + } - if (st->error) - free(st->error); + if (st->coltypes) + { + delete[] st->coltypes; + delete[] st->values; + delete[] st->nulls; + } - delete st; -} + if (st->error) + free(st->error); + delete st; + } } From 49b27b57edf94ce680e3b46e8e347e834dc2e4d2 Mon Sep 17 00:00:00 2001 From: Kaushik Iska Date: Wed, 1 Apr 2026 15:03:51 -0500 Subject: [PATCH 10/13] Split streaming scan code from buffered paths Review on PR #165 pointed out that the new streaming scan work had\nstarted to duplicate the existing buffered query paths while also making\nresource ownership harder to follow. The HTTP and binary code both had\nstreaming-specific setup, fetch, and cleanup logic mixed into files that\nalready handled the non-streaming paths, which made the cursor lifetime\nand error paths harder to reason about.\n\nMove the streaming implementations into dedicated HTTP and binary\ntranslation units and keep the shared cursor and request setup in the\nexisting code. This reduces the duplication in pglink.c and http.c,\nkeeps ownership boundaries clearer, and preserves the existing behavior\nfor the non-streaming utility paths that still need the buffered query\ninterfaces.\n\nWhile doing that, fix the HTTP streaming error and cancellation paths so\ntransport or coroutine failures do not quietly look like EOF, keep the\nbinary streaming row-read error text consistent with the old behavior,\nand replace ad-hoc out-of-memory elog() calls with proper FDW errors.\nAlso drop the unused max_result_size GUC from this series and document\nthe fetch_size option that now controls streamed scans.\n\nKeep minicoro wired in through the existing vendored submodule and\nsingle implementation unit. A reviewer suggested dropping that layout,\nbut this commit intentionally sticks to the original PR shape after the\nfollow-up direction on the branch.\n\nVerified with:\n\nmake tempcheck PG_CONFIG=~/git/pg_local/pgv18/install/bin/pg_config \\n CC=cc CXX=c++ \\n PG_LDFLAGS="-lstdc++ -lssl -lcrypto -lcurl -L/Users/kaushik/brew/opt/openssl@3/lib"\n\nAll 21 regression tests passed. --- doc/pg_clickhouse.md | 7 + src/binary.cpp | 280 +------------- src/binary_streaming.cpp | 299 ++++++++++++++ src/fdw.c.in | 5 +- src/http.c | 685 ++++++++++----------------------- src/http_streaming.c | 341 ++++++++++++++++ src/include/binary_internal.hh | 18 + src/include/fdw.h | 1 - src/include/http.h | 3 +- src/include/http_internal.h | 33 ++ src/option.c | 19 - src/pglink.c | 209 +++++----- 12 files changed, 1044 insertions(+), 856 deletions(-) create mode 100644 src/binary_streaming.cpp create mode 100644 src/http_streaming.c create mode 100644 src/include/binary_internal.hh create mode 100644 src/include/http_internal.h diff --git a/doc/pg_clickhouse.md b/doc/pg_clickhouse.md index 406d1b75..d6a6f352 100644 --- a/doc/pg_clickhouse.md +++ b/doc/pg_clickhouse.md @@ -139,6 +139,10 @@ The supported options are: * `dbname`: The ClickHouse database to use upon connecting. Defaults to "default". * `host`: The host name of the ClickHouse server. Defaults to "localhost"; +* `fetch_size`: The number of remote rows to buffer per streamed fetch while + scanning foreign tables. Defaults to `50000`. Set it to `0` to disable + streaming for tables that use this server and fall back to buffering the + full result set in memory. * `port`: The port to connect to on the ClickHouse server. Defaults as follows: * 9440 if `driver` is "binary" and `host` is a ClickHouse Cloud host @@ -299,6 +303,9 @@ The supported table options are: defined for the foreign server. * `table_name`: The name of the remote table. Default to the name specified for the foreign table. +* `fetch_size`: Overrides the server-level `fetch_size` for this table only. + Use it to make scans buffer fewer or more remote rows at a time, or set it + to `0` to disable streaming for this table. * `engine`: The [table engine] used by the ClickHouse table. For `CollapsingMergeTree()` and `AggregatingMergeTree()`, pg_clickhouse automatically applies the parameters to function expressions executed on diff --git a/src/binary.cpp b/src/binary.cpp index 20943804..8d48d24d 100644 --- a/src/binary.cpp +++ b/src/binary.cpp @@ -41,8 +41,7 @@ extern "C" #include "utils/uuid.h" #include "binary.hh" - - using namespace clickhouse; +#include "binary_internal.hh" #if defined(__APPLE__) /* Byte ordering on macOS */ #include @@ -55,6 +54,8 @@ extern "C" #define BIG_ENDIAN_64_TO_HOST(x) be64toh(x) #endif + using namespace clickhouse; + #define THROW_UNEXPECTED_COLUMN(exp_type, col) \ throw std::runtime_error("unexpected column type for " + std::string(exp_type) + ": " + col->Type()->GetName()) @@ -181,7 +182,7 @@ extern "C" /* * Converts query->settings to QuerySettings. */ - static QuerySettings + QuerySettings ch_binary_settings(const ch_query *query) { kv_iter iter; @@ -198,7 +199,7 @@ extern "C" /* * Converts query->param_values to QueryParams. */ - static QueryParams + QueryParams ch_binary_params(const ch_query *query) { int i; @@ -836,8 +837,8 @@ extern "C" * There is not an adequate (without huge overheads) solution, we just consider * this state unfixable. */ - static Datum - make_datum(clickhouse::ColumnRef col, size_t row, Oid *valtype, bool *is_null) + Datum + ch_binary_make_datum(clickhouse::ColumnRef col, size_t row, Oid *valtype, bool *is_null) { Datum ret = (Datum)0; @@ -1088,7 +1089,7 @@ extern "C" slot->nulls = (bool *)exc_palloc0(sizeof(bool) * len); for (size_t i = 0; i < len; ++i) - slot->datums[i] = make_datum(arr, i, &slot->item_type, &slot->nulls[i]); + slot->datums[i] = ch_binary_make_datum(arr, i, &slot->item_type, &slot->nulls[i]); } /* this one will need additional work, since we just return raw slot */ @@ -1112,7 +1113,7 @@ extern "C" slot->len = len; for (size_t i = 0; i < len; ++i) - slot->datums[i] = make_datum((*tuple)[i], row, &slot->types[i], &slot->nulls[i]); + slot->datums[i] = ch_binary_make_datum((*tuple)[i], row, &slot->types[i], &slot->nulls[i]); /* this one will need additional work, since we just return raw slot */ ret = PointerGetDatum(slot); @@ -1173,7 +1174,7 @@ extern "C" for (size_t i = 0; i < state->resp->columns_count; i++) { /* fill value and null arrays */ - state->values[i] = make_datum(block[i], state->row, &state->coltypes[i], &state->nulls[i]); + state->values[i] = ch_binary_make_datum(block[i], state->row, &state->coltypes[i], &state->nulls[i]); } res = true; @@ -1210,265 +1211,4 @@ extern "C" if (state->error) free(state->error); } - /* - * ============================================================ - * Streaming binary API - * - * Uses minicoro to run Client::Select inside a coroutine. - * The OnDataCancelable callback yields each block back to the - * consumer. No threads, no synchronization. - * ============================================================ - */ - -#include "minicoro.h" - - struct ch_binary_streaming_state - { - mco_coro *co; - - /* Current block yielded by the coroutine */ - Block *current_block; /* points into coroutine stack; valid until next resume */ - size_t current_row; - bool have_block; /* true if current_block is valid */ - bool done; /* coroutine has finished */ - - /* Per-row scratch arrays (allocated once) */ - Oid *coltypes; - Datum *values; - bool *nulls; - size_t columns_count; - - /* Error / cancel */ - char *error; - bool canceled; - bool (*check_cancel)(void); - - /* Query details (owned, for coroutine to use) */ - Client *client; - std::string sql; - QuerySettings settings; - QueryParams params; - }; - - /* - * Coroutine function: runs Client::Select, yielding each block. - */ - static void - binary_streaming_coro(mco_coro *co) - { - ch_binary_streaming_state *st = (ch_binary_streaming_state *)mco_get_user_data(co); - - try - { - st->client->Select(clickhouse::Query(st->sql) - .SetQuerySettings(st->settings) - .SetParams(st->params) - .OnDataCancelable( - [st, co](const Block &block) -> bool - { - if (st->check_cancel && st->check_cancel()) - { - st->error = strdup("query was canceled"); - return false; - } - - if (st->canceled) - return false; - - if (block.GetColumnCount() == 0) - return true; - - /* Make the block available to the consumer and yield */ - st->current_block = const_cast(&block); - st->columns_count = block.GetColumnCount(); - st->have_block = true; - st->current_row = 0; - - mco_yield(co); - - /* Resumed by consumer — check if canceled */ - return !st->canceled; - })); - } - catch (const std::exception &e) - { - if (!st->error) - st->error = strdup(e.what()); - } - - st->have_block = false; - st->done = true; - } - - ch_binary_streaming_state * - ch_binary_begin_streaming(ch_binary_connection_t *conn, const ch_query *query, bool (*check_cancel)(void)) - { - mco_desc desc; - mco_result res; - - ch_binary_streaming_state *st = new (std::nothrow) ch_binary_streaming_state(); - if (!st) - return NULL; - - st->co = NULL; - st->current_block = NULL; - st->current_row = 0; - st->have_block = false; - st->done = false; - st->coltypes = NULL; - st->values = NULL; - st->nulls = NULL; - st->columns_count = 0; - st->error = NULL; - st->canceled = false; - st->check_cancel = check_cancel; - st->client = (Client *)conn->client; - st->sql = query->sql; - st->settings = ch_binary_settings(query); - st->params = ch_binary_params(query); - - desc = mco_desc_init(binary_streaming_coro, 1024 * 1024); - desc.user_data = st; - res = mco_create(&st->co, &desc); - if (res != MCO_SUCCESS) - { - st->error = strdup(mco_result_description(res)); - st->done = true; - return st; - } - - /* Resume to start the query and get the first block */ - mco_resume(st->co); - - return st; - } - - /* - * Fetch the next block from the stream. - * Returns true if a new block is available. - */ - bool - ch_binary_fetch_block(ch_binary_streaming_state *st) - { - if (!st || st->done) - return false; - - if (st->have_block) - return true; - - /* Resume coroutine to get next block */ - if (mco_status(st->co) == MCO_SUSPENDED) - mco_resume(st->co); - - return st->have_block; - } - - /* - * Read the next row from the current block. Returns false when the - * current block is exhausted (caller should call ch_binary_fetch_block). - */ - bool - ch_binary_streaming_read_row(ch_binary_streaming_state *st) - { - if (!st || !st->have_block || !st->current_block) - return false; - - Block &block = *st->current_block; - size_t row_count = block.GetRowCount(); - - if (st->current_row >= row_count) - { - /* Block exhausted — mark it consumed so next fetch_block resumes */ - st->have_block = false; - return false; - } - - /* Allocate scratch arrays on first use */ - if (!st->coltypes && st->columns_count > 0) - { - st->coltypes = new Oid[st->columns_count]; - st->values = new Datum[st->columns_count]; - st->nulls = new bool[st->columns_count]; - } - - try - { - for (size_t i = 0; i < st->columns_count; i++) - { - st->values[i] = make_datum(block[i], st->current_row, &st->coltypes[i], &st->nulls[i]); - } - } - catch (const std::exception &e) - { - if (!st->error) - st->error = strdup(e.what()); - return false; - } - - st->current_row++; - return true; - } - - size_t - ch_binary_streaming_columns(ch_binary_streaming_state *st) - { - return st ? st->columns_count : 0; - } - - Datum - ch_binary_streaming_value(ch_binary_streaming_state *st, size_t col, Oid *valtype, bool *is_null) - { - if (!st || col >= st->columns_count) - { - *is_null = true; - *valtype = InvalidOid; - return (Datum)0; - } - - *valtype = st->coltypes[col]; - *is_null = st->nulls[col]; - return st->values[col]; - } - - char * - ch_binary_streaming_error(ch_binary_streaming_state *st) - { - return st ? st->error : NULL; - } - - void - ch_binary_end_streaming(ch_binary_streaming_state *st) - { - if (!st) - return; - - /* - * If the coroutine is still suspended (query not fully consumed), - * destroy it without resuming. mco_destroy handles suspended - * coroutines. Then reset the connection to cancel any in-flight - * ClickHouse query so the server doesn't keep processing. - */ - if (st->co) - { - bool was_suspended = (mco_status(st->co) == MCO_SUSPENDED); - - mco_destroy(st->co); - st->co = NULL; - - if (was_suspended) - st->client->ResetConnection(); - } - - if (st->coltypes) - { - delete[] st->coltypes; - delete[] st->values; - delete[] st->nulls; - } - - if (st->error) - free(st->error); - - delete st; - } } diff --git a/src/binary_streaming.cpp b/src/binary_streaming.cpp new file mode 100644 index 00000000..acfe99ce --- /dev/null +++ b/src/binary_streaming.cpp @@ -0,0 +1,299 @@ +#include +#include +#include + +#include +#include + +#include "minicoro.h" + +extern "C" +{ + +#include "postgres.h" +#include "internal.h" +#include "engine.h" + +} + +#include "binary_internal.hh" + +using namespace clickhouse; + +static constexpr size_t kBinaryStreamingStackSize = 1024 * 1024; +static const char kBinaryStreamingCanceled[] = "query was canceled"; +static const char kBinaryStreamingOom[] = "out of memory"; + +struct ch_binary_streaming_state +{ + mco_coro *co = nullptr; + + /* Current block yielded by the coroutine. Valid until the next resume. */ + Block *current_block = nullptr; + size_t current_row = 0; + bool have_block = false; + bool done = false; + + std::unique_ptr coltypes; + std::unique_ptr values; + std::unique_ptr nulls; + size_t columns_count = 0; + + char *error = nullptr; + bool error_owned = false; + bool (*check_cancel) (void) = nullptr; + + Client *client = nullptr; + std::string sql; + QuerySettings settings; + QueryParams params; + + ~ch_binary_streaming_state() + { + if (error && error_owned) + free(error); + } + + void + SetStaticError(const char *message) + { + if (error) + return; + error = const_cast(message); + error_owned = false; + } + + void + SetOwnedError(const char *message) + { + char *copy; + + if (error) + return; + if (!message) + { + SetStaticError(kBinaryStreamingOom); + return; + } + + copy = strdup(message); + if (!copy) + { + SetStaticError(kBinaryStreamingOom); + return; + } + + error = copy; + error_owned = true; + } +}; + +static bool +binary_streaming_resume(ch_binary_streaming_state * st) +{ + mco_result res; + + res = mco_resume(st->co); + if (res == MCO_SUCCESS) + return true; + + st->SetOwnedError(mco_result_description(res)); + st->done = true; + return false; +} + +/* + * Coroutine function: runs Client::Select, yielding each block. + */ +static void +binary_streaming_coro(mco_coro * co) +{ + ch_binary_streaming_state *st = + (ch_binary_streaming_state *) mco_get_user_data(co); + + try + { + st->client->Select( + clickhouse::Query(st->sql) + .SetQuerySettings(st->settings) + .SetParams(st->params) + .OnDataCancelable( + [st, co](const Block & block) -> bool + { + if (st->check_cancel && st->check_cancel()) + { + st->SetStaticError(kBinaryStreamingCanceled); + return false; + } + + if (block.GetColumnCount() == 0) + return true; + + st->current_block = const_cast(&block); + st->columns_count = block.GetColumnCount(); + st->have_block = true; + st->current_row = 0; + + mco_yield(co); + return true; + })); + } + catch (const std::exception & e) + { + st->SetOwnedError(e.what()); + } + + st->have_block = false; + st->done = true; +} + +extern "C" +{ + + ch_binary_streaming_state * + ch_binary_begin_streaming(ch_binary_connection_t * conn, + const ch_query * query, + bool (*check_cancel) (void)) + { + ch_binary_streaming_state *st; + mco_desc desc; + mco_result res; + + st = new (std::nothrow) ch_binary_streaming_state(); + if (!st) + return NULL; + + st->check_cancel = check_cancel; + st->client = (Client *) conn->client; + st->sql = query->sql; + st->settings = ch_binary_settings(query); + st->params = ch_binary_params(query); + + desc = mco_desc_init(binary_streaming_coro, kBinaryStreamingStackSize); + desc.user_data = st; + res = mco_create(&st->co, &desc); + if (res != MCO_SUCCESS) + { + st->SetOwnedError(mco_result_description(res)); + st->done = true; + return st; + } + + if (!binary_streaming_resume(st)) + return st; + + return st; + } + + bool + ch_binary_fetch_block(ch_binary_streaming_state * st) + { + if (!st || st->done) + return false; + if (st->have_block) + return true; + if (mco_status(st->co) != MCO_SUSPENDED) + return false; + + if (!binary_streaming_resume(st)) + return false; + + return st->have_block; + } + + bool + ch_binary_streaming_read_row(ch_binary_streaming_state * st) + { + Block *block; + size_t row_count; + + if (!st || !st->have_block || !st->current_block) + return false; + + block = st->current_block; + row_count = block->GetRowCount(); + if (st->current_row >= row_count) + { + st->have_block = false; + return false; + } + + if (!st->coltypes && st->columns_count > 0) + { + st->coltypes.reset(new (std::nothrow) Oid[st->columns_count]); + st->values.reset(new (std::nothrow) Datum[st->columns_count]); + st->nulls.reset(new (std::nothrow) bool[st->columns_count]); + if (!st->coltypes || !st->values || !st->nulls) + { + st->SetStaticError(kBinaryStreamingOom); + return false; + } + } + + try + { + for (size_t i = 0; i < st->columns_count; i++) + { + st->values[i] = ch_binary_make_datum((*block)[i], st->current_row, + &st->coltypes[i], &st->nulls[i]); + } + } + catch (const std::exception & e) + { + st->SetOwnedError(e.what()); + return false; + } + + st->current_row++; + return true; + } + + size_t + ch_binary_streaming_columns(ch_binary_streaming_state * st) + { + return st ? st->columns_count : 0; + } + + Datum + ch_binary_streaming_value(ch_binary_streaming_state * st, size_t col, + Oid * valtype, bool * is_null) + { + if (!st || col >= st->columns_count) + { + *is_null = true; + *valtype = InvalidOid; + return (Datum) 0; + } + + *valtype = st->coltypes[col]; + *is_null = st->nulls[col]; + return st->values[col]; + } + + char * + ch_binary_streaming_error(ch_binary_streaming_state * st) + { + return st ? st->error : NULL; + } + + void + ch_binary_end_streaming(ch_binary_streaming_state * st) + { + if (!st) + return; + + if (st->co) + { + bool was_suspended = mco_status(st->co) == MCO_SUSPENDED; + + mco_destroy(st->co); + st->co = NULL; + + if (was_suspended) + st->client->ResetConnection(); + } + + delete st; + } + +} diff --git a/src/fdw.c.in b/src/fdw.c.in index f0b6cd1e..85d6eece 100644 --- a/src/fdw.c.in +++ b/src/fdw.c.in @@ -57,6 +57,9 @@ PG_MODULE_MAGIC; /* If no remote estimates, assume a sort costs 20% extra */ #define DEFAULT_FDW_SORT_MULTIPLIER 1.2 +/* Default number of remote rows to fetch per streamed batch. */ +#define DEFAULT_FETCH_SIZE 50000 + /* * Indexes of FDW-private information stored in fdw_private lists. * @@ -358,7 +361,7 @@ clickhouseGetForeignRelSize(PlannerInfo * root, fpinfo->fdw_startup_cost = DEFAULT_FDW_STARTUP_COST; fpinfo->fdw_tuple_cost = DEFAULT_FDW_TUPLE_COST; fpinfo->shippable_extensions = NIL; - fpinfo->fetch_size = 50000; + fpinfo->fetch_size = DEFAULT_FETCH_SIZE; /* Server-level fetch_size overrides default. */ foreach(lc, fpinfo->server->options) diff --git a/src/http.c b/src/http.c index bd84ba0d..bf6c573b 100644 --- a/src/http.c +++ b/src/http.c @@ -6,8 +6,8 @@ #include #include -#include -#include + +#include "http_internal.h" #define DATABASE_HEADER "X-ClickHouse-Database" @@ -144,100 +144,244 @@ ch_http_connection(ch_connection_details * details) return NULL; } -static void -set_query_id(ch_http_response_t * resp) +void +ch_http_generate_query_id(char query_id[37]) { uuid_t id; uuid_generate(id); - uuid_unparse(id, resp->query_id); + uuid_unparse(id, query_id); } -ch_http_response_t * -ch_http_simple_query(ch_http_connection_t * conn, const ch_query * query) +static bool +http_skip_setting(const char *name) { - char *url; - CURLcode errcode; - static char errbuffer[CURL_ERROR_SIZE]; - struct curl_slist *headers = NULL; - CURLU *cu = curl_url(); - kv_iter iter; - char *buf = NULL; - curl_mime *form = NULL; - int i; + return strcmp(name, "date_time_output_format") == 0 || + strcmp(name, "format_tsv_null_representation") == 0 || + strcmp(name, "output_format_tsv_crlf_end_of_line") == 0; +} - ch_http_response_t *resp = calloc(sizeof(ch_http_response_t), 1); +static bool +http_append_query_param(CURLU * cu, const char *name, const char *value) +{ + char *buf = psprintf("%s=%s", name, value); + bool success; - if (resp == NULL) - return NULL; + success = curl_url_set(cu, CURLUPART_QUERY, buf, + CURLU_APPENDQUERY | CURLU_URLENCODE) == CURLUE_OK; + pfree(buf); + return success; +} - set_query_id(resp); +bool +ch_http_build_url(ch_http_connection_t * conn, const ch_query * query, + const char *query_id, char **url) +{ + CURLU *cu; + kv_iter iter; + bool success = false; - assert(conn && conn->curl); + *url = NULL; + cu = curl_url(); + if (!cu) + return false; - /* Construct the base URL with the query ID. */ - curl_url_set(cu, CURLUPART_URL, conn->base_url, 0); - buf = psprintf("query_id=%s", resp->query_id); - curl_url_set(cu, CURLUPART_QUERY, buf, CURLU_APPENDQUERY | CURLU_URLENCODE); - pfree(buf); + if (curl_url_set(cu, CURLUPART_URL, conn->base_url, 0) != CURLUE_OK) + goto cleanup; + + if (!http_append_query_param(cu, "query_id", query_id)) + goto cleanup; - /* Append each of the settings as a query param. */ - for (iter = new_kv_iter(query->settings); !kv_iter_done(&iter); kv_iter_next(&iter)) + for (iter = new_kv_iter(query->settings); + !kv_iter_done(&iter); + kv_iter_next(&iter)) { - /* Skip settings that would break parsing and type conversion. */ - if ( - strcmp(iter.name, "date_time_output_format") == 0 || - strcmp(iter.name, "format_tsv_null_representation") == 0 || - strcmp(iter.name, "output_format_tsv_crlf_end_of_line") == 0 - ) + if (http_skip_setting(iter.name)) continue; - buf = psprintf("%s=%s", iter.name, iter.value); - curl_url_set(cu, CURLUPART_QUERY, buf, CURLU_APPENDQUERY | CURLU_URLENCODE); - pfree(buf); + if (!http_append_query_param(cu, iter.name, iter.value)) + goto cleanup; } - /* Always use ISO date format, \N for NULL, \n for EOL. */ - curl_url_set(cu, CURLUPART_QUERY, "date_time_output_format=iso", CURLU_APPENDQUERY | CURLU_URLENCODE); - curl_url_set(cu, CURLUPART_QUERY, "format_tsv_null_representation=\\N", CURLU_APPENDQUERY | CURLU_URLENCODE); - curl_url_set(cu, CURLUPART_QUERY, "output_format_tsv_crlf_end_of_line=0", CURLU_APPENDQUERY | CURLU_URLENCODE); - curl_url_get(cu, CURLUPART_URL, &url, 0); + if (curl_url_set(cu, CURLUPART_QUERY, "date_time_output_format=iso", + CURLU_APPENDQUERY | CURLU_URLENCODE) != CURLUE_OK) + goto cleanup; + if (curl_url_set(cu, CURLUPART_QUERY, + "format_tsv_null_representation=\\N", + CURLU_APPENDQUERY | CURLU_URLENCODE) != CURLUE_OK) + goto cleanup; + if (curl_url_set(cu, CURLUPART_QUERY, + "output_format_tsv_crlf_end_of_line=0", + CURLU_APPENDQUERY | CURLU_URLENCODE) != CURLUE_OK) + goto cleanup; + if (curl_url_get(cu, CURLUPART_URL, url, 0) != CURLUE_OK) + goto cleanup; + + success = true; + +cleanup: curl_url_cleanup(cu); + if (!success && *url) + { + curl_free(*url); + *url = NULL; + } + return success; +} + +bool +ch_http_build_headers(ch_http_connection_t * conn, struct curl_slist **headers) +{ + char *header; + struct curl_slist *list; + + *headers = NULL; + if (!conn->dbname) + return true; + + header = psprintf("%s: %s", DATABASE_HEADER, conn->dbname); + list = curl_slist_append(NULL, header); + pfree(header); + if (!list) + return false; + + *headers = list; + return true; +} + +bool +ch_http_set_post_data(CURL *curl, const ch_query * query, curl_mime **form) +{ + int i; + + *form = NULL; + if (query->num_params == 0) + { + curl_easy_setopt(curl, CURLOPT_POSTFIELDS, query->sql); + return true; + } + + *form = curl_mime_init(curl); + if (!*form) + return false; + + { + curl_mimepart *part = curl_mime_addpart(*form); + + if (!part) + goto oom; + curl_mime_name(part, "query"); + curl_mime_data(part, query->sql, CURL_ZERO_TERMINATED); + } + + for (i = 0; i < query->num_params; i++) + { + char *name = psprintf("param_p%d", i + 1); + curl_mimepart *part = curl_mime_addpart(*form); - /* constant */ + if (!part) + { + pfree(name); + goto oom; + } + curl_mime_name(part, name); + pfree(name); + curl_mime_data(part, query->param_values[i], CURL_ZERO_TERMINATED); + } + + curl_easy_setopt(curl, CURLOPT_MIMEPOST, *form); + return true; + +oom: + curl_mime_free(*form); + *form = NULL; + return false; +} + +void +ch_http_set_common_options(ch_http_connection_t * conn, const char *url, + char *errbuffer, ch_http_write_func write_func, + void *write_data) +{ errbuffer[0] = '\0'; curl_easy_reset(conn->curl); - curl_easy_setopt(conn->curl, CURLOPT_WRITEFUNCTION, write_data); + curl_easy_setopt(conn->curl, CURLOPT_WRITEFUNCTION, write_func); + curl_easy_setopt(conn->curl, CURLOPT_WRITEDATA, write_data); curl_easy_setopt(conn->curl, CURLOPT_ERRORBUFFER, errbuffer); curl_easy_setopt(conn->curl, CURLOPT_PATH_AS_IS, 1L); curl_easy_setopt(conn->curl, CURLOPT_URL, url); curl_easy_setopt(conn->curl, CURLOPT_NOSIGNAL, 1L); - - /* variable */ - curl_easy_setopt(conn->curl, CURLOPT_WRITEDATA, resp); curl_easy_setopt(conn->curl, CURLOPT_VERBOSE, curl_verbose); +} + +void +ch_http_set_progress_options(ch_http_connection_t * conn, void *progress_data) +{ if (curl_progressfunc) { curl_easy_setopt(conn->curl, CURLOPT_NOPROGRESS, 0L); curl_easy_setopt(conn->curl, CURLOPT_XFERINFOFUNCTION, curl_progressfunc); - curl_easy_setopt(conn->curl, CURLOPT_XFERINFODATA, conn); + curl_easy_setopt(conn->curl, CURLOPT_XFERINFODATA, progress_data); } else curl_easy_setopt(conn->curl, CURLOPT_NOPROGRESS, 1L); - if (conn->dbname) +} + +void +ch_http_get_transfer_info(ch_http_connection_t * conn, double *pretransfer_time, + double *total_time, long *http_status) +{ + long status = 0; + + if (curl_easy_getinfo(conn->curl, CURLINFO_PRETRANSFER_TIME, + pretransfer_time) != CURLE_OK) + *pretransfer_time = 0; + if (curl_easy_getinfo(conn->curl, CURLINFO_TOTAL_TIME, + total_time) != CURLE_OK) + *total_time = 0; + if (curl_easy_getinfo(conn->curl, CURLINFO_RESPONSE_CODE, + &status) == CURLE_OK && + (status != 0 || *http_status == 0)) + *http_status = status; +} + +ch_http_response_t * +ch_http_simple_query(ch_http_connection_t * conn, const ch_query * query) +{ + char *url; + CURLcode errcode; + static char errbuffer[CURL_ERROR_SIZE]; + struct curl_slist *headers = NULL; + curl_mime *form = NULL; + + ch_http_response_t *resp = calloc(sizeof(ch_http_response_t), 1); + + if (resp == NULL) + return NULL; + + ch_http_generate_query_id(resp->query_id); + + assert(conn && conn->curl); + + if (!ch_http_build_url(conn, query, resp->query_id, &url)) { - headers = curl_slist_append(headers, psprintf("%s: %s", DATABASE_HEADER, conn->dbname)); - if (!headers) - { - curl_free(url); - resp->http_status = -1; - resp->data = "out of memory"; - return resp; - } - curl_easy_setopt(conn->curl, CURLOPT_HTTPHEADER, headers); + resp->http_status = -1; + resp->data = "out of memory"; + return resp; } - if (query->num_params == 0) + ch_http_set_common_options(conn, url, errbuffer, write_data, resp); + ch_http_set_progress_options(conn, conn); + if (!ch_http_build_headers(conn, &headers)) + { + curl_free(url); + resp->http_status = -1; + resp->data = "out of memory"; + return resp; + } + if (headers) + curl_easy_setopt(conn->curl, CURLOPT_HTTPHEADER, headers); + if (query->num_params == 0) /* * Send the query as the POST body. This ensures that * date_time_output_format=iso will work for ClickHouse versions prior @@ -252,10 +396,7 @@ ch_http_simple_query(ch_http_connection_t * conn, const ch_query * query) * ClickHouse 25.8. Details: * https://github.com/ClickHouse/ClickHouse/pull/85570. */ - curl_mimepart *part; - - form = curl_mime_init(conn->curl); - if (!form) + if (!ch_http_set_post_data(conn->curl, query, &form)) { curl_free(url); if (headers) @@ -264,16 +405,6 @@ ch_http_simple_query(ch_http_connection_t * conn, const ch_query * query) resp->data = "out of memory"; return resp; } - part = curl_mime_addpart(form); - curl_mime_name(part, "query"); - curl_mime_data(part, query->sql, CURL_ZERO_TERMINATED); - for (i = 0; i < query->num_params; i++) - { - part = curl_mime_addpart(form); - curl_mime_name(part, psprintf("param_p%d", i + 1)); - curl_mime_data(part, query->param_values[i], CURL_ZERO_TERMINATED); - } - curl_easy_setopt(conn->curl, CURLOPT_MIMEPOST, form); } curl_error_happened = false; @@ -286,31 +417,19 @@ ch_http_simple_query(ch_http_connection_t * conn, const ch_query * query) if (errcode == CURLE_ABORTED_BY_CALLBACK) { - resp->http_status = 418; /* I'm teapot */ + resp->http_status = CH_HTTP_CANCELED_STATUS; return resp; } else if (errcode != CURLE_OK) { - resp->http_status = 419; /* illegal http status */ + resp->http_status = CH_HTTP_COMM_ERROR_STATUS; resp->data = strdup(errbuffer); resp->datasize = strlen(errbuffer); return resp; } - errcode = curl_easy_getinfo(conn->curl, CURLINFO_PRETRANSFER_TIME, - &resp->pretransfer_time); - if (errcode != CURLE_OK) - resp->pretransfer_time = 0; - - errcode = curl_easy_getinfo(conn->curl, CURLINFO_TOTAL_TIME, &resp->total_time); - if (errcode != CURLE_OK) - resp->total_time = 0; - - /* - * All good with request, but we need http status to make sure query went - * ok - */ - curl_easy_getinfo(conn->curl, CURLINFO_RESPONSE_CODE, &resp->http_status); + ch_http_get_transfer_info(conn, &resp->pretransfer_time, + &resp->total_time, &resp->http_status); if (curl_verbose && resp->http_status != 200) fprintf(stderr, "%s", resp->data); @@ -343,391 +462,3 @@ ch_http_response_free(ch_http_response_t * resp) free(resp); } - -/* - * ============================================================ - * Streaming HTTP API - * - * Uses minicoro to run curl_easy_perform inside a coroutine. - * The write callback counts complete TSV rows; when a batch is - * ready it yields back to the caller. Resuming the coroutine - * continues the transfer. - * ============================================================ - */ - -#include "minicoro.h" - -struct ch_http_streaming_state -{ - mco_coro *co; - ch_http_connection_t *conn; - - /* Owned resources that need cleanup */ - char *url; - struct curl_slist *headers; - curl_mime *form; - - bool transfer_done; /* coroutine has finished */ - - /* Rolling buffer: raw bytes from curl */ - char *buf; - size_t buf_alloc; - size_t buf_len; /* total bytes in buf */ - size_t batch_end; /* offset past last complete row in batch */ - int rows_ready; /* complete rows accumulated */ - int fetch_size; - - /* Error / status */ - long http_status; - char errbuffer[CURL_ERROR_SIZE]; - char *error; - double pretransfer_time; - double total_time; - char query_id[37]; - - /* Cumulative bytes received (for max_result_size check) */ - size_t total_bytes; -}; - -/* - * Streaming write callback. Appends data to the rolling buffer and counts - * complete TSV rows (terminated by '\n'). When fetch_size rows have been - * accumulated, yields the coroutine back to the caller. - */ -static size_t -write_data_streaming(void *contents, size_t size, size_t nmemb, void *userp) -{ - size_t realsize = size * nmemb; - ch_http_streaming_state *st = userp; - - /* Grow buffer if needed */ - if (st->buf_len + realsize + 1 > st->buf_alloc) - { - size_t newsize = st->buf_alloc * 2; - - if (newsize < st->buf_len + realsize + 1) - newsize = st->buf_len + realsize + 1; - - char *newbuf = realloc(st->buf, newsize); - - if (!newbuf) - return 0; /* signal error to curl */ - st->buf = newbuf; - st->buf_alloc = newsize; - } - - memcpy(st->buf + st->buf_len, contents, realsize); - st->buf_len += realsize; - st->buf[st->buf_len] = '\0'; - - /* Count new complete rows */ - for (size_t i = st->buf_len - realsize; i < st->buf_len; i++) - { - if (st->buf[i] == '\n') - { - st->rows_ready++; - st->batch_end = i + 1; - - if (st->rows_ready >= st->fetch_size) - { - /* Batch is ready — yield to consumer */ - mco_yield(st->co); - - /* - * Resumed: the consumer has consumed the batch and compacted - * the buffer. Continue receiving. - */ - return realsize; - } - } - } - - return realsize; -} - -/* - * Coroutine function: runs the entire curl_easy_perform, yielding - * whenever a batch is ready. - */ -static void -http_streaming_coro(mco_coro * co) -{ - ch_http_streaming_state *st = mco_get_user_data(co); - CURLcode errcode; - - errcode = curl_easy_perform(st->conn->curl); - - if (errcode != CURLE_OK && errcode != CURLE_WRITE_ERROR) - { - if (!st->error) - st->error = strdup(st->errbuffer[0] ? - st->errbuffer : curl_easy_strerror(errcode)); - } - else if (errcode == CURLE_ABORTED_BY_CALLBACK) - { - st->http_status = 418; - } - - curl_easy_getinfo(st->conn->curl, CURLINFO_PRETRANSFER_TIME, - &st->pretransfer_time); - curl_easy_getinfo(st->conn->curl, CURLINFO_TOTAL_TIME, - &st->total_time); - curl_easy_getinfo(st->conn->curl, CURLINFO_RESPONSE_CODE, - &st->http_status); - - /* Mark any remaining partial data as the final batch */ - if (st->buf_len > st->batch_end) - st->batch_end = st->buf_len; - - st->transfer_done = true; - - /* The coroutine returns here; mco_status becomes MCO_DEAD. */ -} - -ch_http_streaming_state * -ch_http_begin_streaming(ch_http_connection_t * conn, const ch_query * query, - int fetch_size) -{ - CURLU *cu; - kv_iter iter; - char *buf = NULL; - int i; - mco_desc desc; - mco_result res; - - ch_http_streaming_state *st = calloc(sizeof(ch_http_streaming_state), 1); - - if (!st) - return NULL; - - st->conn = conn; - st->fetch_size = fetch_size; - - /* Initial buffer: 64 KB */ - st->buf_alloc = 64 * 1024; - st->buf = malloc(st->buf_alloc); - if (!st->buf) - { - free(st); - return NULL; - } - - /* Generate query ID */ - { - uuid_t id; - - uuid_generate(id); - uuid_unparse(id, st->query_id); - } - - assert(conn && conn->curl); - - /* Build URL (same logic as ch_http_simple_query) */ - cu = curl_url(); - curl_url_set(cu, CURLUPART_URL, conn->base_url, 0); - buf = psprintf("query_id=%s", st->query_id); - curl_url_set(cu, CURLUPART_QUERY, buf, CURLU_APPENDQUERY | CURLU_URLENCODE); - pfree(buf); - - for (iter = new_kv_iter(query->settings); !kv_iter_done(&iter); kv_iter_next(&iter)) - { - /* Skip settings that would break parsing and type conversion. */ - if ( - strcmp(iter.name, "date_time_output_format") == 0 || - strcmp(iter.name, "format_tsv_null_representation") == 0 || - strcmp(iter.name, "output_format_tsv_crlf_end_of_line") == 0 - ) - continue; - buf = psprintf("%s=%s", iter.name, iter.value); - curl_url_set(cu, CURLUPART_QUERY, buf, CURLU_APPENDQUERY | CURLU_URLENCODE); - pfree(buf); - } - - /* Always use ISO date format, \N for NULL, \n for EOL. */ - curl_url_set(cu, CURLUPART_QUERY, "date_time_output_format=iso", CURLU_APPENDQUERY | CURLU_URLENCODE); - curl_url_set(cu, CURLUPART_QUERY, "format_tsv_null_representation=\\N", CURLU_APPENDQUERY | CURLU_URLENCODE); - curl_url_set(cu, CURLUPART_QUERY, "output_format_tsv_crlf_end_of_line=0", CURLU_APPENDQUERY | CURLU_URLENCODE); - curl_url_get(cu, CURLUPART_URL, &st->url, 0); - curl_url_cleanup(cu); - - /* Configure easy handle */ - st->errbuffer[0] = '\0'; - curl_easy_reset(conn->curl); - curl_easy_setopt(conn->curl, CURLOPT_WRITEFUNCTION, write_data_streaming); - curl_easy_setopt(conn->curl, CURLOPT_WRITEDATA, st); - curl_easy_setopt(conn->curl, CURLOPT_ERRORBUFFER, st->errbuffer); - curl_easy_setopt(conn->curl, CURLOPT_PATH_AS_IS, 1L); - curl_easy_setopt(conn->curl, CURLOPT_URL, st->url); - curl_easy_setopt(conn->curl, CURLOPT_NOSIGNAL, 1L); - curl_easy_setopt(conn->curl, CURLOPT_VERBOSE, curl_verbose); - curl_easy_setopt(conn->curl, CURLOPT_NOPROGRESS, 1L); - - if (conn->dbname) - { - st->headers = curl_slist_append(NULL, - psprintf("%s: %s", DATABASE_HEADER, conn->dbname)); - curl_easy_setopt(conn->curl, CURLOPT_HTTPHEADER, st->headers); - } - - /* Set POST data */ - if (query->num_params == 0) - { - curl_easy_setopt(conn->curl, CURLOPT_POSTFIELDS, query->sql); - } - else - { - curl_mimepart *part; - - st->form = curl_mime_init(conn->curl); - part = curl_mime_addpart(st->form); - curl_mime_name(part, "query"); - curl_mime_data(part, query->sql, CURL_ZERO_TERMINATED); - for (i = 0; i < query->num_params; i++) - { - part = curl_mime_addpart(st->form); - curl_mime_name(part, psprintf("param_p%d", i + 1)); - curl_mime_data(part, query->param_values[i], CURL_ZERO_TERMINATED); - } - curl_easy_setopt(conn->curl, CURLOPT_MIMEPOST, st->form); - } - - /* Create coroutine */ - desc = mco_desc_init(http_streaming_coro, 1024 * 1024); - desc.user_data = st; - res = mco_create(&st->co, &desc); - if (res != MCO_SUCCESS) - { - st->error = strdup(mco_result_description(res)); - st->transfer_done = true; - return st; - } - - /* Resume to start the transfer and get the first batch */ - mco_resume(st->co); - - return st; -} - -/* - * Fetch the next batch of rows from the stream. - * - * Returns true if a batch is available (access via ch_http_streaming_batch_data). - * Returns false when there is no more data. - */ -bool -ch_http_fetch_batch(ch_http_streaming_state * st) -{ - size_t consumed; - size_t remaining; - - if (!st) - return false; - - /* - * Consume the previous batch: compact the buffer. - */ - consumed = st->batch_end; - st->total_bytes += consumed; - - remaining = st->buf_len - consumed; - if (remaining > 0) - memmove(st->buf, st->buf + consumed, remaining); - st->buf_len = remaining; - st->buf[st->buf_len] = '\0'; - st->batch_end = 0; - st->rows_ready = 0; - - /* If the coroutine is already done, return whatever remains */ - if (st->transfer_done || mco_status(st->co) == MCO_DEAD) - { - st->transfer_done = true; - if (st->buf_len > 0) - { - st->batch_end = st->buf_len; - return true; - } - return false; - } - - /* Resume the coroutine to continue receiving data */ - mco_resume(st->co); - - /* Check if we got a batch or reached EOF */ - if (st->batch_end > 0) - return true; - - if (st->transfer_done && st->buf_len > 0) - { - st->batch_end = st->buf_len; - return true; - } - - return false; -} - -char * -ch_http_streaming_error(ch_http_streaming_state * st) -{ - return st ? st->error : NULL; -} - -long -ch_http_streaming_status(ch_http_streaming_state * st) -{ - return st ? st->http_status : 0; -} - -double -ch_http_streaming_request_time(ch_http_streaming_state * st) -{ - return st ? st->pretransfer_time : 0; -} - -double -ch_http_streaming_total_time(ch_http_streaming_state * st) -{ - return st ? st->total_time : 0; -} - -char * -ch_http_streaming_batch_data(ch_http_streaming_state * st) -{ - return st ? st->buf : NULL; -} - -size_t -ch_http_streaming_batch_size(ch_http_streaming_state * st) -{ - return st ? st->batch_end : 0; -} - -void -ch_http_end_streaming(ch_http_streaming_state * st) -{ - if (!st) - return; - - /* - * Destroy the coroutine without resuming. This abandons curl_easy_perform - * mid-transfer, so reset the easy handle afterwards to ensure curl cleans - * up its internal state. - */ - if (st->co) - mco_destroy(st->co); - - curl_easy_reset(st->conn->curl); - - /* Clean up owned resources */ - if (st->url) - curl_free(st->url); - if (st->headers) - curl_slist_free_all(st->headers); - if (st->form) - curl_mime_free(st->form); - if (st->buf) - free(st->buf); - if (st->error) - free(st->error); - - free(st); -} diff --git a/src/http_streaming.c b/src/http_streaming.c new file mode 100644 index 00000000..91c89689 --- /dev/null +++ b/src/http_streaming.c @@ -0,0 +1,341 @@ +#include "postgres.h" + +#include "http_internal.h" + +#include "minicoro.h" + +struct ch_http_streaming_state +{ + mco_coro *co; + ch_http_connection_t *conn; + + /* Owned resources that need cleanup */ + char *url; + struct curl_slist *headers; + curl_mime *form; + + bool transfer_done; /* coroutine has finished */ + + /* Rolling buffer: raw bytes from curl */ + char *buf; + size_t buf_alloc; + size_t buf_len; /* total bytes in buf */ + size_t batch_end; /* offset past last complete row in batch */ + int rows_ready; /* complete rows accumulated */ + int fetch_size; + + /* Error / status */ + long http_status; + char errbuffer[CURL_ERROR_SIZE]; + char *error; + bool error_owned; + double pretransfer_time; + double total_time; + char query_id[37]; +}; + +static const char http_streaming_oom_error[] = "out of memory"; +static const char http_streaming_write_error[] = "could not buffer response"; + +static void +http_streaming_set_static_error(ch_http_streaming_state * st, const char *error) +{ + if (st->error) + return; + + st->error = (char *) error; + st->error_owned = false; +} + +static void +http_streaming_set_owned_error(ch_http_streaming_state * st, const char *error) +{ + char *copy; + + if (st->error) + return; + + if (!error) + { + http_streaming_set_static_error(st, http_streaming_write_error); + return; + } + + copy = strdup(error); + if (!copy) + { + http_streaming_set_static_error(st, http_streaming_oom_error); + return; + } + + st->error = copy; + st->error_owned = true; +} + +static bool +http_streaming_resume(ch_http_streaming_state * st) +{ + mco_result res; + + res = mco_resume(st->co); + if (res == MCO_SUCCESS) + return true; + + http_streaming_set_owned_error(st, mco_result_description(res)); + st->transfer_done = true; + return false; +} + +/* + * Streaming write callback. Appends data to the rolling buffer and counts + * complete TSV rows (terminated by '\n'). When fetch_size rows have been + * accumulated, yields the coroutine back to the caller. + */ +static size_t +write_data_streaming(void *contents, size_t size, size_t nmemb, void *userp) +{ + size_t realsize = size * nmemb; + ch_http_streaming_state *st = userp; + + if (st->buf_len + realsize + 1 > st->buf_alloc) + { + size_t newsize = st->buf_alloc * 2; + char *newbuf; + + if (newsize < st->buf_len + realsize + 1) + newsize = st->buf_len + realsize + 1; + + newbuf = realloc(st->buf, newsize); + if (!newbuf) + { + http_streaming_set_static_error(st, http_streaming_oom_error); + return CURL_WRITEFUNC_ERROR; + } + + st->buf = newbuf; + st->buf_alloc = newsize; + } + + memcpy(st->buf + st->buf_len, contents, realsize); + st->buf_len += realsize; + st->buf[st->buf_len] = '\0'; + + for (size_t i = st->buf_len - realsize; i < st->buf_len; i++) + { + if (st->buf[i] != '\n') + continue; + + st->rows_ready++; + st->batch_end = i + 1; + if (st->rows_ready < st->fetch_size) + continue; + + mco_yield(st->co); + return realsize; + } + + return realsize; +} + +/* + * Coroutine function: runs the entire curl_easy_perform, yielding whenever a + * batch is ready. + */ +static void +http_streaming_coro(mco_coro * co) +{ + ch_http_streaming_state *st = mco_get_user_data(co); + CURLcode errcode; + + errcode = curl_easy_perform(st->conn->curl); + if (errcode == CURLE_ABORTED_BY_CALLBACK) + st->http_status = CH_HTTP_CANCELED_STATUS; + else if (errcode == CURLE_WRITE_ERROR) + { + if (!st->error) + http_streaming_set_static_error(st, http_streaming_write_error); + } + else if (errcode != CURLE_OK) + { + http_streaming_set_owned_error(st, + st->errbuffer[0] ? + st->errbuffer : + curl_easy_strerror(errcode)); + } + + ch_http_get_transfer_info(st->conn, &st->pretransfer_time, + &st->total_time, &st->http_status); + + if (st->buf_len > st->batch_end) + st->batch_end = st->buf_len; + + st->transfer_done = true; +} + +ch_http_streaming_state * +ch_http_begin_streaming(ch_http_connection_t * conn, const ch_query * query, + int fetch_size) +{ + ch_http_streaming_state *st; + mco_desc desc; + mco_result res; + + st = calloc(sizeof(ch_http_streaming_state), 1); + if (!st) + return NULL; + + st->conn = conn; + st->fetch_size = fetch_size > 0 ? fetch_size : 1; + st->buf_alloc = CH_HTTP_STREAM_BUFFER_SIZE; + st->buf = malloc(st->buf_alloc); + if (!st->buf) + { + free(st); + return NULL; + } + + ch_http_generate_query_id(st->query_id); + if (!ch_http_build_url(conn, query, st->query_id, &st->url)) + { + ch_http_end_streaming(st); + return NULL; + } + + ch_http_set_common_options(conn, st->url, st->errbuffer, + write_data_streaming, st); + ch_http_set_progress_options(conn, conn); + if (!ch_http_build_headers(conn, &st->headers)) + { + ch_http_end_streaming(st); + return NULL; + } + if (st->headers) + curl_easy_setopt(conn->curl, CURLOPT_HTTPHEADER, st->headers); + if (!ch_http_set_post_data(conn->curl, query, &st->form)) + { + ch_http_end_streaming(st); + return NULL; + } + + desc = mco_desc_init(http_streaming_coro, CH_MINICORO_STACK_SIZE); + desc.user_data = st; + res = mco_create(&st->co, &desc); + if (res != MCO_SUCCESS) + { + http_streaming_set_owned_error(st, mco_result_description(res)); + st->transfer_done = true; + return st; + } + + if (!http_streaming_resume(st)) + return st; + + return st; +} + +bool +ch_http_fetch_batch(ch_http_streaming_state * st) +{ + size_t consumed; + size_t remaining; + + if (!st) + return false; + + consumed = st->batch_end; + remaining = st->buf_len - consumed; + if (remaining > 0) + memmove(st->buf, st->buf + consumed, remaining); + st->buf_len = remaining; + st->buf[st->buf_len] = '\0'; + st->batch_end = 0; + st->rows_ready = 0; + + if (st->transfer_done || mco_status(st->co) == MCO_DEAD) + { + st->transfer_done = true; + if (st->buf_len > 0) + { + st->batch_end = st->buf_len; + return true; + } + return false; + } + + if (!http_streaming_resume(st)) + return false; + if (st->batch_end > 0) + return true; + if (st->transfer_done && st->buf_len > 0) + { + st->batch_end = st->buf_len; + return true; + } + + return false; +} + +char * +ch_http_streaming_error(ch_http_streaming_state * st) +{ + return st ? st->error : NULL; +} + +long +ch_http_streaming_status(ch_http_streaming_state * st) +{ + return st ? st->http_status : 0; +} + +double +ch_http_streaming_request_time(ch_http_streaming_state * st) +{ + return st ? st->pretransfer_time : 0; +} + +double +ch_http_streaming_total_time(ch_http_streaming_state * st) +{ + return st ? st->total_time : 0; +} + +char * +ch_http_streaming_batch_data(ch_http_streaming_state * st) +{ + return st ? st->buf : NULL; +} + +void +ch_http_streaming_init_read_state(ch_http_streaming_state * st, + ch_http_read_state * read_state) +{ + if (!st || !read_state) + return; + + ch_http_read_state_init(read_state, st->buf, st->batch_end); +} + +void +ch_http_end_streaming(ch_http_streaming_state * st) +{ + if (!st) + return; + + if (st->co) + mco_destroy(st->co); + + curl_easy_reset(st->conn->curl); + + if (st->url) + curl_free(st->url); + if (st->headers) + curl_slist_free_all(st->headers); + if (st->form) + curl_mime_free(st->form); + if (st->buf) + free(st->buf); + if (st->error && st->error_owned) + free(st->error); + + free(st); +} diff --git a/src/include/binary_internal.hh b/src/include/binary_internal.hh new file mode 100644 index 00000000..b7f73fdf --- /dev/null +++ b/src/include/binary_internal.hh @@ -0,0 +1,18 @@ +#ifndef CLICKHOUSE_BINARY_INTERNAL_HH +#define CLICKHOUSE_BINARY_INTERNAL_HH + +#ifdef __cplusplus +extern "C" +{ +#endif + +clickhouse::QuerySettings ch_binary_settings(const ch_query * query); +clickhouse::QueryParams ch_binary_params(const ch_query * query); +Datum ch_binary_make_datum(clickhouse::ColumnRef col, size_t row, + Oid * valtype, bool * is_null); + +#ifdef __cplusplus +} +#endif + +#endif /* CLICKHOUSE_BINARY_INTERNAL_HH */ diff --git a/src/include/fdw.h b/src/include/fdw.h index 4ce4abd8..afccc93a 100644 --- a/src/include/fdw.h +++ b/src/include/fdw.h @@ -236,7 +236,6 @@ extern void chfdw_report_error(int elevel, ch_connection conn, /* in option.c */ extern kv_list * chfdw_get_session_settings(void); -extern int chfdw_get_max_result_size(void); extern void chfdw_extract_options(List * defelems, char **driver, char **host, int *port, diff --git a/src/include/http.h b/src/include/http.h index fd91a47f..82574c99 100644 --- a/src/include/http.h +++ b/src/include/http.h @@ -65,7 +65,8 @@ long ch_http_streaming_status(ch_http_streaming_state * state); double ch_http_streaming_request_time(ch_http_streaming_state * state); double ch_http_streaming_total_time(ch_http_streaming_state * state); char *ch_http_streaming_batch_data(ch_http_streaming_state * state); -size_t ch_http_streaming_batch_size(ch_http_streaming_state * state); +void ch_http_streaming_init_read_state(ch_http_streaming_state * state, + ch_http_read_state * read_state); void ch_http_end_streaming(ch_http_streaming_state * state); #endif /* CLICKHOUSE_HTTP_H */ diff --git a/src/include/http_internal.h b/src/include/http_internal.h new file mode 100644 index 00000000..410d6a4b --- /dev/null +++ b/src/include/http_internal.h @@ -0,0 +1,33 @@ +#ifndef CLICKHOUSE_HTTP_INTERNAL_H +#define CLICKHOUSE_HTTP_INTERNAL_H + +#include "http.h" +#include "internal.h" + +#define CH_HTTP_CANCELED_STATUS 418 +#define CH_HTTP_COMM_ERROR_STATUS 419 +#define CH_HTTP_STREAM_BUFFER_SIZE (64 * 1024) +#define CH_MINICORO_STACK_SIZE (1024 * 1024) + +typedef size_t (*ch_http_write_func) (void *contents, size_t size, + size_t nmemb, void *userp); + +void ch_http_generate_query_id(char query_id[37]); +bool ch_http_build_url(ch_http_connection_t * conn, const ch_query * query, + const char *query_id, char **url); +bool ch_http_build_headers(ch_http_connection_t * conn, + struct curl_slist **headers); +bool ch_http_set_post_data(CURL *curl, const ch_query * query, + curl_mime **form); +void ch_http_set_common_options(ch_http_connection_t * conn, const char *url, + char *errbuffer, + ch_http_write_func write_func, + void *write_data); +void ch_http_set_progress_options(ch_http_connection_t * conn, + void *progress_data); +void ch_http_get_transfer_info(ch_http_connection_t * conn, + double *pretransfer_time, + double *total_time, + long *http_status); + +#endif /* CLICKHOUSE_HTTP_INTERNAL_H */ diff --git a/src/option.c b/src/option.c index 2c28bec8..35113bc8 100644 --- a/src/option.c +++ b/src/option.c @@ -70,7 +70,6 @@ static const ChFdwOption ch_options[] = */ static char *ch_session_settings = NULL; static kv_list * ch_session_settings_list = NULL; -static int ch_max_result_size = 0; /* MB, 0 = disabled */ /* * Helper functions @@ -564,25 +563,7 @@ _PG_init(void) chfdw_settings_assign_hook, NULL); - DefineCustomIntVariable("pg_clickhouse.max_result_size", - "Maximum size (MB) for a ClickHouse response.", - "Aborts a query if the accumulated response data exceeds " - "this limit. 0 disables the check.", - &ch_max_result_size, - 0, /* default: disabled */ - 0, /* min */ - 65536, /* max: 64 GB */ - PGC_USERSET, - GUC_UNIT_MB, - NULL, NULL, NULL); - #if PG_VERSION_NUM >= 150000 MarkGUCPrefixReserved("pg_clickhouse"); #endif } - -int -chfdw_get_max_result_size(void) -{ - return ch_max_result_size; -} diff --git a/src/pglink.c b/src/pglink.c index 553fccf0..c53e3567 100644 --- a/src/pglink.c +++ b/src/pglink.c @@ -188,9 +188,113 @@ kill_query(void *conn, const char *query_id) ch_http_response_free(resp); } +static void +report_oom(void) +{ + ereport(ERROR, + (errcode(ERRCODE_FDW_OUT_OF_MEMORY), + errmsg("out of memory"))); +} + +static void +report_http_status_error(long status, const char *query_sql, char *error) +{ + ereport(ERROR, + (errcode(ERRCODE_SQL_ROUTINE_EXCEPTION), + errmsg("pg_clickhouse: %s", format_error(error)), + status < 404 ? 0 : errdetail_internal("Remote Query: %.64000s", query_sql), + errcontext("HTTP status code: %li", status))); +} + +static void +report_http_streaming_state(ch_http_streaming_state * sstate, + const char *query_sql, + bool cleanup) +{ + char *err = ch_http_streaming_error(sstate); + long status = ch_http_streaming_status(sstate); + + if (err != NULL) + { + char *errcopy = pstrdup(err); + + if (cleanup) + ch_http_end_streaming(sstate); + ereport(ERROR, + (errcode(ERRCODE_SQL_ROUTINE_EXCEPTION), + errmsg("pg_clickhouse: %s", format_error(errcopy)))); + } + + if (status != 0 && status != 200) + { + char *data = ch_http_streaming_batch_data(sstate); + char *errcopy = data ? pstrdup(data) : pstrdup("unknown error"); + + if (cleanup) + ch_http_end_streaming(sstate); + report_http_status_error(status, query_sql, errcopy); + } +} + +static bool +http_streaming_load_batch(ch_cursor * cursor, ch_http_read_state * state) +{ + ch_http_streaming_state *sstate = cursor->streaming_state; + + if (!ch_http_fetch_batch(sstate)) + { + report_http_streaming_state(sstate, cursor->query, false); + return false; + } + + report_http_streaming_state(sstate, cursor->query, false); + ch_http_streaming_init_read_state(sstate, state); + return !(state->done || state->data == NULL); +} + +static void +report_binary_query_error(const char *query_sql, char *error) +{ + ereport(ERROR, + (errcode(ERRCODE_SQL_ROUTINE_EXCEPTION), + errmsg("pg_clickhouse: %s", error), + errdetail_internal("Remote Query: %.64000s", query_sql))); +} + +static void +report_binary_streaming_state(ch_binary_streaming_state * sstate, + const char *query_sql, + bool cleanup) +{ + char *err = ch_binary_streaming_error(sstate); + char *errcopy; + + if (err == NULL) + return; + + errcopy = pstrdup(err); + if (cleanup) + ch_binary_end_streaming(sstate); + report_binary_query_error(query_sql, errcopy); +} + +static void +report_binary_streaming_read_error(ch_binary_streaming_state * sstate) +{ + char *err = ch_binary_streaming_error(sstate); + + if (err == NULL) + return; + + ereport(ERROR, + (errcode(ERRCODE_SQL_ROUTINE_EXCEPTION), + errmsg("pg_clickhouse: error while reading row: %s", err))); +} + /* * Allocate a cursor in its own memory context with a cleanup callback. - * Used by both HTTP and binary query paths. + * The memory context owns the cursor itself; the callback only releases the + * external resources hanging off it. */ static ch_cursor * cursor_create(const char *sql, size_t columns_count, @@ -231,7 +335,7 @@ http_simple_query(void *conn, const ch_query * query) again: resp = ch_http_simple_query(conn, query); if (resp == NULL) - elog(ERROR, "out of memory"); + report_oom(); attempts++; if (resp->http_status == 419) @@ -263,12 +367,7 @@ http_simple_query(void *conn, const ch_query * query) ch_http_response_free(resp); - ereport(ERROR, ( - errcode(ERRCODE_SQL_ROUTINE_EXCEPTION), - errmsg("pg_clickhouse: %s", format_error(error)), - status < 404 ? 0 : errdetail_internal("Remote Query: %.64000s", query->sql), - errcontext("HTTP status code: %li", status) - )); + report_http_status_error(status, query->sql, error); } cursor = cursor_create(query->sql, 0, http_cursor_free); @@ -310,39 +409,9 @@ http_streaming_query(void *conn, const ch_query * query, int fetch_size) sstate = ch_http_begin_streaming(conn, query, fetch_size); if (sstate == NULL) - elog(ERROR, "out of memory"); - - { - char *err = ch_http_streaming_error(sstate); - - if (err != NULL) - { - char *errcopy = pstrdup(err); - - ch_http_end_streaming(sstate); - ereport(ERROR, - (errcode(ERRCODE_SQL_ROUTINE_EXCEPTION), - errmsg("pg_clickhouse: %s", format_error(errcopy)))); - } - } - - { - long status = ch_http_streaming_status(sstate); - - if (status != 0 && status != 200) - { - char *data = ch_http_streaming_batch_data(sstate); - char *errcopy = data ? pstrdup(data) : pstrdup("unknown error"); + report_oom(); - ch_http_end_streaming(sstate); - ereport(ERROR, ( - errcode(ERRCODE_SQL_ROUTINE_EXCEPTION), - errmsg("pg_clickhouse: %s", format_error(errcopy)), - status < 404 ? 0 : errdetail_internal("Remote Query: %.64000s", query->sql), - errcontext("HTTP status code: %li", status) - )); - } - } + report_http_streaming_state(sstate, query->sql, true); cursor = cursor_create(query->sql, 0, http_streaming_cursor_free); cursor->streaming_state = sstate; @@ -354,9 +423,7 @@ http_streaming_query(void *conn, const ch_query * query, int fetch_size) MemoryContext oldcxt = MemoryContextSwitchTo(cursor->memcxt); cursor->read_state = palloc0(sizeof(ch_http_read_state)); - ch_http_read_state_init(cursor->read_state, - ch_http_streaming_batch_data(sstate), - ch_http_streaming_batch_size(sstate)); + ch_http_streaming_init_read_state(sstate, cursor->read_state); MemoryContextSwitchTo(oldcxt); } @@ -386,13 +453,7 @@ http_simple_insert(void *conn, const ch_query * query) long status = resp->http_status; ch_http_response_free(resp); - - ereport(ERROR, ( - errcode(ERRCODE_SQL_ROUTINE_EXCEPTION), - errmsg("pg_clickhouse: %s", format_error(error)), - status < 404 ? 0 : errdetail_internal("Remote Query: %.64000s", query->sql), - errcontext("HTTP status code: %li", status) - )); + report_http_status_error(status, query->sql, error); } ch_http_response_free(resp); @@ -428,19 +489,12 @@ http_fetch_row(ChFdwScanRowContext * ctx) Datum *values; ch_http_read_state *state = cursor->read_state; - if (state->done || state->data == NULL) + while (state->done || state->data == NULL) { if (!cursor->is_streaming) return NULL; - if (!ch_http_fetch_batch(cursor->streaming_state)) - return NULL; - - ch_http_read_state_init(state, - ch_http_streaming_batch_data(cursor->streaming_state), - ch_http_streaming_batch_size(cursor->streaming_state)); - - if (state->done || state->data == NULL) + if (!http_streaming_load_batch(cursor, state)) return NULL; } @@ -764,11 +818,7 @@ binary_simple_query(void *conn, const ch_query * query) char *error = pstrdup(resp->error); ch_binary_response_free(resp); - ereport(ERROR, ( - errcode(ERRCODE_SQL_ROUTINE_EXCEPTION), - errmsg("pg_clickhouse: %s", error), - errdetail_internal("Remote Query: %.64000s", query->sql) - )); + report_binary_query_error(query->sql, error); } cursor = cursor_create(query->sql, resp->columns_count, @@ -813,25 +863,12 @@ binary_streaming_query(void *conn, const ch_query * query, int fetch_size) { ch_binary_streaming_state *sstate; + (void) fetch_size; sstate = ch_binary_begin_streaming(conn, query, &is_canceled); if (sstate == NULL) - elog(ERROR, "out of memory"); + report_oom(); - { - char *err = ch_binary_streaming_error(sstate); - - if (err != NULL) - { - char *errcopy = pstrdup(err); - - ch_binary_end_streaming(sstate); - ereport(ERROR, ( - errcode(ERRCODE_SQL_ROUTINE_EXCEPTION), - errmsg("pg_clickhouse: %s", errcopy), - errdetail_internal("Remote Query: %.64000s", query->sql) - )); - } - } + report_binary_streaming_state(sstate, query->sql, true); { ch_cursor *cursor; @@ -909,15 +946,13 @@ binary_streaming_fetch_row(ChFdwScanRowContext * ctx) /* Try to read a row; if block exhausted, fetch next block. */ while (!ch_binary_streaming_read_row(sstate)) { - char *err = ch_binary_streaming_error(sstate); - - if (err != NULL) - ereport(ERROR, - (errcode(ERRCODE_SQL_ROUTINE_EXCEPTION), - errmsg("pg_clickhouse: error while reading row: %s", err))); + report_binary_streaming_read_error(sstate); if (!ch_binary_fetch_block(sstate)) - return NULL; /* no more blocks — EOF */ + { + report_binary_streaming_state(sstate, cursor->query, false); + return NULL; + } } if (tupdesc) From 38b075956e34991fcb269955b0189cc5a7587f70 Mon Sep 17 00:00:00 2001 From: Kaushik Iska Date: Wed, 1 Apr 2026 15:09:22 -0500 Subject: [PATCH 11/13] Make binary streaming errors const-correct The streaming state stored string literals and strdup() copies in the\nsame mutable pointer, then tracked ownership with a boolean flag.\nThat worked, but it forced a const_cast and obscured the actual\nlifetime of the error message.\n\nSplit the state into a borrowed error pointer plus an owned buffer,\nfree only the owned allocation, and make the streaming error accessors\nand reporting paths const-correct. This keeps the OOM fallback behavior\nwhile making the ownership model obvious in the code. --- src/binary_streaming.cpp | 26 ++++++++++++-------------- src/include/binary.hh | 2 +- src/pglink.c | 6 +++--- 3 files changed, 16 insertions(+), 18 deletions(-) diff --git a/src/binary_streaming.cpp b/src/binary_streaming.cpp index acfe99ce..da810375 100644 --- a/src/binary_streaming.cpp +++ b/src/binary_streaming.cpp @@ -39,8 +39,8 @@ struct ch_binary_streaming_state std::unique_ptr nulls; size_t columns_count = 0; - char *error = nullptr; - bool error_owned = false; + const char *error = nullptr; + char *owned_error = nullptr; bool (*check_cancel) (void) = nullptr; Client *client = nullptr; @@ -50,17 +50,15 @@ struct ch_binary_streaming_state ~ch_binary_streaming_state() { - if (error && error_owned) - free(error); + free(owned_error); } void - SetStaticError(const char *message) + SetBorrowedError(const char *message) { if (error) return; - error = const_cast(message); - error_owned = false; + error = message; } void @@ -72,19 +70,19 @@ struct ch_binary_streaming_state return; if (!message) { - SetStaticError(kBinaryStreamingOom); + SetBorrowedError(kBinaryStreamingOom); return; } copy = strdup(message); if (!copy) { - SetStaticError(kBinaryStreamingOom); + SetBorrowedError(kBinaryStreamingOom); return; } - error = copy; - error_owned = true; + owned_error = copy; + error = owned_error; } }; @@ -122,7 +120,7 @@ binary_streaming_coro(mco_coro * co) { if (st->check_cancel && st->check_cancel()) { - st->SetStaticError(kBinaryStreamingCanceled); + st->SetBorrowedError(kBinaryStreamingCanceled); return false; } @@ -225,7 +223,7 @@ extern "C" st->nulls.reset(new (std::nothrow) bool[st->columns_count]); if (!st->coltypes || !st->values || !st->nulls) { - st->SetStaticError(kBinaryStreamingOom); + st->SetBorrowedError(kBinaryStreamingOom); return false; } } @@ -270,7 +268,7 @@ extern "C" return st->values[col]; } - char * + const char * ch_binary_streaming_error(ch_binary_streaming_state * st) { return st ? st->error : NULL; diff --git a/src/include/binary.hh b/src/include/binary.hh index de91ff79..8a21178d 100644 --- a/src/include/binary.hh +++ b/src/include/binary.hh @@ -85,7 +85,7 @@ extern "C" extern size_t ch_binary_streaming_columns(ch_binary_streaming_state * state); extern Datum ch_binary_streaming_value(ch_binary_streaming_state * state, size_t col, Oid * valtype, bool *is_null); - extern char *ch_binary_streaming_error(ch_binary_streaming_state * state); + extern const char *ch_binary_streaming_error(ch_binary_streaming_state * state); extern void ch_binary_end_streaming(ch_binary_streaming_state * state); /* reading */ diff --git a/src/pglink.c b/src/pglink.c index c53e3567..ce4489a3 100644 --- a/src/pglink.c +++ b/src/pglink.c @@ -253,7 +253,7 @@ http_streaming_load_batch(ch_cursor * cursor, ch_http_read_state * state) } static void -report_binary_query_error(const char *query_sql, char *error) +report_binary_query_error(const char *query_sql, const char *error) { ereport(ERROR, (errcode(ERRCODE_SQL_ROUTINE_EXCEPTION), @@ -266,7 +266,7 @@ report_binary_streaming_state(ch_binary_streaming_state * sstate, const char *query_sql, bool cleanup) { - char *err = ch_binary_streaming_error(sstate); + const char *err = ch_binary_streaming_error(sstate); char *errcopy; if (err == NULL) @@ -281,7 +281,7 @@ report_binary_streaming_state(ch_binary_streaming_state * sstate, static void report_binary_streaming_read_error(ch_binary_streaming_state * sstate) { - char *err = ch_binary_streaming_error(sstate); + const char *err = ch_binary_streaming_error(sstate); if (err == NULL) return; From 58893632c30a36e16827b16f5e8f278c664a7ee6 Mon Sep 17 00:00:00 2001 From: Kaushik Iska Date: Wed, 1 Apr 2026 20:38:59 -0500 Subject: [PATCH 12/13] Stream binary scans without coroutines The binary driver can now consume result blocks through the pull-based streaming select API added in clickhouse-cpp, so we no longer need the minicoro adapter there. This keeps the FDW-facing streaming code in a block-at-a-time form while avoiding the control-flow bridge that the callback-only client API required. Switch the binary streaming state over to BeginSelect, ReceiveSelectBlock, and EndSelect, close the select session promptly at EOF so later queries can reuse the same connection, and restrict FDW streaming to the binary driver. Leave the HTTP protocol on its buffered path for now, and update the docs to make that distinction explicit. Point the clickhouse-cpp submodule at the corresponding branch in the iskakaushik fork so this PR references a fetchable commit with the new client API. --- .gitmodules | 2 +- doc/pg_clickhouse.md | 10 +-- src/binary_streaming.cpp | 147 +++++++++++++++++++-------------------- src/include/binary.hh | 2 +- src/pglink.c | 112 ++--------------------------- vendor/clickhouse-cpp | 2 +- 6 files changed, 89 insertions(+), 186 deletions(-) diff --git a/.gitmodules b/.gitmodules index b8157233..2a9614cc 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,6 @@ [submodule "clickhouse-cpp"] path = vendor/clickhouse-cpp - url = https://github.com/ClickHouse/clickhouse-cpp.git + url = git@github.com:iskakaushik/clickhouse-cpp.git [submodule "vendor/minicoro"] path = vendor/minicoro url = https://github.com/edubart/minicoro.git diff --git a/doc/pg_clickhouse.md b/doc/pg_clickhouse.md index d6a6f352..48084613 100644 --- a/doc/pg_clickhouse.md +++ b/doc/pg_clickhouse.md @@ -140,9 +140,10 @@ The supported options are: "default". * `host`: The host name of the ClickHouse server. Defaults to "localhost"; * `fetch_size`: The number of remote rows to buffer per streamed fetch while - scanning foreign tables. Defaults to `50000`. Set it to `0` to disable - streaming for tables that use this server and fall back to buffering the - full result set in memory. + scanning foreign tables with the `binary` driver. Defaults to `50000`. Set + it to `0` to disable streaming for tables that use this server and fall + back to buffering the full result set in memory. The `http` driver always + uses the buffered query path. * `port`: The port to connect to on the ClickHouse server. Defaults as follows: * 9440 if `driver` is "binary" and `host` is a ClickHouse Cloud host @@ -305,7 +306,8 @@ The supported table options are: for the foreign table. * `fetch_size`: Overrides the server-level `fetch_size` for this table only. Use it to make scans buffer fewer or more remote rows at a time, or set it - to `0` to disable streaming for this table. + to `0` to disable streaming for this table. Streaming currently applies + only to tables that use the `binary` driver. * `engine`: The [table engine] used by the ClickHouse table. For `CollapsingMergeTree()` and `AggregatingMergeTree()`, pg_clickhouse automatically applies the parameters to function expressions executed on diff --git a/src/binary_streaming.cpp b/src/binary_streaming.cpp index da810375..bcf1bad4 100644 --- a/src/binary_streaming.cpp +++ b/src/binary_streaming.cpp @@ -1,12 +1,11 @@ #include #include +#include #include #include #include -#include "minicoro.h" - extern "C" { @@ -20,16 +19,13 @@ extern "C" using namespace clickhouse; -static constexpr size_t kBinaryStreamingStackSize = 1024 * 1024; static const char kBinaryStreamingCanceled[] = "query was canceled"; static const char kBinaryStreamingOom[] = "out of memory"; struct ch_binary_streaming_state { - mco_coro *co = nullptr; - - /* Current block yielded by the coroutine. Valid until the next resume. */ - Block *current_block = nullptr; + /* Current block returned by clickhouse-cpp. */ + std::optional current_block; size_t current_row = 0; bool have_block = false; bool done = false; @@ -87,62 +83,64 @@ struct ch_binary_streaming_state }; static bool -binary_streaming_resume(ch_binary_streaming_state * st) -{ - mco_result res; - - res = mco_resume(st->co); - if (res == MCO_SUCCESS) - return true; - - st->SetOwnedError(mco_result_description(res)); - st->done = true; - return false; -} - -/* - * Coroutine function: runs Client::Select, yielding each block. - */ -static void -binary_streaming_coro(mco_coro * co) +binary_streaming_fill_block(ch_binary_streaming_state * st) { - ch_binary_streaming_state *st = - (ch_binary_streaming_state *) mco_get_user_data(co); + std::optional block; - try + for (;;) { - st->client->Select( - clickhouse::Query(st->sql) - .SetQuerySettings(st->settings) - .SetParams(st->params) - .OnDataCancelable( - [st, co](const Block & block) -> bool + try { if (st->check_cancel && st->check_cancel()) { st->SetBorrowedError(kBinaryStreamingCanceled); + st->done = true; return false; } - if (block.GetColumnCount() == 0) - return true; + block = st->client->ReceiveSelectBlock(); + } + catch (const std::exception & e) + { + st->SetOwnedError(e.what()); + st->done = true; + return false; + } - st->current_block = const_cast(&block); - st->columns_count = block.GetColumnCount(); - st->have_block = true; - st->current_row = 0; + if (!block) + { + try + { + st->client->EndSelect(); + } + catch (const std::exception & e) + { + st->SetOwnedError(e.what()); + } + st->current_block.reset(); + st->have_block = false; + st->done = true; + return false; + } - mco_yield(co); - return true; - })); - } - catch (const std::exception & e) - { - st->SetOwnedError(e.what()); - } + /* Match the old callback path, which ignored zero-column blocks. */ + if (block->GetColumnCount() == 0) + continue; + + if (st->columns_count != 0 && + block->GetColumnCount() != st->columns_count) + { + st->SetBorrowedError("columns mismatch in blocks"); + st->done = true; + return false; + } - st->have_block = false; - st->done = true; + st->current_block = std::move(block); + st->columns_count = st->current_block->GetColumnCount(); + st->have_block = true; + st->current_row = 0; + return true; + } } extern "C" @@ -154,8 +152,6 @@ extern "C" bool (*check_cancel) (void)) { ch_binary_streaming_state *st; - mco_desc desc; - mco_result res; st = new (std::nothrow) ch_binary_streaming_state(); if (!st) @@ -167,18 +163,20 @@ extern "C" st->settings = ch_binary_settings(query); st->params = ch_binary_params(query); - desc = mco_desc_init(binary_streaming_coro, kBinaryStreamingStackSize); - desc.user_data = st; - res = mco_create(&st->co, &desc); - if (res != MCO_SUCCESS) + try { - st->SetOwnedError(mco_result_description(res)); + st->client->BeginSelect(clickhouse::Query(st->sql) + .SetQuerySettings(st->settings) + .SetParams(st->params)); + } + catch (const std::exception & e) + { + st->SetOwnedError(e.what()); st->done = true; return st; } - if (!binary_streaming_resume(st)) - return st; + (void) binary_streaming_fill_block(st); return st; } @@ -190,13 +188,8 @@ extern "C" return false; if (st->have_block) return true; - if (mco_status(st->co) != MCO_SUSPENDED) - return false; - - if (!binary_streaming_resume(st)) - return false; - return st->have_block; + return binary_streaming_fill_block(st); } bool @@ -208,7 +201,7 @@ extern "C" if (!st || !st->have_block || !st->current_block) return false; - block = st->current_block; + block = &*st->current_block; row_count = block->GetRowCount(); if (st->current_row >= row_count) { @@ -280,15 +273,21 @@ extern "C" if (!st) return; - if (st->co) + try { - bool was_suspended = mco_status(st->co) == MCO_SUSPENDED; - - mco_destroy(st->co); - st->co = NULL; - - if (was_suspended) - st->client->ResetConnection(); + if (st->client) + st->client->EndSelect(); + } + catch (const std::exception &) + { + try + { + if (st->client) + st->client->ResetConnection(); + } + catch (const std::exception &) + { + } } delete st; diff --git a/src/include/binary.hh b/src/include/binary.hh index 8a21178d..2c01d48a 100644 --- a/src/include/binary.hh +++ b/src/include/binary.hh @@ -74,7 +74,7 @@ extern "C" const ch_query * query, bool (*check_cancel) (void)); extern void ch_binary_response_free(ch_binary_response_t * resp); -/* Streaming binary API: coroutine-based block-at-a-time iteration */ +/* Streaming binary API: block-at-a-time iteration */ typedef struct ch_binary_streaming_state ch_binary_streaming_state; extern ch_binary_streaming_state * ch_binary_begin_streaming( diff --git a/src/pglink.c b/src/pglink.c index ce4489a3..c7fc3268 100644 --- a/src/pglink.c +++ b/src/pglink.c @@ -28,10 +28,8 @@ static bool initialized = false; static void http_disconnect(void *conn); static ch_cursor * http_simple_query(void *conn, const ch_query * query); -static ch_cursor * http_streaming_query(void *conn, const ch_query * query, int fetch_size); static void http_simple_insert(void *conn, const ch_query * query); static void http_cursor_free(void *); -static void http_streaming_cursor_free(void *); static Datum * http_fetch_row(ChFdwScanRowContext * ctx); static void *http_prepare_insert(void *, ResultRelInfo *, List *, const ch_query *, char *); static void http_insert_tuple(void *, TupleTableSlot *); @@ -41,7 +39,8 @@ static libclickhouse_methods http_methods = { .disconnect = http_disconnect, .simple_query = http_simple_query, - .streaming_query = http_streaming_query, + /* HTTP currently uses only the buffered query path. */ + .streaming_query = NULL, .fetch_row = http_fetch_row, .prepare_insert = http_prepare_insert, .insert_tuple = http_insert_tuple @@ -206,52 +205,6 @@ report_http_status_error(long status, const char *query_sql, char *error) errcontext("HTTP status code: %li", status))); } -static void -report_http_streaming_state(ch_http_streaming_state * sstate, - const char *query_sql, - bool cleanup) -{ - char *err = ch_http_streaming_error(sstate); - long status = ch_http_streaming_status(sstate); - - if (err != NULL) - { - char *errcopy = pstrdup(err); - - if (cleanup) - ch_http_end_streaming(sstate); - ereport(ERROR, - (errcode(ERRCODE_SQL_ROUTINE_EXCEPTION), - errmsg("pg_clickhouse: %s", format_error(errcopy)))); - } - - if (status != 0 && status != 200) - { - char *data = ch_http_streaming_batch_data(sstate); - char *errcopy = data ? pstrdup(data) : pstrdup("unknown error"); - - if (cleanup) - ch_http_end_streaming(sstate); - report_http_status_error(status, query_sql, errcopy); - } -} - -static bool -http_streaming_load_batch(ch_cursor * cursor, ch_http_read_state * state) -{ - ch_http_streaming_state *sstate = cursor->streaming_state; - - if (!ch_http_fetch_batch(sstate)) - { - report_http_streaming_state(sstate, cursor->query, false); - return false; - } - - report_http_streaming_state(sstate, cursor->query, false); - ch_http_streaming_init_read_state(sstate, state); - return !(state->done || state->data == NULL); -} - static void report_binary_query_error(const char *query_sql, const char *error) { @@ -386,50 +339,6 @@ http_simple_query(void *conn, const ch_query * query) return cursor; } -static void -http_streaming_cursor_free(void *c) -{ - ch_cursor *cursor = c; - - if (cursor->streaming_state) - ch_http_end_streaming(cursor->streaming_state); -} - -/* - * Start a streaming query over HTTP. Returns a cursor that yields rows - * in bounded-memory batches of fetch_size rows each. - */ -static ch_cursor * -http_streaming_query(void *conn, const ch_query * query, int fetch_size) -{ - ch_cursor *cursor; - ch_http_streaming_state *sstate; - - ch_http_set_progress_func(http_progress_callback); - - sstate = ch_http_begin_streaming(conn, query, fetch_size); - if (sstate == NULL) - report_oom(); - - report_http_streaming_state(sstate, query->sql, true); - - cursor = cursor_create(query->sql, 0, http_streaming_cursor_free); - cursor->streaming_state = sstate; - cursor->is_streaming = true; - cursor->request_time = ch_http_streaming_request_time(sstate) * 1000; - cursor->total_time = ch_http_streaming_total_time(sstate) * 1000; - - { - MemoryContext oldcxt = MemoryContextSwitchTo(cursor->memcxt); - - cursor->read_state = palloc0(sizeof(ch_http_read_state)); - ch_http_streaming_init_read_state(sstate, cursor->read_state); - MemoryContextSwitchTo(oldcxt); - } - - return cursor; -} - static void http_simple_insert(void *conn, const ch_query * query) { @@ -489,14 +398,8 @@ http_fetch_row(ChFdwScanRowContext * ctx) Datum *values; ch_http_read_state *state = cursor->read_state; - while (state->done || state->data == NULL) - { - if (!cursor->is_streaming) - return NULL; - - if (!http_streaming_load_batch(cursor, state)) - return NULL; - } + if (state->done || state->data == NULL) + return NULL; /* Special case: SELECT NULL. */ if (attcount == 0) @@ -856,7 +759,7 @@ binary_streaming_cursor_free(void *c) /* * Start a streaming query over the binary protocol. Returns a cursor - * that yields rows block-by-block via coroutine. + * that yields rows block-by-block via clickhouse-cpp's streaming select API. */ static ch_cursor * binary_streaming_query(void *conn, const ch_query * query, int fetch_size) @@ -928,9 +831,8 @@ binary_convert_column(ch_cursor * cursor, TupleDesc tupdesc, } /* - * Streaming variant of binary_fetch_row. Reads rows from the coroutine- - * backed streaming state, fetching the next block when the current one - * is exhausted. + * Streaming variant of binary_fetch_row. Reads rows from the block-backed + * streaming state, fetching the next block when the current one is exhausted. */ static Datum * binary_streaming_fetch_row(ChFdwScanRowContext * ctx) diff --git a/vendor/clickhouse-cpp b/vendor/clickhouse-cpp index 3c94b448..d3fdc3b7 160000 --- a/vendor/clickhouse-cpp +++ b/vendor/clickhouse-cpp @@ -1 +1 @@ -Subproject commit 3c94b44894197860cabbba97c015b967055a55c2 +Subproject commit d3fdc3b712a8c5f1e30f14fdc3885198fd48a94e From 38600b93c332f00e6e6ef838c19fd4a040b6c3f7 Mon Sep 17 00:00:00 2001 From: Kaushik Iska Date: Wed, 1 Apr 2026 21:06:39 -0500 Subject: [PATCH 13/13] Limit streaming scans to the binary driver The PR originally carried coroutine-based HTTP streaming and the\nvendored minicoro dependency even after the binary path moved to\nthe new clickhouse-cpp pull API. That made the diff much larger\nthan the final design and left the PR description out of date.\n\nDrop the unused HTTP streaming implementation, remove minicoro\nfrom the build and submodules, and restore the buffered HTTP\npath. Keep only the binary scan changes: the FDW streaming hook,\nthe block-backed binary row reader, and the fetch_size plumbing.\nAlso restore the buffered binary cancel callback and remove the\nnew C-linkage warning from binary_internal.hh. --- .gitmodules | 3 - Makefile | 19 +- src/binary.cpp | 6 + src/binary_streaming.cpp | 52 ++---- src/fdw.c.in | 3 + src/http.c | 297 ++++++++++--------------------- src/http_streaming.c | 341 ------------------------------------ src/include/fdw.h | 5 +- src/include/http.h | 16 -- src/include/http_internal.h | 33 ---- src/minicoro.c | 6 - src/pglink.c | 59 ++++--- vendor/minicoro | 1 - 13 files changed, 157 insertions(+), 684 deletions(-) delete mode 100644 src/http_streaming.c delete mode 100644 src/include/http_internal.h delete mode 100644 src/minicoro.c delete mode 160000 vendor/minicoro diff --git a/.gitmodules b/.gitmodules index 2a9614cc..20bc2831 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,3 @@ [submodule "clickhouse-cpp"] path = vendor/clickhouse-cpp url = git@github.com:iskakaushik/clickhouse-cpp.git -[submodule "vendor/minicoro"] - path = vendor/minicoro - url = https://github.com/edubart/minicoro.git diff --git a/Makefile b/Makefile index 400f99c1..a2a57b82 100644 --- a/Makefile +++ b/Makefile @@ -33,9 +33,8 @@ ifndef CH_BUILD endif # clickhouse-cpp source and build directories. -VENDOR_DIR = vendor -CH_CPP_DIR = $(VENDOR_DIR)/clickhouse-cpp -CH_CPP_BUILD_DIR = $(VENDOR_DIR)/_build/$(OS)-$(ARCH)-$(CH_BUILD)-$(shell git submodule status $(CH_CPP_DIR) | awk '{print substr($$1, 0, 7)}') +CH_CPP_DIR = vendor/clickhouse-cpp +CH_CPP_BUILD_DIR = vendor/_build/$(OS)-$(ARCH)-$(CH_BUILD)-$(shell git submodule status $(CH_CPP_DIR) | awk '{print substr($$1, 0, 7)}') # List the clickhouse-cpp libraries we require. CH_CPP_LIB = $(CH_CPP_BUILD_DIR)/clickhouse/libclickhouse-cpp-lib$(DLSUFFIX) @@ -57,7 +56,7 @@ else endif # Add include directories. -PG_CPPFLAGS = -I./src/include -I$(CH_CPP_DIR) -I$(CH_CPP_DIR)/contrib/absl -I$(VENDOR_DIR)/minicoro +PG_CPPFLAGS = -I./src/include -I$(CH_CPP_DIR) -I$(CH_CPP_DIR)/contrib/absl # Include other libraries compiled into clickhouse-cpp. PG_LDFLAGS = -lstdc++ -lssl -lcrypto $(shell $(CURL_CONFIG) --libs) @@ -76,7 +75,7 @@ endif # Clean up the clickhouse-cpp build directory and generated files. EXTRA_CLEAN = sql/$(EXTENSION)--$(EXTVERSION).sql src/fdw.c compile_commands.json test/schedule $(EXTENSION)-$(DISTVERSION).zip ifndef NO_VENDOR_CLEAN - EXTRA_CLEAN += $(VENDOR_DIR) + EXTRA_CLEAN += $(CH_CPP_BUILD_DIR) endif # Import PGXS. @@ -102,12 +101,8 @@ $(shlib): $(CH_CPP_LIB) $(OBJS) $(CH_CPP_DIR)/CMakeLists.txt: git submodule update --init -# Clone minicoro submodule. -$(VENDOR_DIR)/minicoro/minicoro.h: - git submodule update --init - -# Require the vendored libraries. -$(OBJS): $(CH_CPP_DIR)/CMakeLists.txt $(VENDOR_DIR)/minicoro/minicoro.h +# Require the vendored clickhouse-cpp. +$(OBJS): $(CH_CPP_DIR)/CMakeLists.txt # Build clickhouse-cpp. $(CH_CPP_LIB): export CXXFLAGS=-fPIC @@ -117,7 +112,7 @@ $(CH_CPP_LIB): $(CH_CPP_DIR)/CMakeLists.txt # Sync with "Reset Vendor Timestamp" cmake --build $(CH_CPP_BUILD_DIR) --parallel $$(nproc) --target all # Require the versioned C source and SQL script. -all: sql/$(EXTENSION)--$(EXTVERSION).sql src/fdw.c $(VENDOR_DIR)/minicoro/minicoro.h +all: sql/$(EXTENSION)--$(EXTVERSION).sql src/fdw.c # Versioned SQL script. sql/$(EXTENSION)--$(EXTVERSION).sql: sql/$(EXTENSION).sql diff --git a/src/binary.cpp b/src/binary.cpp index 8d48d24d..3cf5c260 100644 --- a/src/binary.cpp +++ b/src/binary.cpp @@ -233,6 +233,12 @@ extern "C" client->Select(clickhouse::Query(query->sql) .SetQuerySettings(ch_binary_settings(query)) .SetParams(ch_binary_params(query)) + .OnProgress( + [&check_cancel](const Progress &) + { + if (check_cancel && check_cancel()) + throw std::runtime_error("query was canceled"); + }) .OnDataCancelable( [&resp, &values, &check_cancel](const Block &block) { diff --git a/src/binary_streaming.cpp b/src/binary_streaming.cpp index bcf1bad4..deb3530d 100644 --- a/src/binary_streaming.cpp +++ b/src/binary_streaming.cpp @@ -35,8 +35,7 @@ struct ch_binary_streaming_state std::unique_ptr nulls; size_t columns_count = 0; - const char *error = nullptr; - char *owned_error = nullptr; + std::optional error; bool (*check_cancel) (void) = nullptr; Client *client = nullptr; @@ -44,41 +43,18 @@ struct ch_binary_streaming_state QuerySettings settings; QueryParams params; - ~ch_binary_streaming_state() - { - free(owned_error); - } - void - SetBorrowedError(const char *message) + SetError(const char *message) { if (error) return; - error = message; + error.emplace(message ? message : kBinaryStreamingOom); } - void - SetOwnedError(const char *message) + const char * + GetError() const { - char *copy; - - if (error) - return; - if (!message) - { - SetBorrowedError(kBinaryStreamingOom); - return; - } - - copy = strdup(message); - if (!copy) - { - SetBorrowedError(kBinaryStreamingOom); - return; - } - - owned_error = copy; - error = owned_error; + return error ? error->c_str() : nullptr; } }; @@ -93,7 +69,7 @@ binary_streaming_fill_block(ch_binary_streaming_state * st) { if (st->check_cancel && st->check_cancel()) { - st->SetBorrowedError(kBinaryStreamingCanceled); + st->SetError(kBinaryStreamingCanceled); st->done = true; return false; } @@ -102,7 +78,7 @@ binary_streaming_fill_block(ch_binary_streaming_state * st) } catch (const std::exception & e) { - st->SetOwnedError(e.what()); + st->SetError(e.what()); st->done = true; return false; } @@ -115,7 +91,7 @@ binary_streaming_fill_block(ch_binary_streaming_state * st) } catch (const std::exception & e) { - st->SetOwnedError(e.what()); + st->SetError(e.what()); } st->current_block.reset(); st->have_block = false; @@ -130,7 +106,7 @@ binary_streaming_fill_block(ch_binary_streaming_state * st) if (st->columns_count != 0 && block->GetColumnCount() != st->columns_count) { - st->SetBorrowedError("columns mismatch in blocks"); + st->SetError("columns mismatch in blocks"); st->done = true; return false; } @@ -171,7 +147,7 @@ extern "C" } catch (const std::exception & e) { - st->SetOwnedError(e.what()); + st->SetError(e.what()); st->done = true; return st; } @@ -216,7 +192,7 @@ extern "C" st->nulls.reset(new (std::nothrow) bool[st->columns_count]); if (!st->coltypes || !st->values || !st->nulls) { - st->SetBorrowedError(kBinaryStreamingOom); + st->SetError(kBinaryStreamingOom); return false; } } @@ -231,7 +207,7 @@ extern "C" } catch (const std::exception & e) { - st->SetOwnedError(e.what()); + st->SetError(e.what()); return false; } @@ -264,7 +240,7 @@ extern "C" const char * ch_binary_streaming_error(ch_binary_streaming_state * st) { - return st ? st->error : NULL; + return st ? st->GetError() : NULL; } void diff --git a/src/fdw.c.in b/src/fdw.c.in index 85d6eece..f06f7173 100644 --- a/src/fdw.c.in +++ b/src/fdw.c.in @@ -956,6 +956,9 @@ clickhouseIterateForeignScan(ForeignScanState * node) TupleDesc tupdesc; ch_query query = new_query(fsstate->query, fsstate->numParams, fsstate->param_values); + /* Allow query cancel (e.g. Ctrl+C) between tuple fetches. */ + CHECK_FOR_INTERRUPTS(); + /* make query if needed */ if (fsstate->ch_cursor == NULL) { diff --git a/src/http.c b/src/http.c index bf6c573b..10ff0095 100644 --- a/src/http.c +++ b/src/http.c @@ -6,8 +6,8 @@ #include #include - -#include "http_internal.h" +#include +#include #define DATABASE_HEADER "X-ClickHouse-Database" @@ -144,244 +144,100 @@ ch_http_connection(ch_connection_details * details) return NULL; } -void -ch_http_generate_query_id(char query_id[37]) +static void +set_query_id(ch_http_response_t * resp) { uuid_t id; uuid_generate(id); - uuid_unparse(id, query_id); -} - -static bool -http_skip_setting(const char *name) -{ - return strcmp(name, "date_time_output_format") == 0 || - strcmp(name, "format_tsv_null_representation") == 0 || - strcmp(name, "output_format_tsv_crlf_end_of_line") == 0; + uuid_unparse(id, resp->query_id); } -static bool -http_append_query_param(CURLU * cu, const char *name, const char *value) +ch_http_response_t * +ch_http_simple_query(ch_http_connection_t * conn, const ch_query * query) { - char *buf = psprintf("%s=%s", name, value); - bool success; + char *url; + CURLcode errcode; + static char errbuffer[CURL_ERROR_SIZE]; + struct curl_slist *headers = NULL; + CURLU *cu = curl_url(); + kv_iter iter; + char *buf = NULL; + curl_mime *form = NULL; + int i; - success = curl_url_set(cu, CURLUPART_QUERY, buf, - CURLU_APPENDQUERY | CURLU_URLENCODE) == CURLUE_OK; - pfree(buf); - return success; -} + ch_http_response_t *resp = calloc(sizeof(ch_http_response_t), 1); -bool -ch_http_build_url(ch_http_connection_t * conn, const ch_query * query, - const char *query_id, char **url) -{ - CURLU *cu; - kv_iter iter; - bool success = false; + if (resp == NULL) + return NULL; - *url = NULL; - cu = curl_url(); - if (!cu) - return false; + set_query_id(resp); - if (curl_url_set(cu, CURLUPART_URL, conn->base_url, 0) != CURLUE_OK) - goto cleanup; + assert(conn && conn->curl); - if (!http_append_query_param(cu, "query_id", query_id)) - goto cleanup; + /* Construct the base URL with the query ID. */ + curl_url_set(cu, CURLUPART_URL, conn->base_url, 0); + buf = psprintf("query_id=%s", resp->query_id); + curl_url_set(cu, CURLUPART_QUERY, buf, CURLU_APPENDQUERY | CURLU_URLENCODE); + pfree(buf); - for (iter = new_kv_iter(query->settings); - !kv_iter_done(&iter); - kv_iter_next(&iter)) + /* Append each of the settings as a query param. */ + for (iter = new_kv_iter(query->settings); !kv_iter_done(&iter); kv_iter_next(&iter)) { - if (http_skip_setting(iter.name)) + /* Skip settings that would break parsing and type conversion. */ + if ( + strcmp(iter.name, "date_time_output_format") == 0 || + strcmp(iter.name, "format_tsv_null_representation") == 0 || + strcmp(iter.name, "output_format_tsv_crlf_end_of_line") == 0 + ) continue; - if (!http_append_query_param(cu, iter.name, iter.value)) - goto cleanup; + buf = psprintf("%s=%s", iter.name, iter.value); + curl_url_set(cu, CURLUPART_QUERY, buf, CURLU_APPENDQUERY | CURLU_URLENCODE); + pfree(buf); } - if (curl_url_set(cu, CURLUPART_QUERY, "date_time_output_format=iso", - CURLU_APPENDQUERY | CURLU_URLENCODE) != CURLUE_OK) - goto cleanup; - if (curl_url_set(cu, CURLUPART_QUERY, - "format_tsv_null_representation=\\N", - CURLU_APPENDQUERY | CURLU_URLENCODE) != CURLUE_OK) - goto cleanup; - if (curl_url_set(cu, CURLUPART_QUERY, - "output_format_tsv_crlf_end_of_line=0", - CURLU_APPENDQUERY | CURLU_URLENCODE) != CURLUE_OK) - goto cleanup; - if (curl_url_get(cu, CURLUPART_URL, url, 0) != CURLUE_OK) - goto cleanup; - - success = true; - -cleanup: + /* Always use ISO date format, \N for NULL, \n for EOL. */ + curl_url_set(cu, CURLUPART_QUERY, "date_time_output_format=iso", CURLU_APPENDQUERY | CURLU_URLENCODE); + curl_url_set(cu, CURLUPART_QUERY, "format_tsv_null_representation=\\N", CURLU_APPENDQUERY | CURLU_URLENCODE); + curl_url_set(cu, CURLUPART_QUERY, "output_format_tsv_crlf_end_of_line=0", CURLU_APPENDQUERY | CURLU_URLENCODE); + curl_url_get(cu, CURLUPART_URL, &url, 0); curl_url_cleanup(cu); - if (!success && *url) - { - curl_free(*url); - *url = NULL; - } - return success; -} - -bool -ch_http_build_headers(ch_http_connection_t * conn, struct curl_slist **headers) -{ - char *header; - struct curl_slist *list; - - *headers = NULL; - if (!conn->dbname) - return true; - header = psprintf("%s: %s", DATABASE_HEADER, conn->dbname); - list = curl_slist_append(NULL, header); - pfree(header); - if (!list) - return false; - - *headers = list; - return true; -} - -bool -ch_http_set_post_data(CURL *curl, const ch_query * query, curl_mime **form) -{ - int i; - - *form = NULL; - if (query->num_params == 0) - { - curl_easy_setopt(curl, CURLOPT_POSTFIELDS, query->sql); - return true; - } - - *form = curl_mime_init(curl); - if (!*form) - return false; - - { - curl_mimepart *part = curl_mime_addpart(*form); - - if (!part) - goto oom; - curl_mime_name(part, "query"); - curl_mime_data(part, query->sql, CURL_ZERO_TERMINATED); - } - - for (i = 0; i < query->num_params; i++) - { - char *name = psprintf("param_p%d", i + 1); - curl_mimepart *part = curl_mime_addpart(*form); - - if (!part) - { - pfree(name); - goto oom; - } - curl_mime_name(part, name); - pfree(name); - curl_mime_data(part, query->param_values[i], CURL_ZERO_TERMINATED); - } - - curl_easy_setopt(curl, CURLOPT_MIMEPOST, *form); - return true; - -oom: - curl_mime_free(*form); - *form = NULL; - return false; -} - -void -ch_http_set_common_options(ch_http_connection_t * conn, const char *url, - char *errbuffer, ch_http_write_func write_func, - void *write_data) -{ + /* constant */ errbuffer[0] = '\0'; curl_easy_reset(conn->curl); - curl_easy_setopt(conn->curl, CURLOPT_WRITEFUNCTION, write_func); - curl_easy_setopt(conn->curl, CURLOPT_WRITEDATA, write_data); + curl_easy_setopt(conn->curl, CURLOPT_WRITEFUNCTION, write_data); curl_easy_setopt(conn->curl, CURLOPT_ERRORBUFFER, errbuffer); curl_easy_setopt(conn->curl, CURLOPT_PATH_AS_IS, 1L); curl_easy_setopt(conn->curl, CURLOPT_URL, url); curl_easy_setopt(conn->curl, CURLOPT_NOSIGNAL, 1L); - curl_easy_setopt(conn->curl, CURLOPT_VERBOSE, curl_verbose); -} -void -ch_http_set_progress_options(ch_http_connection_t * conn, void *progress_data) -{ + /* variable */ + curl_easy_setopt(conn->curl, CURLOPT_WRITEDATA, resp); + curl_easy_setopt(conn->curl, CURLOPT_VERBOSE, curl_verbose); if (curl_progressfunc) { curl_easy_setopt(conn->curl, CURLOPT_NOPROGRESS, 0L); curl_easy_setopt(conn->curl, CURLOPT_XFERINFOFUNCTION, curl_progressfunc); - curl_easy_setopt(conn->curl, CURLOPT_XFERINFODATA, progress_data); + curl_easy_setopt(conn->curl, CURLOPT_XFERINFODATA, conn); } else curl_easy_setopt(conn->curl, CURLOPT_NOPROGRESS, 1L); -} - -void -ch_http_get_transfer_info(ch_http_connection_t * conn, double *pretransfer_time, - double *total_time, long *http_status) -{ - long status = 0; - - if (curl_easy_getinfo(conn->curl, CURLINFO_PRETRANSFER_TIME, - pretransfer_time) != CURLE_OK) - *pretransfer_time = 0; - if (curl_easy_getinfo(conn->curl, CURLINFO_TOTAL_TIME, - total_time) != CURLE_OK) - *total_time = 0; - if (curl_easy_getinfo(conn->curl, CURLINFO_RESPONSE_CODE, - &status) == CURLE_OK && - (status != 0 || *http_status == 0)) - *http_status = status; -} - -ch_http_response_t * -ch_http_simple_query(ch_http_connection_t * conn, const ch_query * query) -{ - char *url; - CURLcode errcode; - static char errbuffer[CURL_ERROR_SIZE]; - struct curl_slist *headers = NULL; - curl_mime *form = NULL; - - ch_http_response_t *resp = calloc(sizeof(ch_http_response_t), 1); - - if (resp == NULL) - return NULL; - - ch_http_generate_query_id(resp->query_id); - - assert(conn && conn->curl); - - if (!ch_http_build_url(conn, query, resp->query_id, &url)) - { - resp->http_status = -1; - resp->data = "out of memory"; - return resp; - } - - ch_http_set_common_options(conn, url, errbuffer, write_data, resp); - ch_http_set_progress_options(conn, conn); - if (!ch_http_build_headers(conn, &headers)) + if (conn->dbname) { - curl_free(url); - resp->http_status = -1; - resp->data = "out of memory"; - return resp; - } - if (headers) + headers = curl_slist_append(headers, psprintf("%s: %s", DATABASE_HEADER, conn->dbname)); + if (!headers) + { + curl_free(url); + resp->http_status = -1; + resp->data = "out of memory"; + return resp; + } curl_easy_setopt(conn->curl, CURLOPT_HTTPHEADER, headers); + } if (query->num_params == 0) + /* * Send the query as the POST body. This ensures that * date_time_output_format=iso will work for ClickHouse versions prior @@ -396,7 +252,10 @@ ch_http_simple_query(ch_http_connection_t * conn, const ch_query * query) * ClickHouse 25.8. Details: * https://github.com/ClickHouse/ClickHouse/pull/85570. */ - if (!ch_http_set_post_data(conn->curl, query, &form)) + curl_mimepart *part; + + form = curl_mime_init(conn->curl); + if (!form) { curl_free(url); if (headers) @@ -405,6 +264,16 @@ ch_http_simple_query(ch_http_connection_t * conn, const ch_query * query) resp->data = "out of memory"; return resp; } + part = curl_mime_addpart(form); + curl_mime_name(part, "query"); + curl_mime_data(part, query->sql, CURL_ZERO_TERMINATED); + for (i = 0; i < query->num_params; i++) + { + part = curl_mime_addpart(form); + curl_mime_name(part, psprintf("param_p%d", i + 1)); + curl_mime_data(part, query->param_values[i], CURL_ZERO_TERMINATED); + } + curl_easy_setopt(conn->curl, CURLOPT_MIMEPOST, form); } curl_error_happened = false; @@ -417,19 +286,31 @@ ch_http_simple_query(ch_http_connection_t * conn, const ch_query * query) if (errcode == CURLE_ABORTED_BY_CALLBACK) { - resp->http_status = CH_HTTP_CANCELED_STATUS; + resp->http_status = 418; /* I'm teapot */ return resp; } else if (errcode != CURLE_OK) { - resp->http_status = CH_HTTP_COMM_ERROR_STATUS; + resp->http_status = 419; /* illegal http status */ resp->data = strdup(errbuffer); resp->datasize = strlen(errbuffer); return resp; } - ch_http_get_transfer_info(conn, &resp->pretransfer_time, - &resp->total_time, &resp->http_status); + errcode = curl_easy_getinfo(conn->curl, CURLINFO_PRETRANSFER_TIME, + &resp->pretransfer_time); + if (errcode != CURLE_OK) + resp->pretransfer_time = 0; + + errcode = curl_easy_getinfo(conn->curl, CURLINFO_TOTAL_TIME, &resp->total_time); + if (errcode != CURLE_OK) + resp->total_time = 0; + + /* + * All good with request, but we need http status to make sure query went + * ok + */ + curl_easy_getinfo(conn->curl, CURLINFO_RESPONSE_CODE, &resp->http_status); if (curl_verbose && resp->http_status != 200) fprintf(stderr, "%s", resp->data); diff --git a/src/http_streaming.c b/src/http_streaming.c deleted file mode 100644 index 91c89689..00000000 --- a/src/http_streaming.c +++ /dev/null @@ -1,341 +0,0 @@ -#include "postgres.h" - -#include "http_internal.h" - -#include "minicoro.h" - -struct ch_http_streaming_state -{ - mco_coro *co; - ch_http_connection_t *conn; - - /* Owned resources that need cleanup */ - char *url; - struct curl_slist *headers; - curl_mime *form; - - bool transfer_done; /* coroutine has finished */ - - /* Rolling buffer: raw bytes from curl */ - char *buf; - size_t buf_alloc; - size_t buf_len; /* total bytes in buf */ - size_t batch_end; /* offset past last complete row in batch */ - int rows_ready; /* complete rows accumulated */ - int fetch_size; - - /* Error / status */ - long http_status; - char errbuffer[CURL_ERROR_SIZE]; - char *error; - bool error_owned; - double pretransfer_time; - double total_time; - char query_id[37]; -}; - -static const char http_streaming_oom_error[] = "out of memory"; -static const char http_streaming_write_error[] = "could not buffer response"; - -static void -http_streaming_set_static_error(ch_http_streaming_state * st, const char *error) -{ - if (st->error) - return; - - st->error = (char *) error; - st->error_owned = false; -} - -static void -http_streaming_set_owned_error(ch_http_streaming_state * st, const char *error) -{ - char *copy; - - if (st->error) - return; - - if (!error) - { - http_streaming_set_static_error(st, http_streaming_write_error); - return; - } - - copy = strdup(error); - if (!copy) - { - http_streaming_set_static_error(st, http_streaming_oom_error); - return; - } - - st->error = copy; - st->error_owned = true; -} - -static bool -http_streaming_resume(ch_http_streaming_state * st) -{ - mco_result res; - - res = mco_resume(st->co); - if (res == MCO_SUCCESS) - return true; - - http_streaming_set_owned_error(st, mco_result_description(res)); - st->transfer_done = true; - return false; -} - -/* - * Streaming write callback. Appends data to the rolling buffer and counts - * complete TSV rows (terminated by '\n'). When fetch_size rows have been - * accumulated, yields the coroutine back to the caller. - */ -static size_t -write_data_streaming(void *contents, size_t size, size_t nmemb, void *userp) -{ - size_t realsize = size * nmemb; - ch_http_streaming_state *st = userp; - - if (st->buf_len + realsize + 1 > st->buf_alloc) - { - size_t newsize = st->buf_alloc * 2; - char *newbuf; - - if (newsize < st->buf_len + realsize + 1) - newsize = st->buf_len + realsize + 1; - - newbuf = realloc(st->buf, newsize); - if (!newbuf) - { - http_streaming_set_static_error(st, http_streaming_oom_error); - return CURL_WRITEFUNC_ERROR; - } - - st->buf = newbuf; - st->buf_alloc = newsize; - } - - memcpy(st->buf + st->buf_len, contents, realsize); - st->buf_len += realsize; - st->buf[st->buf_len] = '\0'; - - for (size_t i = st->buf_len - realsize; i < st->buf_len; i++) - { - if (st->buf[i] != '\n') - continue; - - st->rows_ready++; - st->batch_end = i + 1; - if (st->rows_ready < st->fetch_size) - continue; - - mco_yield(st->co); - return realsize; - } - - return realsize; -} - -/* - * Coroutine function: runs the entire curl_easy_perform, yielding whenever a - * batch is ready. - */ -static void -http_streaming_coro(mco_coro * co) -{ - ch_http_streaming_state *st = mco_get_user_data(co); - CURLcode errcode; - - errcode = curl_easy_perform(st->conn->curl); - if (errcode == CURLE_ABORTED_BY_CALLBACK) - st->http_status = CH_HTTP_CANCELED_STATUS; - else if (errcode == CURLE_WRITE_ERROR) - { - if (!st->error) - http_streaming_set_static_error(st, http_streaming_write_error); - } - else if (errcode != CURLE_OK) - { - http_streaming_set_owned_error(st, - st->errbuffer[0] ? - st->errbuffer : - curl_easy_strerror(errcode)); - } - - ch_http_get_transfer_info(st->conn, &st->pretransfer_time, - &st->total_time, &st->http_status); - - if (st->buf_len > st->batch_end) - st->batch_end = st->buf_len; - - st->transfer_done = true; -} - -ch_http_streaming_state * -ch_http_begin_streaming(ch_http_connection_t * conn, const ch_query * query, - int fetch_size) -{ - ch_http_streaming_state *st; - mco_desc desc; - mco_result res; - - st = calloc(sizeof(ch_http_streaming_state), 1); - if (!st) - return NULL; - - st->conn = conn; - st->fetch_size = fetch_size > 0 ? fetch_size : 1; - st->buf_alloc = CH_HTTP_STREAM_BUFFER_SIZE; - st->buf = malloc(st->buf_alloc); - if (!st->buf) - { - free(st); - return NULL; - } - - ch_http_generate_query_id(st->query_id); - if (!ch_http_build_url(conn, query, st->query_id, &st->url)) - { - ch_http_end_streaming(st); - return NULL; - } - - ch_http_set_common_options(conn, st->url, st->errbuffer, - write_data_streaming, st); - ch_http_set_progress_options(conn, conn); - if (!ch_http_build_headers(conn, &st->headers)) - { - ch_http_end_streaming(st); - return NULL; - } - if (st->headers) - curl_easy_setopt(conn->curl, CURLOPT_HTTPHEADER, st->headers); - if (!ch_http_set_post_data(conn->curl, query, &st->form)) - { - ch_http_end_streaming(st); - return NULL; - } - - desc = mco_desc_init(http_streaming_coro, CH_MINICORO_STACK_SIZE); - desc.user_data = st; - res = mco_create(&st->co, &desc); - if (res != MCO_SUCCESS) - { - http_streaming_set_owned_error(st, mco_result_description(res)); - st->transfer_done = true; - return st; - } - - if (!http_streaming_resume(st)) - return st; - - return st; -} - -bool -ch_http_fetch_batch(ch_http_streaming_state * st) -{ - size_t consumed; - size_t remaining; - - if (!st) - return false; - - consumed = st->batch_end; - remaining = st->buf_len - consumed; - if (remaining > 0) - memmove(st->buf, st->buf + consumed, remaining); - st->buf_len = remaining; - st->buf[st->buf_len] = '\0'; - st->batch_end = 0; - st->rows_ready = 0; - - if (st->transfer_done || mco_status(st->co) == MCO_DEAD) - { - st->transfer_done = true; - if (st->buf_len > 0) - { - st->batch_end = st->buf_len; - return true; - } - return false; - } - - if (!http_streaming_resume(st)) - return false; - if (st->batch_end > 0) - return true; - if (st->transfer_done && st->buf_len > 0) - { - st->batch_end = st->buf_len; - return true; - } - - return false; -} - -char * -ch_http_streaming_error(ch_http_streaming_state * st) -{ - return st ? st->error : NULL; -} - -long -ch_http_streaming_status(ch_http_streaming_state * st) -{ - return st ? st->http_status : 0; -} - -double -ch_http_streaming_request_time(ch_http_streaming_state * st) -{ - return st ? st->pretransfer_time : 0; -} - -double -ch_http_streaming_total_time(ch_http_streaming_state * st) -{ - return st ? st->total_time : 0; -} - -char * -ch_http_streaming_batch_data(ch_http_streaming_state * st) -{ - return st ? st->buf : NULL; -} - -void -ch_http_streaming_init_read_state(ch_http_streaming_state * st, - ch_http_read_state * read_state) -{ - if (!st || !read_state) - return; - - ch_http_read_state_init(read_state, st->buf, st->batch_end); -} - -void -ch_http_end_streaming(ch_http_streaming_state * st) -{ - if (!st) - return; - - if (st->co) - mco_destroy(st->co); - - curl_easy_reset(st->conn->curl); - - if (st->url) - curl_free(st->url); - if (st->headers) - curl_slist_free_all(st->headers); - if (st->form) - curl_mime_free(st->form); - if (st->buf) - free(st->buf); - if (st->error && st->error_owned) - free(st->error); - - free(st); -} diff --git a/src/include/fdw.h b/src/include/fdw.h index afccc93a..0e2cfe89 100644 --- a/src/include/fdw.h +++ b/src/include/fdw.h @@ -55,9 +55,8 @@ typedef struct ch_cursor size_t columns_count; uintptr_t *conversion_states; /* for binary */ - /* Streaming support */ - void *streaming_state; /* opaque: ch_http_streaming_state or - * ch_binary_streaming_state */ + /* Streaming support for binary scans. */ + void *streaming_state; bool is_streaming; } ch_cursor; diff --git a/src/include/http.h b/src/include/http.h index 82574c99..ff771c41 100644 --- a/src/include/http.h +++ b/src/include/http.h @@ -53,20 +53,4 @@ void ch_http_read_state_init(ch_http_read_state * state, char *data, size_t dat int ch_http_read_next(ch_http_read_state * state); void ch_http_response_free(ch_http_response_t * resp); -/* Streaming API: single HTTP request with coroutine-based batching */ -typedef struct ch_http_streaming_state ch_http_streaming_state; - -ch_http_streaming_state *ch_http_begin_streaming(ch_http_connection_t * conn, - const ch_query * query, - int fetch_size); -bool ch_http_fetch_batch(ch_http_streaming_state * state); -char *ch_http_streaming_error(ch_http_streaming_state * state); -long ch_http_streaming_status(ch_http_streaming_state * state); -double ch_http_streaming_request_time(ch_http_streaming_state * state); -double ch_http_streaming_total_time(ch_http_streaming_state * state); -char *ch_http_streaming_batch_data(ch_http_streaming_state * state); -void ch_http_streaming_init_read_state(ch_http_streaming_state * state, - ch_http_read_state * read_state); -void ch_http_end_streaming(ch_http_streaming_state * state); - #endif /* CLICKHOUSE_HTTP_H */ diff --git a/src/include/http_internal.h b/src/include/http_internal.h deleted file mode 100644 index 410d6a4b..00000000 --- a/src/include/http_internal.h +++ /dev/null @@ -1,33 +0,0 @@ -#ifndef CLICKHOUSE_HTTP_INTERNAL_H -#define CLICKHOUSE_HTTP_INTERNAL_H - -#include "http.h" -#include "internal.h" - -#define CH_HTTP_CANCELED_STATUS 418 -#define CH_HTTP_COMM_ERROR_STATUS 419 -#define CH_HTTP_STREAM_BUFFER_SIZE (64 * 1024) -#define CH_MINICORO_STACK_SIZE (1024 * 1024) - -typedef size_t (*ch_http_write_func) (void *contents, size_t size, - size_t nmemb, void *userp); - -void ch_http_generate_query_id(char query_id[37]); -bool ch_http_build_url(ch_http_connection_t * conn, const ch_query * query, - const char *query_id, char **url); -bool ch_http_build_headers(ch_http_connection_t * conn, - struct curl_slist **headers); -bool ch_http_set_post_data(CURL *curl, const ch_query * query, - curl_mime **form); -void ch_http_set_common_options(ch_http_connection_t * conn, const char *url, - char *errbuffer, - ch_http_write_func write_func, - void *write_data); -void ch_http_set_progress_options(ch_http_connection_t * conn, - void *progress_data); -void ch_http_get_transfer_info(ch_http_connection_t * conn, - double *pretransfer_time, - double *total_time, - long *http_status); - -#endif /* CLICKHOUSE_HTTP_INTERNAL_H */ diff --git a/src/minicoro.c b/src/minicoro.c deleted file mode 100644 index 68470c3a..00000000 --- a/src/minicoro.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * minicoro.c - * Compile the minicoro implementation exactly once. - */ -#define MINICORO_IMPL -#include "minicoro.h" diff --git a/src/pglink.c b/src/pglink.c index c7fc3268..32e20922 100644 --- a/src/pglink.c +++ b/src/pglink.c @@ -195,16 +195,6 @@ report_oom(void) errmsg("out of memory"))); } -static void -report_http_status_error(long status, const char *query_sql, char *error) -{ - ereport(ERROR, - (errcode(ERRCODE_SQL_ROUTINE_EXCEPTION), - errmsg("pg_clickhouse: %s", format_error(error)), - status < 404 ? 0 : errdetail_internal("Remote Query: %.64000s", query_sql), - errcontext("HTTP status code: %li", status))); -} - static void report_binary_query_error(const char *query_sql, const char *error) { @@ -280,6 +270,8 @@ static ch_cursor * http_simple_query(void *conn, const ch_query * query) { int attempts = 0; + MemoryContext tempcxt, + oldcxt; ch_cursor *cursor; ch_http_response_t *resp; @@ -288,7 +280,7 @@ http_simple_query(void *conn, const ch_query * query) again: resp = ch_http_simple_query(conn, query); if (resp == NULL) - report_oom(); + elog(ERROR, "out of memory"); attempts++; if (resp->http_status == 419) @@ -320,21 +312,35 @@ http_simple_query(void *conn, const ch_query * query) ch_http_response_free(resp); - report_http_status_error(status, query->sql, error); + ereport(ERROR, ( + errcode(ERRCODE_SQL_ROUTINE_EXCEPTION), + errmsg("pg_clickhouse: %s", format_error(error)), + status < 404 ? 0 : errdetail_internal("Remote Query: %.64000s", query->sql), + errcontext("HTTP status code: %li", status) + )); } - cursor = cursor_create(query->sql, 0, http_cursor_free); + /* + * we could not control properly deallocation of libclickhouse memory, so + * we use memory context callbacks for that + */ + tempcxt = AllocSetContextCreate(PortalContext, "pg_clickhouse cursor", + ALLOCSET_DEFAULT_SIZES); + oldcxt = MemoryContextSwitchTo(tempcxt); - { - MemoryContext oldcxt = MemoryContextSwitchTo(cursor->memcxt); + cursor = palloc0(sizeof(ch_cursor)); + cursor->query_response = resp; + cursor->read_state = palloc0(sizeof(ch_http_read_state)); + cursor->query = pstrdup(query->sql); + cursor->request_time = resp->pretransfer_time * 1000; + cursor->total_time = resp->total_time * 1000; + ch_http_read_state_init(cursor->read_state, resp->data, resp->datasize); - cursor->query_response = resp; - cursor->read_state = palloc0(sizeof(ch_http_read_state)); - cursor->request_time = resp->pretransfer_time * 1000; - cursor->total_time = resp->total_time * 1000; - ch_http_read_state_init(cursor->read_state, resp->data, resp->datasize); - MemoryContextSwitchTo(oldcxt); - } + cursor->memcxt = tempcxt; + cursor->callback.func = http_cursor_free; + cursor->callback.arg = cursor; + MemoryContextRegisterResetCallback(tempcxt, &cursor->callback); + MemoryContextSwitchTo(oldcxt); return cursor; } @@ -362,7 +368,13 @@ http_simple_insert(void *conn, const ch_query * query) long status = resp->http_status; ch_http_response_free(resp); - report_http_status_error(status, query->sql, error); + + ereport(ERROR, ( + errcode(ERRCODE_SQL_ROUTINE_EXCEPTION), + errmsg("pg_clickhouse: %s", format_error(error)), + status < 404 ? 0 : errdetail_internal("Remote Query: %.64000s", query->sql), + errcontext("HTTP status code: %li", status) + )); } ch_http_response_free(resp); @@ -398,6 +410,7 @@ http_fetch_row(ChFdwScanRowContext * ctx) Datum *values; ch_http_read_state *state = cursor->read_state; + /* All rows or empty table. */ if (state->done || state->data == NULL) return NULL; diff --git a/vendor/minicoro b/vendor/minicoro deleted file mode 160000 index 02dad0f8..00000000 --- a/vendor/minicoro +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 02dad0f8b7cbb12fe6e216ae7a76db15ca55cd7b