From a346fc0eb2d83ecd63fb645e58f1a896945003c6 Mon Sep 17 00:00:00 2001 From: Esteban Zimanyi Date: Sat, 16 May 2026 18:13:18 +0200 Subject: [PATCH] Route constant-geometry spatial-relationship predicates to the TRTREE index eIntersects/eContains/eDisjoint/eTouches against a constant geometry are now recognised by the TRTREE scan optimizer, which synthesizes the bounding-box && prefilter from the constant and probes the index (single-box or MEST). The bbox is a lossy superset, so the index scan reports supports_pushdown_type = false and DuckDB keeps the original predicate as an exact recheck filter above the scan, mirroring the lossy-index-always-rechecks contract of PostGIS GiST and MobilityDB's tspatial_supportfn. A parity block proves that a row whose bbox overlaps the query polygon but does not actually intersect it is dropped, identically with and without the index and under MEST multi-entry. --- src/index/rtree_index_scan.cpp | 22 +++++++-- src/index/rtree_module.cpp | 9 +++- src/index/rtree_optimize_scan.cpp | 66 +++++++++++++++++++++----- test/sql/parity/050_index_types.test | 69 ++++++++++++++++++++++++++++ 4 files changed, 151 insertions(+), 15 deletions(-) diff --git a/src/index/rtree_index_scan.cpp b/src/index/rtree_index_scan.cpp index db2003b8..188b6d7f 100644 --- a/src/index/rtree_index_scan.cpp +++ b/src/index/rtree_index_scan.cpp @@ -125,14 +125,30 @@ static void RTreeIndexScanExecute(ClientContext &context, TableFunctionInput &da //------------------------------------------------------------------------- // Get Function //------------------------------------------------------------------------- + +// The R-tree probe narrows rows by bounding-box overlap only. For an exact +// predicate (&&, @>) that is the answer; for a lossy predicate (the +// spatial-rel functions, whose bbox is only a superset) the original +// predicate must still be evaluated or the scan would emit false positives. 
+// Reporting that this scan can apply no pushed expression filter makes +// DuckDB keep every pushed predicate as an exact recheck PhysicalFilter +// directly above the scan (execution/physical_plan/plan_get.cpp rebuilds it +// via ExpressionFilter::ToExpression). This is the lossy-index-always- +// rechecks contract of PostGIS GiST and MobilityDB's tspatial_supportfn: +// the index is a prefilter, the recheck is correctness. +static bool RTreeIndexScanSupportsPushdownType(const FunctionData &, idx_t) { + return false; +} + TableFunction TRTreeIndexScanFunction::GetFunction() { TableFunction func("mobility rtree index", {}, RTreeIndexScanExecute); func.init_global = RTreeIndexScanInitGlobal; - + func.get_bind_info = TRTreeIndexScanBindInfo; - + func.projection_pushdown = true; - func.filter_pushdown = false; + func.filter_pushdown = false; + func.supports_pushdown_type = RTreeIndexScanSupportsPushdownType; return func; } diff --git a/src/index/rtree_module.cpp b/src/index/rtree_module.cpp index c931f087..3e5b0cbf 100644 --- a/src/index/rtree_module.cpp +++ b/src/index/rtree_module.cpp @@ -531,7 +531,14 @@ unique_ptr TRTreeIndex::MakeFunctionMatcher() const { unordered_set supported_functions; if (bbox_meostype == T_STBOX) { - supported_functions = {"&&"}; + // && is the exact bbox predicate; the spatial-rel functions are + // lossy supersets whose bbox prefilter the index serves while the + // original predicate is rechecked exactly above the scan by the + // recheck PhysicalFilter (the scan reports supports_pushdown_type + // = false; see RTreeIndexScanSupportsPushdownType). Mirrors + // MobilityDB's tspatial_supportfn, in function form. 
+ supported_functions = {"&&", "eIntersects", "eContains", + "eDisjoint", "eTouches"}; } else if (bbox_meostype == T_TSTZSPAN) { supported_functions = {"&&", "@>"}; } else { diff --git a/src/index/rtree_optimize_scan.cpp b/src/index/rtree_optimize_scan.cpp index 2c5ec15e..cb9f2ba9 100644 --- a/src/index/rtree_optimize_scan.cpp +++ b/src/index/rtree_optimize_scan.cpp @@ -17,6 +17,9 @@ #include "index/rtree_module.hpp" #include "index/rtree_index_scan.hpp" #include "time_util.hpp" +#include "geo_util.hpp" +#include +#include @@ -84,41 +87,82 @@ class TRTreeIndexScanOptimizer : public OptimizerExtension { } const auto &constant = const_expr->Cast(); - + + static const std::unordered_set spatial_rel_fns = + {"eIntersects", "eContains", "eDisjoint", "eTouches"}; + const bool is_spatial_rel = + spatial_rel_fns.count(function_name) > 0; + void *query_box = nullptr; size_t box_size = 0; - - if (constant.value.type().id() == LogicalTypeId::BLOB) { - + + if (is_spatial_rel) { + // supportfn-equivalent (mirrors MobilityDB + // tspatial_supportfn): the predicate is a lossy spatial + // relationship; synthesize its bbox && prefilter from the + // constant geometry argument. The original spatial-rel + // predicate is rechecked exactly above the index scan + // (the scan reports supports_pushdown_type = false, so + // plan_get.cpp keeps it as a recheck PhysicalFilter), so a bbox false positive is never kept. NOTE(review): + // the recheck cannot restore a row the && probe never returned, so this is sound only for predicates that imply bbox overlap; eDisjoint does not (a bbox-disjoint row satisfies it) -- confirm before routing it here. 
+ if (constant.value.type().id() != LogicalTypeId::BLOB) { + return false; + } + auto blob_data = constant.value.GetValueUnsafe(); + GSERIALIZED *gs = GeometryToGSerialized(blob_data, 0); + if (!gs) { + return false; + } + STBox *box = geo_to_stbox(gs); + free(gs); + if (!box) { + return false; + } + box_size = sizeof(STBox); + query_box = malloc(box_size); + if (query_box) { + memcpy(query_box, box, box_size); + } + free(box); + } + else if (constant.value.type().id() == LogicalTypeId::BLOB) { + auto blob_data = constant.value.GetValueUnsafe(); const uint8_t *data = reinterpret_cast(blob_data.GetDataUnsafe()); box_size = blob_data.GetSize(); - + query_box = malloc(box_size); memcpy(query_box, data, box_size); - + } else if (constant.value.type().id() == LogicalTypeId::TIMESTAMP_TZ) { auto timestamp_duckdb = constant.value.GetValueUnsafe(); - + timestamp_tz_t ts_meos = DuckDBToMeosTimestamp(timestamp_duckdb); - + box_size = sizeof(timestamp_tz_t); query_box = malloc(box_size); - + if (query_box) { memcpy(query_box, &ts_meos, box_size); } } - + if (!query_box) { return false; } + // The index probe is always a bbox overlap; a spatial-rel + // name is only the recognition key, not an index operation. + // Exactness is restored by the recheck PhysicalFilter that + // plan_get.cpp builds above this scan (see + // RTreeIndexScanSupportsPushdownType). + const string index_op = + is_spatial_rel ? 
string("&&") : function_name; bind_data = make_uniq( - duck_table, rtree_index, 1000, query_box, box_size, function_name); + duck_table, rtree_index, 1000, query_box, box_size, index_op); return true; }); diff --git a/test/sql/parity/050_index_types.test b/test/sql/parity/050_index_types.test index 84495b5a..23e86e34 100644 --- a/test/sql/parity/050_index_types.test +++ b/test/sql/parity/050_index_types.test @@ -360,3 +360,72 @@ query I SELECT count(*) FROM idx_mest WHERE t && 'STBOX X((999,999),(1002,1002))'::stbox; ---- 1 + +# ============================================================================= +# Spatial-relationship predicate pushdown (supportfn-equivalent). +# +# A predicate like eIntersects(trip, geometry '...') is lossy: its bbox +# prefilter yields only a superset of the true answer. The optimizer rewrites the scan to +# probe the TRTREE with the synthesized bbox &&, and the original predicate +# is rechecked exactly above the scan. The decisive regression check is that the +# index must NOT leak a row whose bbox overlaps the query geometry but which +# does not actually intersect it (a missing recheck would return 2, not 1). +# ============================================================================= + +statement ok +CREATE TABLE idx_srel(t tgeompoint); + +# A: an L-shaped path. Its bbox X[0,10] Y[0,10] overlaps the query polygon +# P = Polygon((4 4,6 4,6 6,4 6,4 4)), but the path runs along y=0 then +# x=10 and never enters P -> bbox-overlap false positive. +# B: a vertical path through x=5 that crosses P -> true match. +# C: a far-away path, bbox-disjoint from P -> true miss. +statement ok +INSERT INTO idx_srel VALUES + ('[Point(0 0)@2000-01-01, Point(10 0)@2000-01-02, Point(10 10)@2000-01-03]'::tgeompoint), + ('[Point(5 0)@2000-01-01, Point(5 10)@2000-01-02]'::tgeompoint), + ('[Point(100 100)@2000-01-01, Point(101 101)@2000-01-02]'::tgeompoint); + +# Ground truth without an index: only B intersects P. 
+query I +SELECT count(*) FROM idx_srel + WHERE eIntersects(t, geometry 'Polygon((4 4, 6 4, 6 6, 4 6, 4 4))'); +---- +1 + +query I +SELECT count(*) FROM idx_srel + WHERE NOT eIntersects(t, geometry 'Polygon((4 4, 6 4, 6 6, 4 6, 4 4))'); +---- +2 + +statement ok +CREATE INDEX i_srel ON idx_srel USING TRTREE (t); + +# Same answer with the index: the bbox-overlap bait A is dropped by the +# exact recheck. A regression to a missing recheck would return 2. +query I +SELECT count(*) FROM idx_srel + WHERE eIntersects(t, geometry 'Polygon((4 4, 6 4, 6 6, 4 6, 4 4))'); +---- +1 + +# A polygon whose bbox is disjoint from every row: clean true negative. +query I +SELECT count(*) FROM idx_srel + WHERE eIntersects(t, geometry 'Polygon((50 50, 51 50, 51 51, 50 51, 50 50))'); +---- +0 + +statement ok +DROP INDEX i_srel; + +# Same soundness under MEST multi-entry indexing. +statement ok +CREATE INDEX i_srel ON idx_srel USING TRTREE (t) WITH (max_boxes = 8); + +query I +SELECT count(*) FROM idx_srel + WHERE eIntersects(t, geometry 'Polygon((4 4, 6 4, 6 6, 4 6, 4 4))'); +---- +1