From 71b093ea5bc014f158cb6576f2151c968f437339 Mon Sep 17 00:00:00 2001 From: dmgcodevil Date: Sat, 18 Apr 2026 21:28:37 -0400 Subject: [PATCH 1/7] memo get_vars in where clause + from=>match --- apps/tundra_shell.cpp | 14 ++-- bench/tundra_runner.cpp | 111 ++++++++++++++++------------- include/main/database.hpp | 8 +-- include/query/execution.hpp | 2 +- include/query/query.hpp | 36 ++++++---- src/main/database.cpp | 2 +- src/query/execution.cpp | 24 +++---- src/query/query.cpp | 20 ++++-- src/query/query_bootstrap.cpp | 24 +++---- src/query/result_builder.cpp | 16 ++--- src/update/update_executor.cpp | 6 +- tests/array_query_test.cpp | 10 +-- tests/benchmark_test.cpp | 20 +++--- tests/join_test.cpp | 38 +++++----- tests/snapshot_test.cpp | 2 +- tests/temporal_query_test.cpp | 76 ++++++++++---------- tests/update_query_join_test.cpp | 33 ++++----- tests/update_query_test.cpp | 31 ++++---- tests/where_expression_test.cpp | 33 ++++----- tests/where_pushdown_join_test.cpp | 8 +-- 20 files changed, 271 insertions(+), 243 deletions(-) diff --git a/apps/tundra_shell.cpp b/apps/tundra_shell.cpp index a13de13..2dfd639 100644 --- a/apps/tundra_shell.cpp +++ b/apps/tundra_shell.cpp @@ -28,11 +28,11 @@ #include "TundraQLParser.h" #include "arrow/map_union_types.hpp" #include "common/constants.hpp" -#include "main/database.hpp" -#include "linenoise.h" #include "common/logger.hpp" #include "common/types.hpp" #include "common/utils.hpp" +#include "linenoise.h" +#include "main/database.hpp" // Tee stream class that outputs to both console and file class TeeStream : public std::ostream { @@ -525,7 +525,7 @@ class TundraQLVisitorImpl : public tundraql::TundraQLBaseVisitor { node_type = node_alias; } - auto query_builder = tundradb::Query::from(node_alias + ":" + node_type); + auto query_builder = tundradb::Query::match(node_alias + ":" + node_type); for (size_t i = 0; i < edges.size(); i++) { auto edge = edges[i]; @@ -807,7 +807,7 @@ class TundraQLVisitorImpl : public 
tundraql::TundraQLBaseVisitor { schema_name = alias; } - auto query_builder = tundradb::Query::from(alias + ":" + schema_name); + auto query_builder = tundradb::Query::match(alias + ":" + schema_name); if (ctx->whereClause()) { processWhereClause(query_builder, ctx->whereClause()); @@ -1082,7 +1082,7 @@ class TundraQLVisitorImpl : public tundraql::TundraQLBaseVisitor { return qb; } // Mode 2: single nodePattern → build a trivial query - return tundradb::Query::from(alias + ":" + schema_name); + return tundradb::Query::match(alias + ":" + schema_name); }(); // Build alias→schema map from the query builder's pattern @@ -1456,7 +1456,7 @@ class TundraQLVisitorImpl : public tundraql::TundraQLBaseVisitor { try { // Build a query to find matching nodes - auto query_builder = tundradb::Query::from("n:" + node_type); + auto query_builder = tundradb::Query::match("n:" + node_type); // Add WHERE conditions for each property for (const auto& [prop_name, prop_value] : properties) { @@ -2211,4 +2211,4 @@ int main(int argc, char* argv[]) { g_tee_stream.reset(); return 0; -} \ No newline at end of file +} diff --git a/bench/tundra_runner.cpp b/bench/tundra_runner.cpp index a52e002..f0b78d2 100644 --- a/bench/tundra_runner.cpp +++ b/bench/tundra_runner.cpp @@ -1,44 +1,45 @@ #include #include #include + #include #include #include #include +#include "common/types.hpp" #include "main/database.hpp" #include "query/query.hpp" -#include "common/types.hpp" using namespace tundradb; -static arrow::Result> read_csv(const std::string& path) { +static arrow::Result> read_csv( + const std::string& path) { ARROW_ASSIGN_OR_RAISE(auto input, arrow::io::ReadableFile::Open(path)); auto read_options = arrow::csv::ReadOptions::Defaults(); auto parse_options = arrow::csv::ParseOptions::Defaults(); auto convert_options = arrow::csv::ConvertOptions::Defaults(); - ARROW_ASSIGN_OR_RAISE(auto reader, - arrow::csv::TableReader::Make(arrow::io::default_io_context(), input, - read_options, parse_options, 
convert_options)); + ARROW_ASSIGN_OR_RAISE( + auto reader, arrow::csv::TableReader::Make( + arrow::io::default_io_context(), input, read_options, + parse_options, convert_options)); return reader->Read(); } -void load_data(Database& db, const std::string& users_csv, - const std::string& companies_csv, - const std::string& friend_csv, +void load_data(Database& db, const std::string& users_csv, + const std::string& companies_csv, const std::string& friend_csv, const std::string& works_at_csv) { auto load_start = std::chrono::high_resolution_clock::now(); // Define schemas (must include "id" field) - auto user_schema = arrow::schema({ - arrow::field("name", arrow::utf8()), + auto user_schema = arrow::schema({arrow::field("name", arrow::utf8()), arrow::field("age", arrow::int64()), arrow::field("country", arrow::utf8())}); db.get_schema_registry()->create("User", user_schema).ValueOrDie(); - auto company_schema = arrow::schema({ - arrow::field("name", arrow::utf8()), - arrow::field("industry", arrow::utf8())}); + auto company_schema = + arrow::schema({arrow::field("name", arrow::utf8()), + arrow::field("industry", arrow::utf8())}); db.get_schema_registry()->create("Company", company_schema).ValueOrDie(); auto users_tbl = read_csv(users_csv).ValueOrDie(); users_tbl = users_tbl->CombineChunks().ValueOrDie(); @@ -58,9 +59,12 @@ void load_data(Database& db, const std::string& users_csv, auto name_idx = users_tbl->schema()->GetFieldIndex("name"); auto age_idx = users_tbl->schema()->GetFieldIndex("age"); auto country_idx = users_tbl->schema()->GetFieldIndex("country"); - auto name_arr = std::static_pointer_cast(users_tbl->column(name_idx)->chunk(0)); - auto age_arr = std::static_pointer_cast(users_tbl->column(age_idx)->chunk(0)); - auto country_arr = std::static_pointer_cast(users_tbl->column(country_idx)->chunk(0)); + auto name_arr = std::static_pointer_cast( + users_tbl->column(name_idx)->chunk(0)); + auto age_arr = std::static_pointer_cast( + 
users_tbl->column(age_idx)->chunk(0)); + auto country_arr = std::static_pointer_cast( + users_tbl->column(country_idx)->chunk(0)); for (int64_t i = 0; i < users_tbl->num_rows(); ++i) { std::unordered_map data; data["name"] = Value(std::string(name_arr->GetView(i))); @@ -73,11 +77,12 @@ void load_data(Database& db, const std::string& users_csv, // Load Companies (global ids continue after users) - auto cname_idx = companies_tbl->schema()->GetFieldIndex("name"); auto ind_idx = companies_tbl->schema()->GetFieldIndex("industry"); - auto cname_arr = std::static_pointer_cast(companies_tbl->column(cname_idx)->chunk(0)); - auto ind_arr = std::static_pointer_cast(companies_tbl->column(ind_idx)->chunk(0)); + auto cname_arr = std::static_pointer_cast( + companies_tbl->column(cname_idx)->chunk(0)); + auto ind_arr = std::static_pointer_cast( + companies_tbl->column(ind_idx)->chunk(0)); for (int64_t i = 0; i < companies_tbl->num_rows(); ++i) { std::unordered_map data; data["name"] = Value(std::string(cname_arr->GetView(i))); @@ -89,41 +94,47 @@ void load_data(Database& db, const std::string& users_csv, auto fsrc_idx = friend_tbl->schema()->GetFieldIndex("src"); auto fdst_idx = friend_tbl->schema()->GetFieldIndex("dst"); - auto fsrc = std::static_pointer_cast(friend_tbl->column(fsrc_idx)->chunk(0)); - auto fdst = std::static_pointer_cast(friend_tbl->column(fdst_idx)->chunk(0)); + auto fsrc = std::static_pointer_cast( + friend_tbl->column(fsrc_idx)->chunk(0)); + auto fdst = std::static_pointer_cast( + friend_tbl->column(fdst_idx)->chunk(0)); for (int64_t i = 0; i < friend_tbl->num_rows(); ++i) { db.connect(fsrc->Value(i), "FRIEND", fdst->Value(i)).ValueOrDie(); } - auto wsrc_idx = works_tbl->schema()->GetFieldIndex("src"); auto wdst_idx = works_tbl->schema()->GetFieldIndex("dst"); - auto wsrc = std::static_pointer_cast(works_tbl->column(wsrc_idx)->chunk(0)); - auto wdst = std::static_pointer_cast(works_tbl->column(wdst_idx)->chunk(0)); + auto wsrc = std::static_pointer_cast( + 
works_tbl->column(wsrc_idx)->chunk(0)); + auto wdst = std::static_pointer_cast( + works_tbl->column(wdst_idx)->chunk(0)); for (int64_t i = 0; i < works_tbl->num_rows(); ++i) { - db.connect(wsrc->Value(i), "WORKS_AT", users_count + wdst->Value(i)).ValueOrDie(); + db.connect(wsrc->Value(i), "WORKS_AT", users_count + wdst->Value(i)) + .ValueOrDie(); } db.get_table("User", nullptr).ValueOrDie(); db.get_table("Company", nullptr).ValueOrDie(); - + auto load_end = std::chrono::high_resolution_clock::now(); auto load_duration = std::chrono::duration_cast( load_end - load_start); - std::cerr << "Data load time: " << load_duration.count() << " ms" << std::endl; + std::cerr << "Data load time: " << load_duration.count() << " ms" + << std::endl; } int64_t run_query(Database& db) { auto query_start_time = std::chrono::high_resolution_clock::now(); - Query query = Query::from("u:User") - .where("u.age", CompareOp::Gt, Value(30)) - .and_where("u.country", CompareOp::Eq, Value(std::string("US"))) - .traverse("u", "FRIEND", "f:User", TraverseType::Inner) - .where("f.age", CompareOp::Gt, Value((int64_t)25)) - .select() - .parallel(true) - .inline_where() - .build(); + Query query = + Query::match("u:User") + .where("u.age", CompareOp::Gt, Value(30)) + .and_where("u.country", CompareOp::Eq, Value(std::string("US"))) + .traverse("u", "FRIEND", "f:User", TraverseType::Inner) + .where("f.age", CompareOp::Gt, Value((int64_t)25)) + .select() + .parallel(true) + .inline_where() + .build(); auto res = db.query(query); auto query_end_time = std::chrono::high_resolution_clock::now(); @@ -134,22 +145,24 @@ int64_t run_query(Database& db) { std::cerr << "Query failed: " << res.status().ToString() << "\n"; return -1; } - + auto table = res.ValueOrDie()->table(); int64_t row_count = table ? 
table->num_rows() : 0; - + // Output in machine-readable format for Python parser std::cout << query_duration.count() << std::endl; // Just the time in ms - + return row_count; } int main(int argc, char** argv) { if (argc < 5) { - std::cerr << "Usage: " << argv[0] << " [repetitions]\n"; + std::cerr << "Usage: " << argv[0] + << " " + "[repetitions]\n"; return 1; } - + std::string users_csv = argv[1]; std::string companies_csv = argv[2]; std::string friend_csv = argv[3]; @@ -158,16 +171,16 @@ int main(int argc, char** argv) { // Build in-memory DB auto config = make_config() - .with_persistence_enabled(false) - .with_shard_capacity(200000) - .with_chunk_size(100000) - .build(); + .with_persistence_enabled(false) + .with_shard_capacity(200000) + .with_chunk_size(100000) + .build(); Database db(config); - + // Load data once (not timed for benchmark) load_data(db, users_csv, companies_csv, friend_csv, works_at_csv); - + // Run query multiple times and output each timing int64_t rows = 0; for (int i = 0; i < repetitions; i++) { @@ -177,7 +190,7 @@ int main(int argc, char** argv) { return 2; } } - + std::cerr << "rows=" << rows << std::endl; return 0; -} \ No newline at end of file +} diff --git a/include/main/database.hpp b/include/main/database.hpp index bb65393..fa9bf7b 100644 --- a/include/main/database.hpp +++ b/include/main/database.hpp @@ -150,7 +150,7 @@ class Database { * * Mode 2 - by MATCH query (alias-qualified SET, multi-schema): * db.update(UpdateQuery::match( - * Query::from("u:User") + * Query::match("u:User") * .traverse("u", "WORKS_AT", "c:Company") * .where("c.name", CompareOp::Eq, Value("Google")) * .build() @@ -177,13 +177,13 @@ class Database { const std::vector &fields, UpdateType update_type, UpdateResult &result); - /** Initialize QueryState from query: temporal context, FROM table, prepare. + /** Initialize QueryState from query: temporal context, root table, prepare. 
*/ [[nodiscard]] arrow::Status init_query_state(const Query &query, QueryState &query_state) const; - /** Inline WHERE clauses applicable to the FROM alias. */ - [[nodiscard]] arrow::Status inline_from_where(const Query &query, + /** Inline WHERE clauses applicable to the root alias. */ + [[nodiscard]] arrow::Status inline_root_where(const Query &query, QueryState &query_state, QueryResult &result) const; diff --git a/include/query/execution.hpp b/include/query/execution.hpp index f26d363..d10996e 100644 --- a/include/query/execution.hpp +++ b/include/query/execution.hpp @@ -448,7 +448,7 @@ struct QueryState { /// Arrow tables keyed by schema alias. std::unordered_map> tables; - SchemaRef from; ///< Source schema from the FROM clause. + SchemaRef root; ///< Root schema for query execution. std::vector traversals; ///< Traverse clauses in query order. std::shared_ptr schema_registry; ///< Node schema registry. diff --git a/include/query/query.hpp b/include/query/query.hpp index db7bcae..70e2539 100644 --- a/include/query/query.hpp +++ b/include/query/query.hpp @@ -244,7 +244,9 @@ class WhereExpr { get_conditions_for_variable(const std::string& variable) const = 0; /** @brief Returns the set of all variables referenced in this expression. */ - virtual std::set get_all_variables() const = 0; + virtual const std::set& get_all_variables() const = 0; + + size_t get_vars_count() const { return get_all_variables().size(); } /** @brief Returns the first variable name found (useful for single-var * conditions). */ @@ -253,6 +255,9 @@ class WhereExpr { /** @brief Returns true if this expression can be inlined for the given * variable. */ virtual bool can_inline(const std::string& variable) const = 0; + + protected: + mutable std::set vars_; }; /** @brief The type of graph traversal / join to perform. 
*/ @@ -361,7 +366,7 @@ class ComparisonExpr : public Clause, public WhereExpr { std::string extract_first_variable() const override; - std::set get_all_variables() const override; + const std::set& get_all_variables() const override; arrow::Result resolve_field_ref( const std::function>( @@ -422,7 +427,7 @@ class LogicalExpr : public Clause, public WhereExpr { friend std::ostream& operator<<(std::ostream& os, const LogicalExpr& expr); - std::set get_all_variables() const override; + const std::set& get_all_variables() const override; bool can_inline(const std::string& variable) const override; }; @@ -463,12 +468,12 @@ struct ExecutionConfig { /** * @brief Immutable query descriptor built via Query::Builder. * - * Contains the FROM schema, a list of clauses (TRAVERSE, WHERE, SELECT), - * execution configuration, and optional temporal snapshot. + * Contains the initial MATCH binding, a list of clauses (TRAVERSE, WHERE, + * SELECT), execution configuration, and optional temporal snapshot. */ class Query { private: - SchemaRef from_; + SchemaRef root_; std::vector> clauses_; std::shared_ptr select, bool optimize_where, ExecutionConfig execution_config, std::optional temporal_snapshot = std::nullopt) - : from_(std::move(from)), + : root_(std::move(root)), clauses_(std::move(clauses)), select_(std::move(select)), inline_where_(optimize_where), @@ -488,7 +493,7 @@ class Query { temporal_snapshot_(std::move(temporal_snapshot)) {} class Builder; - [[nodiscard]] const SchemaRef& from() const { return from_; } + [[nodiscard]] const SchemaRef& root() const { return root_; } [[nodiscard]] const std::vector>& clauses() const { return clauses_; } @@ -518,12 +523,13 @@ class Query { return nullptr; } - static Builder from(const std::string& schema) { return Builder(schema); } + /** @brief Begin a MATCH query from the initial bound schema alias. */ + static Builder match(const std::string& schema) { return Builder(schema); } /** @brief Fluent builder for constructing Query objects. 
*/ class Builder { private: - SchemaRef from_; + SchemaRef root_; std::vector> clauses_; std::shared_ptr(std::vector( id_column_set.begin(), id_column_set.end())), match_query.inline_where(), match_query.execution_config(), diff --git a/tests/array_query_test.cpp b/tests/array_query_test.cpp index 3369536..3144d76 100644 --- a/tests/array_query_test.cpp +++ b/tests/array_query_test.cpp @@ -79,7 +79,7 @@ class ArrayQueryTest : public ::testing::Test { /// Query the "Item" table and return the full Arrow table. std::shared_ptr query_items() { - auto query = Query::from("i:Item").build(); + auto query = Query::match("i:Item").build(); auto result = db_->query(query).ValueOrDie(); return result->table(); } @@ -336,7 +336,7 @@ TEST_F(ArrayQueryTest, SequentialArrayUpdatesAccumulate) { } TEST_F(ArrayQueryTest, UpdateByMatchSetsArray) { - auto q = Query::from("i:Item") + auto q = Query::match("i:Item") .where("i.name", CompareOp::Eq, Value("Bob"s)) .build(); std::vector new_tags = {Value{"matched"s}}; @@ -488,7 +488,7 @@ TEST_F(ArrayQueryTest, AppendEmptyVectorIsNoop) { TEST_F(ArrayQueryTest, AppendByMatchQuery) { // Append "matched" to all Items where name = "Bob" - auto q = Query::from("i:Item") + auto q = Query::match("i:Item") .where("i.name", CompareOp::Eq, Value("Bob"s)) .build(); std::vector to_append = {Value{"matched"s}}; @@ -569,12 +569,12 @@ class VersionedArrayTest : public ::testing::Test { } std::shared_ptr query_items() { - auto query = Query::from("i:Item").build(); + auto query = Query::match("i:Item").build(); return db_->query(query).ValueOrDie()->table(); } std::shared_ptr query_items_as_of(uint64_t valid_time) { - auto query = Query::from("i:Item").as_of_valid_time(valid_time).build(); + auto query = Query::match("i:Item").as_of_valid_time(valid_time).build(); return db_->query(query).ValueOrDie()->table(); } diff --git a/tests/benchmark_test.cpp b/tests/benchmark_test.cpp index af04f74..732e9af 100644 --- a/tests/benchmark_test.cpp +++ 
b/tests/benchmark_test.cpp @@ -242,7 +242,7 @@ void BM_FullScan(::benchmark::State& state) { fixture->createUsers(node_count); for (auto _ : state) { - Query query = Query::from("u:User").build(); + Query query = Query::match("u:User").build(); auto result = fixture->db()->query(query); if (!result.ok() || result.ValueOrDie()->table()->num_rows() != node_count) { @@ -268,7 +268,7 @@ void BM_SimpleJoin(::benchmark::State& state) { for (auto _ : state) { Query query = - Query::from("u:User") + Query::match("u:User") .traverse("u", "WORKS_AT", "c:Company", TraverseType::Inner) .build(); auto result = fixture->db()->query(query); @@ -308,7 +308,7 @@ void BM_ComplexJoin(::benchmark::State& state) { for (auto _ : state) { // Complex 3-way join: Users -> Friends -> Companies Query query = - Query::from("u:User") + Query::match("u:User") .traverse("u", "FRIEND", "f:User", TraverseType::Inner) .traverse("f", "WORKS_AT", "c:Company", TraverseType::Inner) .build(); @@ -329,7 +329,7 @@ void BM_FilteredQuery(::benchmark::State& state) { for (auto _ : state) { // Query with WHERE clause - users over 50 Query query = - Query::from("u:User").where("u.age", CompareOp::Gt, Value(50)).build(); + Query::match("u:User").where("u.age", CompareOp::Gt, Value(50)).build(); auto result = fixture->db()->query(query); if (!result.ok()) { state.SkipWithError("Filtered query failed"); @@ -341,14 +341,14 @@ void BM_FilteredQuery(::benchmark::State& state) { // Google Test cases for correctness verification TEST_F(SmallDatasetTest, NodeCreationCorrectness) { - Query query = Query::from("u:User").build(); + Query query = Query::match("u:User").build(); auto result = fixture->db()->query(query); ASSERT_TRUE(result.ok()); EXPECT_EQ(result.ValueOrDie()->table()->num_rows(), 100); } TEST_F(SmallDatasetTest, SimpleJoinCorrectness) { - Query query = Query::from("u:User") + Query query = Query::match("u:User") .traverse("u", "WORKS_AT", "c:Company", TraverseType::Inner) .build(); auto result = 
fixture->db()->query(query); @@ -366,7 +366,7 @@ TEST_F(SmallDatasetTest, SimpleJoinCorrectness) { } TEST_F(SmallDatasetTest, ComplexJoinCorrectness) { - Query query = Query::from("u:User") + Query query = Query::match("u:User") .traverse("u", "FRIEND", "f:User", TraverseType::Inner) .traverse("f", "WORKS_AT", "c:Company", TraverseType::Inner) .build(); @@ -380,7 +380,7 @@ TEST_F(SmallDatasetTest, ComplexJoinCorrectness) { TEST_F(SmallDatasetTest, FilteredQueryCorrectness) { Query query = - Query::from("u:User").where("u.age", CompareOp::Gt, Value(50)).build(); + Query::match("u:User").where("u.age", CompareOp::Gt, Value(50)).build(); auto result = fixture->db()->query(query); ASSERT_TRUE(result.ok()); @@ -404,7 +404,7 @@ TEST_F(SmallDatasetTest, FilteredQueryCorrectness) { TEST_F(MediumDatasetTest, ScalabilityTest) { auto start_time = std::chrono::high_resolution_clock::now(); - Query query = Query::from("u:User") + Query query = Query::match("u:User") .traverse("u", "WORKS_AT", "c:Company", TraverseType::Inner) .build(); auto result = fixture->db()->query(query); @@ -429,7 +429,7 @@ TEST_F(LargeDatasetTest, PerformanceBaseline) { auto start = std::chrono::high_resolution_clock::now(); Query query = - Query::from("u:User") + Query::match("u:User") .traverse("u", "WORKS_AT", "c:Company", TraverseType::Inner) //.parallel(true) //.parallel_thread_count(4) diff --git a/tests/join_test.cpp b/tests/join_test.cpp index 5df8a24..f7d156f 100644 --- a/tests/join_test.cpp +++ b/tests/join_test.cpp @@ -96,7 +96,7 @@ std::shared_ptr setup_test_db() { TEST(JoinTest, MatchAll) { auto db = setup_test_db(); - Query query = Query::from("u:users").build(); + Query query = Query::match("u:users").build(); auto query_result = db->query(query); ASSERT_TRUE(query_result.ok()); @@ -116,7 +116,7 @@ TEST(JoinTest, UserFriendCompanyInnerJoin) { db->connect(1, "works-at", 1).ValueOrDie(); Query query = - Query::from("u:users") + Query::match("u:users") .traverse("u", "friend", "f:users", 
TraverseType::Inner) .traverse("f", "works-at", "c:companies", TraverseType::Inner) .build(); @@ -179,7 +179,7 @@ TEST(JoinTest, JoinFromSameNode) { db->connect(0, "friend", 1).ValueOrDie(); // alex -> bob db->connect(0, "friend", 2).ValueOrDie(); // alex -> jeff - Query query = Query::from("u:users") + Query query = Query::match("u:users") .traverse("u", "friend", "f:users", TraverseType::Inner) .build(); @@ -252,7 +252,7 @@ TEST(JoinTest, InnerJoinFromSameNodeMultiTarget) { db->connect(0, "works-at", 1).ValueOrDie(); // alex -> google Query query = - Query::from("u:users") + Query::match("u:users") .traverse("u", "friend", "f:users", TraverseType::Inner) .traverse("u", "works-at", "c:companies", TraverseType::Inner) .build(); @@ -340,7 +340,7 @@ TEST(JoinTest, InnerJoinFromSameNodeAndEndConnections) { db->connect(2, "works-at", 2).ValueOrDie(); // jeff -> aws Query query = - Query::from("u:users") + Query::match("u:users") .traverse("u", "friend", "f:users", TraverseType::Inner) .traverse("u", "works-at", "c:companies", TraverseType::Inner) .build(); @@ -430,7 +430,7 @@ TEST(JoinTest, EmptyResultFromInnerJoin) { // Query that will return no results because jeff doesn't work anywhere Query query = - Query::from("u:users") + Query::match("u:users") .traverse("u", "friend", "f1:users", TraverseType::Inner) .traverse("f1", "friend", "f2:users", TraverseType::Inner) .traverse("f2", "works-at", "c:companies", TraverseType::Inner) @@ -465,7 +465,7 @@ TEST(JoinTest, MultiPathToSameTarget) { // Query: Find all friends of alex who work at the same company as alex Query query = - Query::from("u:users") + Query::match("u:users") .traverse("u", "friend", "f:users", TraverseType::Inner) .traverse("u", "works-at", "c1:companies", TraverseType::Inner) .traverse("f", "works-at", "c2:companies", TraverseType::Inner) @@ -540,7 +540,7 @@ TEST(JoinTest, CartesianProductExplosion) { // Query: Friends of alex and where they work // Results in 3 friends × ~2 companies each = ~6 rows 
total Query query = - Query::from("u:users") + Query::match("u:users") .traverse("u", "friend", "f:users", TraverseType::Inner) .traverse("f", "works-at", "c:companies", TraverseType::Inner) .build(); @@ -588,7 +588,7 @@ TEST(JoinTest, LeftJoin) { // LEFT JOIN: Keep all users even if they don't work at any company Query query = - Query::from("u:users") + Query::match("u:users") .traverse("u", "friend", "f:users", TraverseType::Inner) .traverse("f", "works-at", "c:companies", TraverseType::Left) .build(); @@ -700,7 +700,7 @@ TEST(JoinTest, RightJoin) { // RIGHT JOIN: Keep all companies even if no users work there Query query = - Query::from("u:users") + Query::match("u:users") .traverse("u", "friend", "f:users", TraverseType::Inner) .traverse("f", "works-at", "c:companies", TraverseType::Right) .build(); @@ -751,7 +751,7 @@ TEST(JoinTest, CombinedJoinTypes) { // Query that combines INNER, LEFT and RIGHT joins Query query = - Query::from("u:users") + Query::match("u:users") .traverse("u", "friend", "f:users", TraverseType::Left) .traverse("f", "works-at", "c:companies", TraverseType::Right) .build(); @@ -885,7 +885,7 @@ TEST(JoinTest, MultiLevelLeftJoin) { // Multi-level LEFT JOINs: Keep all users at each level Query query = - Query::from("u:users") + Query::match("u:users") .traverse("u", "friend", "f:users", TraverseType::Left) .traverse("f", "works-at", "c:companies", TraverseType::Left) .traverse("f", "likes", "l:companies", TraverseType::Left) @@ -1018,7 +1018,7 @@ TEST(JoinTest, SelfJoinWithLeftJoin) { // LEFT JOIN with self: Find all management chains, including users with no // manager or subordinates Query query = - Query::from("manager:users") + Query::match("manager:users") .traverse("manager", "manages", "employee:users", TraverseType::Left) .build(); @@ -1140,7 +1140,7 @@ TEST(JoinTest, FullOuterJoin) { // FULL OUTER JOIN: Keep all records from both sides Query query = - Query::from("u:users") + Query::match("u:users") .traverse("u", "friend", "f:users", 
TraverseType::Full) .traverse("f", "works-at", "c:companies", TraverseType::Full) .build(); @@ -1290,7 +1290,7 @@ TEST(JoinTest, SelectClauseFiltering) { // Query with SELECT - only get user (u) and friend (f) columns Query query = - Query::from("u:users") + Query::match("u:users") .traverse("u", "friend", "f:users", TraverseType::Inner) .traverse("f", "works-at", "c:companies", TraverseType::Inner) .select({"u", "f"}) // Only select u.* and f.* columns @@ -1369,7 +1369,7 @@ TEST(JoinTest, SelectSpecificColumns) { // Query with SELECT for specific columns Query query = - Query::from("u:users") + Query::match("u:users") .traverse("u", "friend", "f:users", TraverseType::Inner) .select({"u.name", "f.age"}) // Only select specific columns .build(); @@ -1533,7 +1533,7 @@ TEST(JoinTest, MultiPatternPathThroughFriends) { // Run the query: MATCH (u:User)-[:FRIEND INNER]->(f:User), (f)-[:WORKS_AT // INNER]->(c:Company) Query query_custom = - Query::from("u:User") + Query::match("u:User") .traverse("u", "FRIEND", "f:User", TraverseType::Inner) .traverse("f", "WORKS_AT", "c:Company", TraverseType::Inner) .build(); @@ -1651,7 +1651,7 @@ TEST(JoinTest, MultiPatternWithSharedVars) { db->connect(2, "WORKS_AT", 1).ValueOrDie(); // Jeff -> Google (Company ID 1) db->connect(1, "WORKS_AT", 0).ValueOrDie(); // Bob -> IBM (Company ID 0) - Query query = Query::from("u:users") + Query query = Query::match("u:users") .traverse("u", "FRIEND", "f:users") .traverse("f", "WORKS_AT", "c:companies") .traverse("u", "WORKS_AT", "c") @@ -1727,7 +1727,7 @@ TEST(JoinTest, FullJoinFriendRelationship) { db->connect(0, "friend", 1).ValueOrDie(); // alex -> bob db->connect(0, "friend", 2).ValueOrDie(); // alex -> jeff - Query query = Query::from("u:users") + Query query = Query::match("u:users") .traverse("u", "friend", "f:users", TraverseType::Full) .build(); diff --git a/tests/snapshot_test.cpp b/tests/snapshot_test.cpp index 1fd7cd7..74725d4 100644 --- a/tests/snapshot_test.cpp +++ 
b/tests/snapshot_test.cpp @@ -349,7 +349,7 @@ TEST_F(DatabaseSnapshotTest, SnapshotReloadPreservesEdgeSchemaProperties) { auto new_db = create_test_database(); ASSERT_TRUE(new_db->initialize().ValueOrDie()); - auto q = Query::from("u:User") + auto q = Query::match("u:User") .traverse("u", "WORKS_AT", "c:Company", TraverseType::Inner, "e") .select({"u.name", "e.since", "e.role"}) .build(); diff --git a/tests/temporal_query_test.cpp b/tests/temporal_query_test.cpp index 07c8d7a..cd6ff23 100644 --- a/tests/temporal_query_test.cpp +++ b/tests/temporal_query_test.cpp @@ -115,7 +115,7 @@ TEST_F(TemporalQueryTest, NodeUpdateAtDifferentTimes) { ASSERT_TRUE(update_result2.ok()) << update_result2.status(); // Query current version (at t2): should see age=27 - auto query_current = Query::from("u:User") + auto query_current = Query::match("u:User") .where("u.name", CompareOp::Eq, Value("Alice")) .build(); auto result_current = db_->query(query_current); @@ -135,7 +135,7 @@ TEST_F(TemporalQueryTest, NodeUpdateAtDifferentTimes) { // ======================================================================== // Query AS OF t0: should see age=25 (original version) - auto query_t0 = Query::from("u:User") + auto query_t0 = Query::match("u:User") .as_of_valid_time(t0_) .where("u.name", CompareOp::Eq, Value("Alice")) .build(); @@ -150,7 +150,7 @@ TEST_F(TemporalQueryTest, NodeUpdateAtDifferentTimes) { EXPECT_EQ(age_array_t0->Value(0), 25); // Versioning enabled! // Query AS OF t1: should see age=26 (first update) - auto query_t1 = Query::from("u:User") + auto query_t1 = Query::match("u:User") .as_of_valid_time(t1_) .where("u.name", CompareOp::Eq, Value("Alice")) .build(); @@ -165,7 +165,7 @@ TEST_F(TemporalQueryTest, NodeUpdateAtDifferentTimes) { EXPECT_EQ(age_array_t1->Value(0), 26); // Versioning enabled! 
// Query AS OF t2: should see age=27 (second update) - auto query_t2 = Query::from("u:User") + auto query_t2 = Query::match("u:User") .as_of_valid_time(t2_) .where("u.name", CompareOp::Eq, Value("Alice")) .build(); @@ -196,7 +196,7 @@ TEST_F(TemporalQueryTest, MultipleFieldUpdateAtSameTime) { ASSERT_TRUE(update2.ok()); // Query at current time: should see age=31, active=false - auto query = Query::from("u:User") + auto query = Query::match("u:User") .where("u.name", CompareOp::Eq, Value("Bob")) .build(); @@ -237,7 +237,7 @@ TEST_F(TemporalQueryTest, ClockAdvanceAndQuery) { ASSERT_TRUE(update2.ok()); // Query current: should see age=37 - auto query = Query::from("u:User") + auto query = Query::match("u:User") .where("u.name", CompareOp::Eq, Value("Charlie")) .build(); @@ -256,7 +256,7 @@ TEST_F(TemporalQueryTest, ClockAdvanceAndQuery) { // ======================================================================== // Query AS OF creation_time: should see age=35 - auto query_creation = Query::from("u:User") + auto query_creation = Query::match("u:User") .as_of_valid_time(creation_time) .where("u.name", CompareOp::Eq, Value("Charlie")) .build(); @@ -269,7 +269,7 @@ TEST_F(TemporalQueryTest, ClockAdvanceAndQuery) { EXPECT_EQ(age_creation->Value(0), 35); // Query AS OF update1_time: should see age=36 - auto query_update1 = Query::from("u:User") + auto query_update1 = Query::match("u:User") .as_of_valid_time(update1_time) .where("u.name", CompareOp::Eq, Value("Charlie")) .build(); @@ -282,7 +282,7 @@ TEST_F(TemporalQueryTest, ClockAdvanceAndQuery) { EXPECT_EQ(age_update1->Value(0), 36); // Query AS OF update2_time: should see age=37 - auto query_update2 = Query::from("u:User") + auto query_update2 = Query::match("u:User") .as_of_valid_time(update2_time) .where("u.name", CompareOp::Eq, Value("Charlie")) .build(); @@ -317,7 +317,7 @@ TEST_F(TemporalQueryTest, BitemporalQueryWithUpdates) { // ======================================================================== // Query AS 
OF (valid=t0, tx=t0): should see age=40 - auto query_t0_t0 = Query::from("u:User") + auto query_t0_t0 = Query::match("u:User") .as_of(t0_, t0_) .where("u.name", CompareOp::Eq, Value("Diana")) .build(); @@ -329,7 +329,7 @@ TEST_F(TemporalQueryTest, BitemporalQueryWithUpdates) { EXPECT_EQ(age_t0_t0->Value(0), 40); // Query AS OF (valid=t1, tx=t1): should see age=41 - auto query_t1_t1 = Query::from("u:User") + auto query_t1_t1 = Query::match("u:User") .as_of(t1_, t1_) .where("u.name", CompareOp::Eq, Value("Diana")) .build(); @@ -341,7 +341,7 @@ TEST_F(TemporalQueryTest, BitemporalQueryWithUpdates) { EXPECT_EQ(age_t1_t1->Value(0), 41); // Query AS OF (valid=t2, tx=t2): should see age=42 - auto query_t2_t2 = Query::from("u:User") + auto query_t2_t2 = Query::match("u:User") .as_of(t2_, t2_) .where("u.name", CompareOp::Eq, Value("Diana")) .build(); @@ -354,7 +354,7 @@ TEST_F(TemporalQueryTest, BitemporalQueryWithUpdates) { // Query AS OF (valid=t0, tx=t2): "What did we know at t2 about t0?" // Should see age=40 (the value that was true at t0) - auto query_t0_tx_t2 = Query::from("u:User") + auto query_t0_tx_t2 = Query::match("u:User") .as_of(t0_, t2_) .where("u.name", CompareOp::Eq, Value("Diana")) .build(); @@ -382,7 +382,7 @@ TEST_F(TemporalQueryTest, TemporalQueryBetweenUpdateTimes) { uint64_t t_mid = (t0_ + t1_) / 2; // Query AS OF t_mid (between t0 and t1): should see age=50 (the t0 version) - auto query_mid = Query::from("u:User") + auto query_mid = Query::match("u:User") .as_of_valid_time(t_mid) .where("u.name", CompareOp::Eq, Value("Eve")) .build(); @@ -400,7 +400,7 @@ TEST_F(TemporalQueryTest, CurrentVersionQuery) { int64_t user_id = create_simple_user("Alice", 27); // Query current version (no AS OF clause) - auto query = Query::from("u:User") + auto query = Query::match("u:User") .where("u.name", CompareOp::Eq, Value("Alice")) .build(); @@ -423,7 +423,7 @@ TEST_F(TemporalQueryTest, AsOfValidTimeQuery) { int64_t user_id = create_simple_user("Alice", 25); // Query 
at t1 - auto query = Query::from("u:User") + auto query = Query::match("u:User") .as_of_valid_time(t1_) .where("u.name", CompareOp::Eq, Value("Alice")) .build(); @@ -445,7 +445,7 @@ TEST_F(TemporalQueryTest, AsOfTxTimeQuery) { int64_t user_id = create_simple_user("Alice", 26); // Query as of transaction time t1 - auto query = Query::from("u:User") + auto query = Query::match("u:User") .as_of_tx_time(t1_) .where("u.name", CompareOp::Eq, Value("Alice")) .build(); @@ -467,7 +467,7 @@ TEST_F(TemporalQueryTest, BitemporalQuery) { int64_t user_id = create_simple_user("Alice", 26); // Query both dimensions: valid_time=t1, tx_time=t1 - auto query = Query::from("u:User") + auto query = Query::match("u:User") .as_of(t1_, t1_) .where("u.name", CompareOp::Eq, Value("Alice")) .build(); @@ -490,7 +490,7 @@ TEST_F(TemporalQueryTest, TemporalQueryWithWhereClause) { create_simple_user("Bob", 30); // Query at t0 where age > 26 (should find only Bob) - auto query = Query::from("u:User") + auto query = Query::match("u:User") .as_of_valid_time(t0_) .where("u.age", CompareOp::Gt, Value(26)) .build(); @@ -517,7 +517,7 @@ TEST_F(TemporalQueryTest, TemporalSnapshotInQueryState) { int64_t user_id = create_simple_user("Alice", 25); // Create query with temporal snapshot - auto query = Query::from("u:User") + auto query = Query::match("u:User") .as_of_valid_time(t1_) .where("u.name", CompareOp::Eq, Value("Alice")) .build(); @@ -539,7 +539,7 @@ TEST_F(TemporalQueryTest, QueryBeforeFirstVersion) { // Query before t0 (should return current data as versioning not fully enabled // yet) uint64_t before_t0 = t0_ - 1000000000ULL; // 1 second before t0 - auto query = Query::from("u:User") + auto query = Query::match("u:User") .as_of_valid_time(before_t0) .where("u.name", CompareOp::Eq, Value("Alice")) .build(); @@ -555,28 +555,28 @@ TEST_F(TemporalQueryTest, QueryBeforeFirstVersion) { TEST_F(TemporalQueryTest, AsOfBuilderMethods) { // Test as_of_valid_time() - auto query1 = 
Query::from("u:User").as_of_valid_time(t1_).build(); + auto query1 = Query::match("u:User").as_of_valid_time(t1_).build(); ASSERT_TRUE(query1.temporal_snapshot().has_value()); EXPECT_EQ(query1.temporal_snapshot()->valid_time, t1_); EXPECT_EQ(query1.temporal_snapshot()->tx_time, std::numeric_limits::max()); // Test as_of_tx_time() - auto query2 = Query::from("u:User").as_of_tx_time(t2_).build(); + auto query2 = Query::match("u:User").as_of_tx_time(t2_).build(); ASSERT_TRUE(query2.temporal_snapshot().has_value()); EXPECT_EQ(query2.temporal_snapshot()->valid_time, std::numeric_limits::max()); EXPECT_EQ(query2.temporal_snapshot()->tx_time, t2_); // Test as_of() with both dimensions - auto query3 = Query::from("u:User").as_of(t1_, t2_).build(); + auto query3 = Query::match("u:User").as_of(t1_, t2_).build(); ASSERT_TRUE(query3.temporal_snapshot().has_value()); EXPECT_EQ(query3.temporal_snapshot()->valid_time, t1_); EXPECT_EQ(query3.temporal_snapshot()->tx_time, t2_); // Test chaining: as_of_valid_time() then as_of_tx_time() auto query4 = - Query::from("u:User").as_of_valid_time(t1_).as_of_tx_time(t2_).build(); + Query::match("u:User").as_of_valid_time(t1_).as_of_tx_time(t2_).build(); ASSERT_TRUE(query4.temporal_snapshot().has_value()); EXPECT_EQ(query4.temporal_snapshot()->valid_time, t1_); EXPECT_EQ(query4.temporal_snapshot()->tx_time, t2_); @@ -600,7 +600,7 @@ TEST_F(TemporalQueryTest, NullFieldInVersionChain) { ASSERT_TRUE(update2.ok()); // Query at t0: should see age=25 - auto query_t0 = Query::from("u:User") + auto query_t0 = Query::match("u:User") .as_of_valid_time(t0_) .where("u.name", CompareOp::Eq, Value("Alice")) .build(); @@ -616,7 +616,7 @@ TEST_F(TemporalQueryTest, NullFieldInVersionChain) { EXPECT_EQ(age_array_t0->Value(0), 25); // Query at t1: should see age=30 - auto query_t1 = Query::from("u:User") + auto query_t1 = Query::match("u:User") .as_of_valid_time(t1_) .where("u.name", CompareOp::Eq, Value("Alice")) .build(); @@ -632,7 +632,7 @@ 
TEST_F(TemporalQueryTest, NullFieldInVersionChain) { EXPECT_EQ(age_array_t1->Value(0), 30); // Query at t2: should see age=NULL - auto query_t2 = Query::from("u:User") + auto query_t2 = Query::match("u:User") .as_of_valid_time(t2_) .where("u.name", CompareOp::Eq, Value("Alice")) .build(); @@ -655,7 +655,7 @@ TEST_F(TemporalQueryTest, NodeNotVisibleBeforeCreation) { // Query at t0 (before node creation): should return 0 rows // Because the node's valid_from = t1, it shouldn't be visible at t0 - auto query_before = Query::from("u:User") + auto query_before = Query::match("u:User") .as_of_valid_time(t0_) .build(); // No WHERE clause - get all users at t0 auto result_before = db_->query(query_before); @@ -667,7 +667,7 @@ TEST_F(TemporalQueryTest, NodeNotVisibleBeforeCreation) { EXPECT_EQ(table_before->num_rows(), 0); // Query at t1 (at creation): should return 1 row - auto query_at_creation = Query::from("u:User") + auto query_at_creation = Query::match("u:User") .as_of_valid_time(t1_) .where("u.name", CompareOp::Eq, Value("Alice")) .build(); @@ -676,7 +676,7 @@ TEST_F(TemporalQueryTest, NodeNotVisibleBeforeCreation) { EXPECT_EQ(result_at.ValueOrDie()->table()->num_rows(), 1); // Query at t2 (after creation): should also return 1 row - auto query_after = Query::from("u:User") + auto query_after = Query::match("u:User") .as_of_valid_time(t2_) .where("u.name", CompareOp::Eq, Value("Alice")) .build(); @@ -711,7 +711,7 @@ TEST_F(TemporalQueryTest, MultipleNodesIndependentVersions) { // Query at t0: should see only Alice (age=25) // Bob's valid_from = t1 > t0, so Bob should NOT be visible at t0 - auto query_t0 = Query::from("u:User").as_of_valid_time(t0_).build(); + auto query_t0 = Query::match("u:User").as_of_valid_time(t0_).build(); auto result_t0 = db_->query(query_t0); ASSERT_TRUE(result_t0.ok()); auto table_t0 = result_t0.ValueOrDie()->table(); @@ -730,14 +730,14 @@ TEST_F(TemporalQueryTest, MultipleNodesIndependentVersions) { EXPECT_EQ(age_array_t0->Value(0), 25); // 
Query at t1: should see Alice (age=26) and Bob (age=30) - auto query_t1 = Query::from("u:User").as_of_valid_time(t1_).build(); + auto query_t1 = Query::match("u:User").as_of_valid_time(t1_).build(); auto result_t1 = db_->query(query_t1); ASSERT_TRUE(result_t1.ok()); auto table_t1 = result_t1.ValueOrDie()->table(); EXPECT_EQ(table_t1->num_rows(), 2); // Query at t2: should see Alice (age=26) and Bob (age=31) - auto query_t2 = Query::from("u:User").as_of_valid_time(t2_).build(); + auto query_t2 = Query::match("u:User").as_of_valid_time(t2_).build(); auto result_t2 = db_->query(query_t2); ASSERT_TRUE(result_t2.ok()); auto table_t2 = result_t2.ValueOrDie()->table(); @@ -789,7 +789,7 @@ TEST_F(TemporalQueryTest, VersioningDisabledFallback) { // Temporal query at t0 (should return CURRENT version, not historical) // Because versioning is disabled, no history is kept - auto query_past = Query::from("u:User") + auto query_past = Query::match("u:User") .as_of_valid_time(t0_) .where("u.name", CompareOp::Eq, Value("Alice")) .build(); @@ -807,7 +807,7 @@ TEST_F(TemporalQueryTest, VersioningDisabledFallback) { EXPECT_EQ(age_array->Value(0), 26); // Current value, not historical // Current query should also return age=26 - auto query_current = Query::from("u:User") + auto query_current = Query::match("u:User") .where("u.name", CompareOp::Eq, Value("Alice")) .build(); auto result_current = db_no_version->query(query_current); @@ -870,7 +870,7 @@ TEST_F(TemporalQueryTest, NoOpUpdateDoesNotCreateNewVersion) { EXPECT_EQ(version_count_after, version_count_before + 1); // Query at t0: should see age=25 - auto query_t0 = Query::from("u:User") + auto query_t0 = Query::match("u:User") .as_of_valid_time(t0_) .where("u.name", CompareOp::Eq, Value("Alice")) .build(); @@ -885,7 +885,7 @@ TEST_F(TemporalQueryTest, NoOpUpdateDoesNotCreateNewVersion) { EXPECT_EQ(age_array_t0->Value(0), 25); // Query at t1: should also see age=25 (no change) - auto query_t1 = Query::from("u:User") + auto 
query_t1 = Query::match("u:User") .as_of_valid_time(t1_) .where("u.name", CompareOp::Eq, Value("Alice")) .build(); diff --git a/tests/update_query_join_test.cpp b/tests/update_query_join_test.cpp index 933c6c9..f10c6c4 100644 --- a/tests/update_query_join_test.cpp +++ b/tests/update_query_join_test.cpp @@ -84,7 +84,7 @@ class UpdateJoinCrossSchemaTest : public ::testing::Test { template T get_field(const std::string& schema, int64_t id, const std::string& field_name) { - auto query = Query::from("_:" + schema).build(); + auto query = Query::match("_:" + schema).build(); auto result = db_->query(query).ValueOrDie(); auto table = result->table(); auto ids = get_column_values(table, "_.id").ValueOrDie(); @@ -112,7 +112,7 @@ TEST_F(UpdateJoinCrossSchemaTest, UpdateBothSidesOfTraversal) { // MATCH (u:User)-[:WORKS_AT]->(c:Company) // WHERE c.name = "Acme" // SET u.employed = true, c.size = 1 - auto q = Query::from("u:User") + auto q = Query::match("u:User") .traverse("u", "WORKS_AT", "c:Company") .where("c.name", CompareOp::Eq, Value("Acme"s)) .build(); @@ -144,7 +144,8 @@ TEST_F(UpdateJoinCrossSchemaTest, UpdateBothSidesOfTraversal) { TEST_F(UpdateJoinCrossSchemaTest, UpdateOnlyUserSide) { // Only update User.employed, leave Company untouched - auto q = Query::from("u:User").traverse("u", "WORKS_AT", "c:Company").build(); + auto q = + Query::match("u:User").traverse("u", "WORKS_AT", "c:Company").build(); auto uq = UpdateQuery::match(q).set("u.employed", Value(true)).build(); auto result = db_->update(uq); @@ -161,7 +162,7 @@ TEST_F(UpdateJoinCrossSchemaTest, UpdateOnlyUserSide) { TEST_F(UpdateJoinCrossSchemaTest, UpdateOnlyCompanySide) { // Only update Company.size, leave User untouched - auto q = Query::from("u:User") + auto q = Query::match("u:User") .traverse("u", "WORKS_AT", "c:Company") .where("c.name", CompareOp::Eq, Value("Acme"s)) .build(); @@ -180,7 +181,7 @@ TEST_F(UpdateJoinCrossSchemaTest, UpdateOnlyCompanySide) { } TEST_F(UpdateJoinCrossSchemaTest, 
UpdateWithEdgeAliasTraversal) { - auto q = Query::from("u:User") + auto q = Query::match("u:User") .traverse("u", "WORKS_AT", "c:Company", TraverseType::Inner, std::optional{"e"}) .where("c.name", CompareOp::Eq, Value("Acme"s)) @@ -197,7 +198,7 @@ TEST_F(UpdateJoinCrossSchemaTest, UpdateWithEdgeAliasTraversal) { } TEST_F(UpdateJoinCrossSchemaTest, FilterByEdgeFieldAndSelectEdgeFields) { - auto query = Query::from("u:User") + auto query = Query::match("u:User") .traverse("u", "WORKS_AT", "c:Company", TraverseType::Inner, std::optional{"e"}) .where("e.since", CompareOp::Gte, Value(int64_t(2021))) @@ -220,7 +221,7 @@ TEST_F(UpdateJoinCrossSchemaTest, FilterByEdgeFieldAndSelectEdgeFields) { } TEST_F(UpdateJoinCrossSchemaTest, UpdateEdgeFieldByMatchAlias) { - auto q = Query::from("u:User") + auto q = Query::match("u:User") .traverse("u", "WORKS_AT", "c:Company", TraverseType::Inner, std::optional{"e"}) .where("u.name", CompareOp::Eq, Value("Alice"s)) @@ -232,7 +233,7 @@ TEST_F(UpdateJoinCrossSchemaTest, UpdateEdgeFieldByMatchAlias) { EXPECT_EQ(update_res.ValueOrDie().failed_count, 0); EXPECT_EQ(update_res.ValueOrDie().updated_count, 1); - auto verify = Query::from("u:User") + auto verify = Query::match("u:User") .traverse("u", "WORKS_AT", "c:Company", TraverseType::Inner, std::optional{"e"}) .where("u.name", CompareOp::Eq, Value("Alice"s)) @@ -245,7 +246,7 @@ TEST_F(UpdateJoinCrossSchemaTest, UpdateEdgeFieldByMatchAlias) { } TEST_F(UpdateJoinCrossSchemaTest, SelectEdgeAliasReturnsOnlyUserDefinedFields) { - auto query = Query::from("u:User") + auto query = Query::match("u:User") .traverse("u", "WORKS_AT", "c:Company", TraverseType::Inner, std::optional{"e"}) .select({"e"}) @@ -281,7 +282,7 @@ TEST_F(UpdateJoinCrossSchemaTest, SelectEdgeAliasReturnsOnlyUserDefinedFields) { TEST_F(UpdateJoinCrossSchemaTest, TraversalWithNoMatchUpdatesNothing) { // WHERE c.name = "NonExistent" → no rows - auto q = Query::from("u:User") + auto q = Query::match("u:User") .traverse("u", 
"WORKS_AT", "c:Company") .where("c.name", CompareOp::Eq, Value("NonExistent"s)) .build(); @@ -302,7 +303,7 @@ TEST_F(UpdateJoinCrossSchemaTest, TraversalWithNoMatchUpdatesNothing) { TEST_F(UpdateJoinCrossSchemaTest, DuplicateAliasForNodeAndEdgeFails) { // "u" is already used as a node alias (u:User); reusing it as an edge alias // must fail during query preparation. - auto query = Query::from("u:User") + auto query = Query::match("u:User") .traverse("u", "WORKS_AT", "c:Company", TraverseType::Inner, std::optional{"u"}) .build(); @@ -354,7 +355,7 @@ class UpdateJoinSameSchemaTest : public ::testing::Test { template T get_field(const std::string& schema, int64_t id, const std::string& field_name) { - auto query = Query::from("_:" + schema).build(); + auto query = Query::match("_:" + schema).build(); auto result = db_->query(query).ValueOrDie(); auto table = result->table(); auto ids = get_column_values(table, "_.id").ValueOrDie(); @@ -382,7 +383,7 @@ TEST_F(UpdateJoinSameSchemaTest, UpdateBothSidesOfFriendship) { // MATCH (u:User)-[:FRIEND]->(f:User) // SET u.has_friend = true, f.has_friend = true - auto q = Query::from("u:User").traverse("u", "FRIEND", "f:User").build(); + auto q = Query::match("u:User").traverse("u", "FRIEND", "f:User").build(); auto uq = UpdateQuery::match(q) .set("u.has_friend", Value(true)) .set("f.has_friend", Value(true)) @@ -407,7 +408,7 @@ TEST_F(UpdateJoinSameSchemaTest, UpdateBothSidesOfFriendship) { TEST_F(UpdateJoinSameSchemaTest, UpdateOnlySourceSide) { // Only update the source alias "u" - auto q = Query::from("u:User").traverse("u", "FRIEND", "f:User").build(); + auto q = Query::match("u:User").traverse("u", "FRIEND", "f:User").build(); auto uq = UpdateQuery::match(q).set("u.has_friend", Value(true)).build(); auto result = db_->update(uq); @@ -424,7 +425,7 @@ TEST_F(UpdateJoinSameSchemaTest, UpdateOnlySourceSide) { TEST_F(UpdateJoinSameSchemaTest, UpdateOnlyTargetSide) { // Only update the target alias "f" - auto q = 
Query::from("u:User").traverse("u", "FRIEND", "f:User").build(); + auto q = Query::match("u:User").traverse("u", "FRIEND", "f:User").build(); auto uq = UpdateQuery::match(q).set("f.has_friend", Value(true)).build(); auto result = db_->update(uq); @@ -444,7 +445,7 @@ TEST_F(UpdateJoinSameSchemaTest, UpdateOnlyTargetSide) { TEST_F(UpdateJoinSameSchemaTest, UpdateWithWhereOnTarget) { // Only update friends named "Bob" - auto q = Query::from("u:User") + auto q = Query::match("u:User") .traverse("u", "FRIEND", "f:User") .where("f.name", CompareOp::Eq, Value("Bob"s)) .build(); diff --git a/tests/update_query_test.cpp b/tests/update_query_test.cpp index 9c63525..04d667a 100644 --- a/tests/update_query_test.cpp +++ b/tests/update_query_test.cpp @@ -87,7 +87,7 @@ class UpdateQueryTest : public ::testing::Test { T get_field(const std::string& schema, int64_t id, const std::string& field_name) { const std::string alias = "_"; - auto query = Query::from(alias + ":" + schema).build(); + auto query = Query::match(alias + ":" + schema).build(); auto result = db_->query(query).ValueOrDie(); auto table = result->table(); auto ids = get_column_values(table, alias + ".id").ValueOrDie(); @@ -142,12 +142,12 @@ TEST_F(UpdateQueryTest, BuilderDefaultUpdateTypeIsSET) { // ========================================================================= TEST_F(UpdateQueryTest, MatchRequiresAtLeastOneSet) { - auto q = Query::from("u:User").build(); + auto q = Query::match("u:User").build(); EXPECT_THROW((UpdateQuery::match(q).build()), std::runtime_error); } TEST_F(UpdateQueryTest, MatchStoresQuery) { - auto q = Query::from("u:User") + auto q = Query::match("u:User") .where("u.city", CompareOp::Eq, Value("NYC"s)) .build(); auto uq = UpdateQuery::match(q).set("u.age", Value(31)).build(); @@ -156,7 +156,8 @@ TEST_F(UpdateQueryTest, MatchStoresQuery) { } TEST_F(UpdateQueryTest, MatchTargetAliasesFromSetFields) { - auto q = Query::from("u:User").traverse("u", "WORKS_AT", "c:Company").build(); + auto q 
= + Query::match("u:User").traverse("u", "WORKS_AT", "c:Company").build(); auto uq = UpdateQuery::match(q) .set("u.salary", Value(int32_t(0))) .set("c.size", Value(int32_t(9))) @@ -247,7 +248,7 @@ TEST_F(UpdateQueryTest, UpdateByIdInvalidSchema) { TEST_F(UpdateQueryTest, UpdateByMatchSimpleWhere) { // All NYC users: Alice(0), Bob(1), Eve(4) - auto q = Query::from("u:User") + auto q = Query::match("u:User") .where("u.city", CompareOp::Eq, Value("NYC"s)) .build(); auto uq = @@ -270,7 +271,7 @@ TEST_F(UpdateQueryTest, UpdateByMatchSimpleWhere) { } TEST_F(UpdateQueryTest, UpdateByMatchSingleResult) { - auto q = Query::from("u:User") + auto q = Query::match("u:User") .where("u.name", CompareOp::Eq, Value("Alice"s)) .build(); auto uq = UpdateQuery::match(q).set("u.age", Value(int32_t(26))).build(); @@ -283,7 +284,7 @@ TEST_F(UpdateQueryTest, UpdateByMatchSingleResult) { } TEST_F(UpdateQueryTest, UpdateByMatchNoResults) { - auto q = Query::from("u:User") + auto q = Query::match("u:User") .where("u.name", CompareOp::Eq, Value("Nobody"s)) .build(); auto uq = UpdateQuery::match(q).set("u.age", Value(int32_t(0))).build(); @@ -295,7 +296,7 @@ TEST_F(UpdateQueryTest, UpdateByMatchNoResults) { TEST_F(UpdateQueryTest, UpdateByMatchCompoundAnd) { // age > 30 AND city = "NYC" → Bob(35,NYC), Eve(55,NYC) - auto q = Query::from("u:User") + auto q = Query::match("u:User") .where("u.age", CompareOp::Gt, Value(int32_t(30))) .and_where("u.city", CompareOp::Eq, Value("NYC"s)) .build(); @@ -311,7 +312,7 @@ TEST_F(UpdateQueryTest, UpdateByMatchCompoundAnd) { } TEST_F(UpdateQueryTest, UpdateByMatchMultipleSetFields) { - auto q = Query::from("u:User") + auto q = Query::match("u:User") .where("u.name", CompareOp::Eq, Value("Alice"s)) .build(); auto uq = UpdateQuery::match(q) @@ -333,7 +334,7 @@ TEST_F(UpdateQueryTest, UpdateByMatchMultipleSetFields) { TEST_F(UpdateQueryTest, UpdateByMatchWithTraversal) { // Update users who work at TechCorp: Alice(0), Bob(1) - auto q = Query::from("u:User") + 
auto q = Query::match("u:User") .traverse("u", "WORKS_AT", "c:Company") .where("c.name", CompareOp::Eq, Value("TechCorp"s)) .build(); @@ -357,7 +358,7 @@ TEST_F(UpdateQueryTest, UpdateByMatchWithTraversal) { TEST_F(UpdateQueryTest, UpdateMultiSchemaViaTraversal) { // UPDATE users who work at TechCorp AND update TechCorp itself - auto q = Query::from("u:User") + auto q = Query::match("u:User") .traverse("u", "WORKS_AT", "c:Company") .where("c.name", CompareOp::Eq, Value("TechCorp"s)) .build(); @@ -384,7 +385,7 @@ TEST_F(UpdateQueryTest, UpdateMultiSchemaViaTraversal) { // ========================================================================= TEST_F(UpdateQueryTest, UpdateByMatchBadAliasInSet) { - auto q = Query::from("u:User").build(); + auto q = Query::match("u:User").build(); auto uq = UpdateQuery::match(q) .set("x.salary", Value(int32_t(0))) // "x" not in MATCH .build(); @@ -394,7 +395,7 @@ TEST_F(UpdateQueryTest, UpdateByMatchBadAliasInSet) { } TEST_F(UpdateQueryTest, UpdateByMatchUnqualifiedFieldFails) { - auto q = Query::from("u:User").build(); + auto q = Query::match("u:User").build(); auto uq = UpdateQuery::match(q) .set("salary", Value(int32_t(0))) // missing alias .build(); @@ -469,7 +470,7 @@ TEST_F(UpdateQueryTest, UpdateByMatchSupportsMapKeySet) { db_->create_node("MapUser", {{"name", Value{"Nina"}}}).ValueOrDie(); db_->create_node("MapUser", {{"name", Value{"Omar"}}}).ValueOrDie(); - auto q = Query::from("m:MapUser") + auto q = Query::match("m:MapUser") .where("m.name", CompareOp::Eq, Value("Nina"s)) .build(); auto uq = @@ -506,7 +507,7 @@ TEST_F(UpdateQueryTest, db_->get_schema_registry()->create("MapUserDepth", map_schema).ValueOrDie(); db_->create_node("MapUserDepth", {{"name", Value{"Nina"}}}).ValueOrDie(); - auto q = Query::from("m:MapUserDepth") + auto q = Query::match("m:MapUserDepth") .where("m.name", CompareOp::Eq, Value("Nina"s)) .build(); auto uq = UpdateQuery::match(q) diff --git a/tests/where_expression_test.cpp 
b/tests/where_expression_test.cpp index 3b8c57d..9f23c6e 100644 --- a/tests/where_expression_test.cpp +++ b/tests/where_expression_test.cpp @@ -135,7 +135,8 @@ class WhereExpressionTest : public ::testing::Test { // Test simple WHERE expressions TEST_F(WhereExpressionTest, SimpleWhereCondition) { // Test basic WHERE clause - Query query = Query::from("u:User").where("u.age", CompareOp::Gt, 40).build(); + Query query = + Query::match("u:User").where("u.age", CompareOp::Gt, 40).build(); auto result = db_->query(query); ASSERT_OK(result); @@ -153,7 +154,7 @@ TEST_F(WhereExpressionTest, SimpleWhereCondition) { // Test compound WHERE with AND - fluent API TEST_F(WhereExpressionTest, CompoundWhereAndFluent) { // Test: age > 30 AND city = "NYC" - Query query = Query::from("u:User") + Query query = Query::match("u:User") .where("u.age", CompareOp::Gt, 30) .and_where("u.city", CompareOp::Eq, "NYC") .build(); @@ -178,7 +179,7 @@ TEST_F(WhereExpressionTest, CompoundWhereAndFluent) { TEST_F(WhereExpressionTest, CompoundWhereOrFluent) { Logger::get_instance().set_level(LogLevel::DEBUG); // Test: city = "SF" OR salary > 150000 - Query query = Query::from("u:User") + Query query = Query::match("u:User") .where("u.city", CompareOp::Eq, "SF") .or_where("u.salary", CompareOp::Gt, 150000) .build(); @@ -220,7 +221,7 @@ TEST_F(WhereExpressionTest, ComplexExpressionWithPrecedence) { // age > 30 AND (city = "NYC" OR salary > 150000) auto final_expr = LogicalExpr::and_expr(age_condition, or_expr); - Query query = Query::from("u:User").where_logical_expr(final_expr).build(); + Query query = Query::match("u:User").where_logical_expr(final_expr).build(); auto result = db_->query(query); ASSERT_OK(result); @@ -245,7 +246,7 @@ TEST_F(WhereExpressionTest, ComplexExpressionWithPrecedence) { // Test inline WHERE with simple condition TEST_F(WhereExpressionTest, InlineWhereSimple) { // Test inline optimization with simple WHERE - Query query = Query::from("u:User") + Query query = 
Query::match("u:User") .traverse("u", "FRIEND", "f:User") .where("f.age", CompareOp::Gt, 40) .inline_where() @@ -268,7 +269,7 @@ TEST_F(WhereExpressionTest, InlineWhereSimple) { // Test inline WHERE with compound condition TEST_F(WhereExpressionTest, InlineWhereCompound) { // Test inline optimization with compound WHERE: f.age > 25 AND f.city = "NYC" - Query query = Query::from("u:User") + Query query = Query::match("u:User") .traverse("u", "FRIEND", "f:User") .where("f.age", CompareOp::Gt, 25) .and_where("f.city", CompareOp::Eq, "NYC") @@ -302,7 +303,7 @@ TEST_F(WhereExpressionTest, MultipleDifferentPrecedence) { // Left-to-right: (age > 40 AND city = "LA") OR salary > 100000 // This will match: Bob(salary), Charlie(salary), Eve(salary), Henry(salary), // Jack(both) = 5 users - Query query_left = Query::from("u:User") + Query query_left = Query::match("u:User") .where("u.age", CompareOp::Gt, 40) .and_where("u.city", CompareOp::Eq, "LA") .or_where("u.salary", CompareOp::Gt, 100000) @@ -325,7 +326,7 @@ TEST_F(WhereExpressionTest, MultipleDifferentPrecedence) { auto final_expr = LogicalExpr::and_expr(age_cond, or_part); Query query_explicit = - Query::from("u:User").where_logical_expr(final_expr).build(); + Query::match("u:User").where_logical_expr(final_expr).build(); auto result_explicit = db_->query(query_explicit); ASSERT_OK(result_explicit); @@ -368,7 +369,7 @@ TEST_F(WhereExpressionTest, ExpressionToString) { // Test error handling TEST_F(WhereExpressionTest, ErrorHandling) { // Test invalid field name - Query query = Query::from("u:User") + Query query = Query::match("u:User") .where("u.nonexistent", CompareOp::Eq, "value") .build(); @@ -444,7 +445,7 @@ TEST_F(WhereExpressionTest, PerformanceComparison) { // Test simple WHERE performance auto start = std::chrono::high_resolution_clock::now(); - Query query = Query::from("u:User") + Query query = Query::match("u:User") .where("u.age", CompareOp::Gt, 40) .and_where("u.city", CompareOp::Eq, "NYC") .build(); @@ 
-505,7 +506,7 @@ TEST_F(WhereExpressionTest, OrWithMultipleVariablesNotInlined) { auto final_expr = LogicalExpr::and_expr(age_condition, or_expr); // Create query that should match our test data - Query query = Query::from("a:User") + Query query = Query::match("a:User") .traverse("a", "WORKS_AT", "c:Company") .select({"a.age", "a.city", "c.size"}) // Explicitly select the fields we need @@ -540,7 +541,7 @@ TEST_F(WhereExpressionTest, TraversalWhereCombinations) { // Test Case 1: Single variable where clause (should be inlined) { - Query query = Query::from("u:User") + Query query = Query::match("u:User") .where("u.age", CompareOp::Gt, 35) .traverse("u", "WORKS_AT", "c:Company") @@ -561,7 +562,7 @@ TEST_F(WhereExpressionTest, TraversalWhereCombinations) { } TEST_F(WhereExpressionTest, TraversalWhereCombinations2) { - Query query = Query::from("u:User") + Query query = Query::match("u:User") .traverse("u", "WORKS_AT", "c:Company") .where("u.age", CompareOp::Gte, 35) .and_where("c.size", CompareOp::Gt, 1000) @@ -582,7 +583,7 @@ TEST_F(WhereExpressionTest, TraversalWhereCombinations2) { TEST_F(WhereExpressionTest, TraversalWhereCombinations3) { Query query = - Query::from("u:User") + Query::match("u:User") .where("u.age", CompareOp::Gte, 35) // Should be inlined .traverse("u", "WORKS_AT", "c:Company") .where("c.size", CompareOp::Gt, 1000) // Should be inlined @@ -620,7 +621,7 @@ TEST_F(WhereExpressionTest, QueryMaterializesMapColumn) { std::vector{"score"}}}) .ok()); - Query query = Query::from("m:MapUser").build(); + Query query = Query::match("m:MapUser").build(); auto result = db_->query(query); ASSERT_OK(result); @@ -662,7 +663,7 @@ TEST_F(WhereExpressionTest, QueryFiltersByMapProperty) { ASSERT_OK(ben->update_fields( {FieldUpdate{props, Value{int32_t(7)}, UpdateType::SET, score_key}})); - Query query = Query::from("m:MapUserFilter") + Query query = Query::match("m:MapUserFilter") .where("m.props.score", CompareOp::Eq, Value(int32_t(42))) .build(); auto result = 
db_->query(query); diff --git a/tests/where_pushdown_join_test.cpp b/tests/where_pushdown_join_test.cpp index 9fba8ce..e191452 100644 --- a/tests/where_pushdown_join_test.cpp +++ b/tests/where_pushdown_join_test.cpp @@ -84,7 +84,7 @@ TEST_F(WherePushdownJoinTest, WhereInJoin) { auto duration = std::chrono::duration_cast( end_time - start_time); create_companies({"google", "ibm", "piedpiper"}); - Query query = Query::from("u:User").build(); + Query query = Query::match("u:User").build(); auto result = db_->query(query); std::cout << result.ValueOrDie()->table()->num_rows() @@ -97,7 +97,7 @@ TEST_F(WherePushdownJoinTest, WhereInJoin) { db_->connect(i, "FRIEND", i + half).ValueOrDie(); } - query = Query::from("u:User").traverse("u", "FRIEND", "f:User").build(); + query = Query::match("u:User").traverse("u", "FRIEND", "f:User").build(); result = db_->query(query); @@ -119,7 +119,7 @@ TEST_F(WherePushdownJoinTest, WhereInJoin) { << std::endl; start_time = std::chrono::high_resolution_clock::now(); - result = db_->query(Query::from("u:User") + result = db_->query(Query::match("u:User") .traverse("u", "FRIEND", "f:User") .where("f.age", CompareOp::Gt, 50) .build()); @@ -134,7 +134,7 @@ TEST_F(WherePushdownJoinTest, WhereInJoin) { std::cout << "unoptimized size=" << unoptimized_size << std::endl; start_time = std::chrono::high_resolution_clock::now(); - result = db_->query(Query::from("u:User") + result = db_->query(Query::match("u:User") .traverse("u", "FRIEND", "f:User") .where("f.age", CompareOp::Gt, 50) .inline_where() From 453b326e254392b5a133ed05addcd9f92e3121bf Mon Sep 17 00:00:00 2001 From: dmgcodevil Date: Sun, 19 Apr 2026 23:20:54 -0400 Subject: [PATCH 2/7] experiment with pushdowns --- include/query/where_planner.hpp | 81 +++++ src/query/CMakeLists.txt | 1 + src/query/result_builder.cpp | 340 ++++++++++++++++++- src/query/where_planner.cpp | 179 ++++++++++ tests/CMakeLists.txt | 41 ++- tests/join_test.cpp | 44 ++- tests/join_where_test.cpp | 556
++++++++++++++++++++++++++++++++ tests/where_planner_test.cpp | 157 +++++++++ 8 files changed, 1394 insertions(+), 5 deletions(-) create mode 100644 include/query/where_planner.hpp create mode 100644 src/query/where_planner.cpp create mode 100644 tests/join_where_test.cpp create mode 100644 tests/where_planner_test.cpp diff --git a/include/query/where_planner.hpp b/include/query/where_planner.hpp new file mode 100644 index 0000000..c68d0a7 --- /dev/null +++ b/include/query/where_planner.hpp @@ -0,0 +1,81 @@ +#ifndef QUERY_WHERE_PLANNER_HPP +#define QUERY_WHERE_PLANNER_HPP + +#include +#include + +#include "query/query.hpp" + +namespace tundradb { + +struct QueryState; + +/** + * @brief One predicate fragment assigned to a concrete execution phase. + * + * The planner preserves the original query clause position so execution can + * keep user-written order when multiple predicates are pulled back into the + * same root/traverse site. + * + * Important: planned fragments reuse subtrees from the original WHERE AST. + * They should be treated as read-only views rather than mutated copies. + */ +struct PlannedPredicate { + size_t source_clause_index; ///< Original WHERE clause position in Query. + std::shared_ptr expr; +}; + +/** + * @brief Predicates that can be applied while executing one traverse hop. + * + * `target_filters` apply to the hop's target node alias. + * `edge_filters` apply to the hop's optional edge alias. + */ +struct TraverseWherePlan { + std::vector target_filters; + std::vector edge_filters; +}; + +/** + * @brief Full predicate execution plan derived from a prepared Query. + * + * Execution model: + * - `root_filters` run before the first traverse. + * - `traverse_filters[i]` run while executing `query_state.traversals[i]`. + * - `residual_by_clause[i]` is appended when visiting clause `i` in the + * normal clause loop and applied later on the denormalized result table. 
+ */ +struct WhereExecutionPlan { + std::vector root_filters; + std::vector traverse_filters; + std::vector> residual_by_clause; +}; + +/** + * @brief Build a safe pushdown plan for the query's WHERE clauses. + * + * Preconditions: + * - `query_state` has already been prepared via `prepare_query(...)`. + * - All aliases referenced by WHERE expressions are registered in + * `query_state`. + * + * Safe split rules: + * - A subtree that references exactly one alias is pushable as-is. + * - `AND` is decomposed recursively. + * - Mixed-alias `OR` and alias-to-alias comparisons remain residual. + */ +arrow::Result build_where_plan( + const Query& query, const QueryState& query_state); + +/** + * @brief Fold planned predicates into one logical AND expression. + * + * This is handy if an executor wants to issue one filter call per site while + * still planning predicates as individual fragments. + */ +std::shared_ptr combine_predicates_with_and( + const std::vector& predicates); + +} // namespace tundradb + +#endif // QUERY_WHERE_PLANNER_HPP diff --git a/src/query/CMakeLists.txt b/src/query/CMakeLists.txt index c379743..0ac0f09 100644 --- a/src/query/CMakeLists.txt +++ b/src/query/CMakeLists.txt @@ -6,5 +6,6 @@ target_sources(core PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/query_bootstrap.cpp ${CMAKE_CURRENT_SOURCE_DIR}/filter_executor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/traverse_executor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/where_planner.cpp ${CMAKE_CURRENT_SOURCE_DIR}/result_builder.cpp ) diff --git a/src/query/result_builder.cpp b/src/query/result_builder.cpp index 3406849..9afbfa0 100644 --- a/src/query/result_builder.cpp +++ b/src/query/result_builder.cpp @@ -22,6 +22,333 @@ namespace tundradb { +namespace { + +/// Temporary feature flag for the experimental binding-based materializer. 
+/// +/// Keep this disabled by default so query execution uses the legacy row builder +/// until the binding path covers the full join/WHERE matrix and can replace the +/// old implementation outright. +static constexpr bool kEnableBindingMaterialization = false; + +/// One partially-materialized result row plus the alias bindings that produced +/// it. +/// +/// `row` stores the actual field values accumulated so far. +/// `node_ids` records node alias -> bound node id, or `nullopt` for a LEFT +/// null-extension. +/// `edge_ids` records edge alias -> bound edge id, or `nullopt` when the edge +/// side of a LEFT traversal is absent. +struct BindingRow { + std::shared_ptr row; + std::unordered_map> node_ids; + std::unordered_map> edge_ids; +}; + +/// Returns true when every traversal in the query can use the new +/// binding-based materializer. +/// +/// At the moment that path handles INNER and LEFT joins directly. RIGHT and +/// FULL still fall back to the legacy BFS/schema-based builder. +bool supports_binding_materialization(const std::vector& traverses) { + if (!kEnableBindingMaterialization) { + return false; + } + return std::all_of(traverses.begin(), traverses.end(), [](const auto& t) { + return t.traverse_type() == TraverseType::Inner || + t.traverse_type() == TraverseType::Left; + }); +} + +/// Deep-copy a binding row so one input row can branch into multiple output +/// rows when a traversal fans out to several matches. +auto clone_binding_row(const BindingRow& binding) -> BindingRow { + BindingRow copy; + copy.row = std::make_shared(*binding.row); + copy.node_ids = binding.node_ids; + copy.edge_ids = binding.edge_ids; + return copy; +} + +/// Materialize one bound node alias into the row's cell storage. +/// +/// If the alias has projected fields in the final schema, this loads the node +/// by id and writes its values into the row. Otherwise it is a cheap no-op. 
+auto fill_node_cells(const SchemaRef& ref, int64_t node_id, BindingRow& binding, + const QueryState& query_state) -> arrow::Status { + const auto idx_it = query_state.schema_field_indices().find(ref.value()); + if (idx_it == query_state.schema_field_indices().end()) { + return arrow::Status::OK(); + } + + ARROW_ASSIGN_OR_RAISE(const auto schema_name, + query_state.resolve_schema(ref)); + ARROW_ASSIGN_OR_RAISE(const auto node, query_state.node_manager->get_node( + schema_name, node_id)); + binding.row->set_cell_from_node(idx_it->second, node, + query_state.temporal_context.get()); + return arrow::Status::OK(); +} + +/// Materialize one bound edge alias into the row's cell storage. +/// +/// If the edge alias contributes projected fields to the final schema, this +/// loads the edge by id and writes its properties into the row. Otherwise it is +/// a cheap no-op. +auto fill_edge_cells(const std::string& edge_alias, int64_t edge_id, + BindingRow& binding, const QueryState& query_state) + -> arrow::Status { + if (!query_state.edge_store) { + return arrow::Status::Invalid("Edge store not available"); + } + + const auto idx_it = query_state.schema_field_indices().find(edge_alias); + if (idx_it == query_state.schema_field_indices().end()) { + return arrow::Status::OK(); + } + + ARROW_ASSIGN_OR_RAISE(const auto edge_schema, + query_state.get_schema_for_alias(edge_alias)); + ARROW_ASSIGN_OR_RAISE(const auto edge, query_state.edge_store->get(edge_id)); + binding.row->set_cell_from_edge(idx_it->second, edge, edge_schema->fields(), + query_state.temporal_context.get()); + return arrow::Status::OK(); +} + +/// Bind one node alias for the current row. 
+/// +/// Behavior: +/// - if the alias is unbound, record the supplied id (or NULL binding) +/// - if the alias is already bound, succeed only when the new id matches +/// - when a concrete node id is accepted, materialize that node's fields into +/// the row immediately +auto bind_node_alias(BindingRow& binding, const SchemaRef& ref, + const std::optional& node_id, + const QueryState& query_state) -> arrow::Result { + const auto [it, inserted] = + binding.node_ids.try_emplace(ref.value(), node_id); + if (!inserted) { + return it->second == node_id; + } + + if (!node_id.has_value()) { + return true; + } + + ARROW_RETURN_NOT_OK(fill_node_cells(ref, *node_id, binding, query_state)); + binding.row->id = binding.row->id >= 0 ? binding.row->id : *node_id; + return true; +} + +/// Bind one edge alias for the current row. +/// +/// This mirrors `bind_node_alias(...)`, but for optional edge aliases attached +/// to a TRAVERSE. Accepted concrete edge ids are materialized into the row +/// immediately so later stages do not need to revisit the edge store. +auto bind_edge_alias(BindingRow& binding, const std::string& edge_alias, + const std::optional& edge_id, + const QueryState& query_state) -> arrow::Result { + const auto [it, inserted] = binding.edge_ids.try_emplace(edge_alias, edge_id); + if (!inserted) { + return it->second == edge_id; + } + + if (!edge_id.has_value()) { + return true; + } + + ARROW_RETURN_NOT_OK( + fill_edge_cells(edge_alias, *edge_id, binding, query_state)); + return true; +} + +/// Returns true when a recorded graph connection belongs to the given TRAVERSE +/// clause. +/// +/// Matching checks the full traverse identity: source alias, target alias, +/// edge type, and optional edge alias. 
+bool connection_matches_traverse(const GraphConnection& conn, + const Traverse& traverse) { + return conn.source.value() == traverse.source().value() && + conn.target.value() == traverse.target().value() && + conn.edge_type == traverse.edge_type() && + conn.edge_alias == traverse.edge_alias(); +} + +/// Return the currently valid connections for one bound row traversing one +/// TRAVERSE clause. +/// +/// Starting from the row's already-bound source alias, this filters the stored +/// graph connections down to those that: +/// - match the exact TRAVERSE clause +/// - still point to a live target id in `query_state.ids()[target_alias]` +/// +/// The second rule lets alias-local filtering shrink the acceptable target set +/// before row expansion runs. +auto get_live_connections_for_traverse(const BindingRow& binding, + const Traverse& traverse, + const QueryState& query_state) + -> llvm::SmallVector { + llvm::SmallVector matches; + + const auto source_it = binding.node_ids.find(traverse.source().value()); + if (source_it == binding.node_ids.end() || !source_it->second.has_value()) { + return matches; + } + + const int64_t source_id = *source_it->second; + if (!query_state.connections().contains(traverse.source().value())) { + return matches; + } + + const auto& by_source = + query_state.connections().at(traverse.source().value()); + if (!by_source.contains(source_id)) { + return matches; + } + + const auto target_ids_it = query_state.ids().find(traverse.target().value()); + if (target_ids_it == query_state.ids().end()) { + return matches; + } + + const auto& target_ids = target_ids_it->second; + for (const auto& conn : by_source.at(source_id)) { + if (!connection_matches_traverse(conn, traverse)) { + continue; + } + if (!target_ids.contains(conn.target_id)) { + continue; + } + matches.push_back(conn); + } + return matches; +} + +/// Expand one bound row through a single TRAVERSE clause. 
+/// +/// Semantics: +/// - `INNER`: one output row per surviving connection; zero connections drops +/// the row +/// - `LEFT`: one output row per surviving connection; zero connections keeps +/// the row and binds the introduced target/edge aliases to NULL +/// +/// This is the core of the binding-based materialization model: join behavior +/// is applied exactly where the TRAVERSE is processed instead of being inferred +/// later from graph connectivity. +auto expand_traverse_binding(const BindingRow& binding, + const Traverse& traverse, + const QueryState& query_state) + -> arrow::Result> { + std::vector expanded_rows; + auto live_connections = + get_live_connections_for_traverse(binding, traverse, query_state); + + if (live_connections.empty()) { + if (traverse.traverse_type() == TraverseType::Inner) { + return expanded_rows; + } + + auto null_extended = clone_binding_row(binding); + ARROW_ASSIGN_OR_RAISE(const bool target_ok, + bind_node_alias(null_extended, traverse.target(), + std::nullopt, query_state)); + if (!target_ok) { + return expanded_rows; + } + if (traverse.edge_alias().has_value()) { + ARROW_ASSIGN_OR_RAISE( + const bool edge_ok, + bind_edge_alias(null_extended, traverse.edge_alias().value(), + std::nullopt, query_state)); + if (!edge_ok) { + return expanded_rows; + } + } + expanded_rows.push_back(std::move(null_extended)); + return expanded_rows; + } + + expanded_rows.reserve(live_connections.size()); + for (const auto& conn : live_connections) { + auto next = clone_binding_row(binding); + ARROW_ASSIGN_OR_RAISE( + const bool target_ok, + bind_node_alias(next, traverse.target(), conn.target_id, query_state)); + if (!target_ok) { + continue; + } + if (traverse.edge_alias().has_value()) { + ARROW_ASSIGN_OR_RAISE(const bool edge_ok, + bind_edge_alias(next, traverse.edge_alias().value(), + conn.edge_id, query_state)); + if (!edge_ok) { + continue; + } + } + expanded_rows.push_back(std::move(next)); + } + return expanded_rows; +} + +/// Build result 
rows by expanding bound aliases in clause order. +/// +/// This keeps join semantics local to each TRAVERSE: +/// - INNER drops rows with no matching connection. +/// - LEFT preserves the existing row and null-extends the new target/edge. +/// +/// Unlike the legacy BFS/schema-based builder, this does not infer join +/// behavior from missing target IDs after the fact. +auto populate_rows_by_bindings( + const QueryState& query_state, const std::vector& traverses, + const std::shared_ptr& output_schema) + -> arrow::Result>>> { + auto rows = std::make_shared>>(); + const auto root_ids_it = query_state.ids().find(query_state.root.value()); + if (root_ids_it == query_state.ids().end()) { + return rows; + } + + std::vector bindings; + bindings.reserve(root_ids_it->second.size()); + for (const auto root_id : root_ids_it->second) { + BindingRow binding{.row = std::make_shared( + create_empty_row_from_schema(output_schema))}; + ARROW_ASSIGN_OR_RAISE( + const bool root_ok, + bind_node_alias(binding, query_state.root, root_id, query_state)); + if (!root_ok) { + continue; + } + bindings.push_back(std::move(binding)); + } + + for (const auto& traverse : traverses) { + std::vector next_bindings; + for (const auto& binding : bindings) { + ARROW_ASSIGN_OR_RAISE(auto expanded, expand_traverse_binding( + binding, traverse, query_state)); + next_bindings.insert(next_bindings.end(), + std::make_move_iterator(expanded.begin()), + std::make_move_iterator(expanded.end())); + } + bindings = std::move(next_bindings); + if (bindings.empty()) { + break; + } + } + + rows->reserve(bindings.size()); + int64_t row_id = 0; + for (auto& binding : bindings) { + binding.row->id = row_id++; + rows->push_back(std::move(binding.row)); + } + return rows; +} + +} // namespace + /// Starting from one root node, walk the prepared query graph and emit the /// denormalized row variants reachable from that node. 
arrow::Result>>> @@ -448,9 +775,16 @@ arrow::Result> Database::build_result_table( build_denormalized_schema(query_state)); IF_DEBUG_ENABLED { log_debug("output_schema={}", output_schema->ToString()); } - ARROW_ASSIGN_OR_RAISE(auto rows, - populate_rows(query.execution_config(), query_state, - query_state.traversals, output_schema)); + std::shared_ptr>> rows; + if (supports_binding_materialization(query_state.traversals)) { + ARROW_ASSIGN_OR_RAISE( + rows, populate_rows_by_bindings(query_state, query_state.traversals, + output_schema)); + } else { + ARROW_ASSIGN_OR_RAISE(rows, + populate_rows(query.execution_config(), query_state, + query_state.traversals, output_schema)); + } ARROW_ASSIGN_OR_RAISE(auto table, create_table_from_rows(rows, output_schema)); diff --git a/src/query/where_planner.cpp b/src/query/where_planner.cpp new file mode 100644 index 0000000..d429d44 --- /dev/null +++ b/src/query/where_planner.cpp @@ -0,0 +1,179 @@ +#include "query/where_planner.hpp" + +#include +#include +#include +#include + +#include "query/execution.hpp" + +namespace tundradb { + +namespace { + +/** + * @brief Output of recursively decomposing one WHERE subtree. + * + * `pushdowns` are alias-local fragments that can be scheduled earlier. + * `residual` is the part that must survive to final table filtering. + */ +struct DecomposeResult { + std::vector>> pushdowns; + std::shared_ptr residual; +}; + +/** + * @brief Earliest planner site where an alias can be filtered safely. + * + * `traverse_index == std::nullopt` means the root phase. + * Otherwise the alias is available while executing + * `traversals[traverse_index]`. 
+ */ +struct AliasActivation { + std::optional traverse_index; + AliasKind kind; +}; + +std::shared_ptr combine_with_and(std::shared_ptr left, + std::shared_ptr right) { + if (!left) return right; + if (!right) return left; + return LogicalExpr::and_expr(std::move(left), std::move(right)); +} + +void append_pushdowns( + std::vector>>& dst, + std::vector>>& src) { + dst.insert(dst.end(), std::make_move_iterator(src.begin()), + std::make_move_iterator(src.end())); +} + +/** + * @brief Map each alias to the earliest execution site where it exists. + * + * The root alias is available before traversals start. + * A target alias and optional edge alias become available at the traverse that + * introduces them. We intentionally do not place source aliases here because + * they must already be available from an earlier root/target binding. + */ +std::unordered_map build_alias_activation_map( + const Query& query, const QueryState& query_state) { + std::unordered_map activation; + activation.emplace(query.root().value(), + AliasActivation{std::nullopt, AliasKind::Node}); + + for (size_t traverse_index = 0; + traverse_index < query_state.traversals.size(); ++traverse_index) { + const auto& traverse = query_state.traversals[traverse_index]; + activation.try_emplace(traverse.target().value(), + AliasActivation{traverse_index, AliasKind::Node}); + if (traverse.edge_alias().has_value()) { + activation.try_emplace(traverse.edge_alias().value(), + AliasActivation{traverse_index, AliasKind::Edge}); + } + } + + return activation; +} + +/** + * @brief Recursively split a WHERE subtree into pushable and residual parts. 
+ * + * Safe rules: + * - single-alias subtree: push whole subtree + * - AND: recurse into both children + * - everything else: keep residual + */ +DecomposeResult decompose_where(const std::shared_ptr& expr) { + if (!expr) return {}; + + const auto& vars = expr->get_all_variables(); + if (vars.size() == 1) { + return {{{*vars.begin(), expr}}, nullptr}; + } + + auto logical = std::dynamic_pointer_cast(expr); + if (logical && logical->op() == LogicalOp::AND) { + auto left = decompose_where(logical->left()); + auto right = decompose_where(logical->right()); + + DecomposeResult out; + out.pushdowns.reserve(left.pushdowns.size() + right.pushdowns.size()); + append_pushdowns(out.pushdowns, left.pushdowns); + append_pushdowns(out.pushdowns, right.pushdowns); + out.residual = + combine_with_and(std::move(left.residual), std::move(right.residual)); + return out; + } + + return {{}, expr}; +} + +arrow::Status append_pushdown(WhereExecutionPlan& plan, + const AliasActivation& activation, + PlannedPredicate predicate) { + if (!activation.traverse_index.has_value()) { + if (activation.kind != AliasKind::Node) { + return arrow::Status::Invalid("Root phase cannot host edge predicates"); + } + plan.root_filters.push_back(std::move(predicate)); + return arrow::Status::OK(); + } + + auto& traverse_plan = plan.traverse_filters[*activation.traverse_index]; + if (activation.kind == AliasKind::Edge) { + traverse_plan.edge_filters.push_back(std::move(predicate)); + } else { + traverse_plan.target_filters.push_back(std::move(predicate)); + } + return arrow::Status::OK(); +} + +} // namespace + +std::shared_ptr combine_predicates_with_and( + const std::vector& predicates) { + std::shared_ptr combined; + for (const auto& predicate : predicates) { + combined = combine_with_and(std::move(combined), predicate.expr); + } + return combined; +} + +arrow::Result build_where_plan( + const Query& query, const QueryState& query_state) { + WhereExecutionPlan plan; + 
plan.traverse_filters.resize(query_state.traversals.size()); + plan.residual_by_clause.resize(query.clauses().size()); + + auto activation = build_alias_activation_map(query, query_state); + + for (size_t clause_index = 0; clause_index < query.clauses().size(); + ++clause_index) { + const auto& clause = query.clauses()[clause_index]; + if (clause->type() != Clause::Type::WHERE) continue; + + auto where = std::dynamic_pointer_cast(clause); + if (!where) { + return arrow::Status::Invalid("Clause ", clause_index, + " is WHERE but not a WhereExpr"); + } + + auto parts = decompose_where(where); + for (auto& [alias, expr] : parts.pushdowns) { + auto it = activation.find(alias); + if (it == activation.end()) { + return arrow::Status::KeyError( + "Alias '", alias, "' is not registered for WHERE pushdown"); + } + + ARROW_RETURN_NOT_OK(append_pushdown( + plan, it->second, PlannedPredicate{clause_index, std::move(expr)})); + } + plan.residual_by_clause[clause_index] = std::move(parts.residual); + } + + return plan; +} + +} // namespace tundradb diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 70d70af..d242756 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -104,6 +104,10 @@ add_executable(join_test join_test.cpp ) +add_executable(join_where_test + join_where_test.cpp +) + # Add benchmark test add_executable(benchmark_test benchmark_test.cpp @@ -116,6 +120,9 @@ add_executable(where_pushdown_join_test add_executable(where_expression_test where_expression_test.cpp) +add_executable(where_planner_test + where_planner_test.cpp) + add_executable(memory_arena_test memory_arena_test.cpp) @@ -288,6 +295,18 @@ target_link_libraries(join_test LLVMSupport LLVMCore ) +target_link_libraries(join_where_test + PRIVATE + core + Arrow::arrow_shared + Parquet::parquet_shared + GTest::GTest + GTest::Main + pthread + TBB::tbb + LLVMSupport LLVMCore +) + # Link benchmark test with Google Benchmark and other dependencies target_link_libraries(benchmark_test PRIVATE @@ 
-327,6 +346,17 @@ target_link_libraries(where_expression_test LLVMSupport LLVMCore ) +target_link_libraries(where_planner_test + PRIVATE + core + Arrow::arrow_shared + GTest::GTest + GTest::Main + pthread + TBB::tbb + LLVMSupport LLVMCore +) + target_link_libraries(memory_arena_test PRIVATE GTest::GTest @@ -539,9 +569,15 @@ if(ENABLE_SANITIZERS) target_compile_options(join_test PRIVATE ${SANITIZER_COMPILE_FLAGS}) target_link_options(join_test PRIVATE ${SANITIZER_LINK_FLAGS}) + + target_compile_options(join_where_test PRIVATE ${SANITIZER_COMPILE_FLAGS}) + target_link_options(join_where_test PRIVATE ${SANITIZER_LINK_FLAGS}) target_compile_options(where_expression_test PRIVATE ${SANITIZER_COMPILE_FLAGS}) target_link_options(where_expression_test PRIVATE ${SANITIZER_LINK_FLAGS}) + + target_compile_options(where_planner_test PRIVATE ${SANITIZER_COMPILE_FLAGS}) + target_link_options(where_planner_test PRIVATE ${SANITIZER_LINK_FLAGS}) target_compile_options(free_list_arena_test PRIVATE ${SANITIZER_COMPILE_FLAGS}) target_link_options(free_list_arena_test PRIVATE ${SANITIZER_LINK_FLAGS}) @@ -587,7 +623,9 @@ add_test(NAME TableInfoTest COMMAND table_info_test) add_test(NAME SchemaUtilsTest COMMAND schema_utils_test) add_test(NAME DatabaseTest COMMAND database_test) add_test(NAME JoinTest COMMAND join_test) +add_test(NAME JoinWhereTest COMMAND join_where_test) add_test(NAME WhereExpressionTest COMMAND where_expression_test) +add_test(NAME WherePlannerTest COMMAND where_planner_test) add_test(NAME MemoryArenaTest COMMAND memory_arena_test) add_test(NAME FreeListArenaTest COMMAND free_list_arena_test) add_test(NAME NodeArenaTest COMMAND node_arena_test) @@ -626,6 +664,7 @@ set_tests_properties( SchemaUtilsTest DatabaseTest JoinTest + JoinWhereTest WhereExpressionTest FreeListArenaTest UpdateQueryTest @@ -633,4 +672,4 @@ set_tests_properties( TypeSystemTest PROPERTIES ISOLATED TRUE # This ensures tests run in isolation -) \ No newline at end of file +) diff --git 
a/tests/join_test.cpp b/tests/join_test.cpp index f7d156f..3c501ec 100644 --- a/tests/join_test.cpp +++ b/tests/join_test.cpp @@ -688,6 +688,48 @@ TEST(JoinTest, LeftJoin) { << "Expected NULL for c.size in jeff's row"; } +TEST(JoinTest, LeftJoinTargetWhereFiltersFinalRows) { + auto db = setup_test_db(); + db->connect(0, "friend", 1).ValueOrDie(); // alex -> bob + db->connect(0, "friend", 2).ValueOrDie(); // alex -> jeff + db->connect(1, "works-at", 1).ValueOrDie(); // bob -> google + db->connect(2, "works-at", 0).ValueOrDie(); // jeff -> ibm + + Query query = + Query::match("u:users") + .traverse("u", "friend", "f:users", TraverseType::Inner) + .traverse("f", "works-at", "c:companies", TraverseType::Left) + .where("c.name", CompareOp::Eq, Value("google")) + .build(); + + auto query_result = db->query(query); + ASSERT_TRUE(query_result.ok()); + auto result_table = query_result.ValueOrDie()->table(); + ASSERT_NE(result_table, nullptr); + ASSERT_EQ(result_table->num_rows(), 1); + + auto friend_name_col = result_table->GetColumnByName("f.name"); + auto company_name_col = result_table->GetColumnByName("c.name"); + ASSERT_NE(friend_name_col, nullptr); + ASSERT_NE(company_name_col, nullptr); + + int bob_index = -1; + for (int64_t i = 0; i < result_table->num_rows(); ++i) { + auto friend_name_scalar = std::static_pointer_cast( + friend_name_col->GetScalar(i).ValueOrDie()); + if (friend_name_scalar->view() == "bob") { + bob_index = static_cast(i); + } + } + + ASSERT_NE(bob_index, -1); + + auto bob_company = std::static_pointer_cast( + company_name_col->GetScalar(bob_index).ValueOrDie()); + ASSERT_TRUE(bob_company->is_valid); + EXPECT_EQ(bob_company->ToString(), "google"); +} + TEST(JoinTest, RightJoin) { auto db = setup_test_db(); // Create relationships where some targets don't have matching sources @@ -1835,4 +1877,4 @@ int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); // Logger::get_instance().set_level(LogLevel::DEBUG); return RUN_ALL_TESTS(); -} 
\ No newline at end of file +} diff --git a/tests/join_where_test.cpp b/tests/join_where_test.cpp new file mode 100644 index 0000000..1ab3511 --- /dev/null +++ b/tests/join_where_test.cpp @@ -0,0 +1,556 @@ +#include + +#include +#include +#include +#include +#include +#include + +#include "common/logger.hpp" +#include "main/database.hpp" +#include "query/query.hpp" +#include "storage/metadata.hpp" + +using namespace std::string_literals; +using namespace tundradb; + +namespace tundradb { + +namespace { + +std::shared_ptr create_users_schema() { + return arrow::schema({arrow::field("name", arrow::utf8()), + arrow::field("age", arrow::int64())}); +} + +std::shared_ptr create_companies_schema() { + return arrow::schema({arrow::field("name", arrow::utf8()), + arrow::field("size", arrow::int64())}); +} + +std::string scalar_to_test_string( + const std::shared_ptr& scalar) { + if (!scalar || !scalar->is_valid) { + return "NULL"; + } + return scalar->ToString(); +} + +std::string table_to_test_string(const std::shared_ptr& table) { + std::vector column_names; + column_names.reserve(table->schema()->num_fields()); + for (const auto& field : table->schema()->fields()) { + column_names.push_back(field->name()); + } + + std::vector rows; + rows.reserve(table->num_rows()); + for (int64_t row_index = 0; row_index < table->num_rows(); ++row_index) { + std::ostringstream row; + for (size_t column_index = 0; column_index < column_names.size(); + ++column_index) { + if (column_index > 0) { + row << " | "; + } + + auto column = table->GetColumnByName(column_names[column_index]); + auto scalar_result = column->GetScalar(row_index); + EXPECT_TRUE(scalar_result.ok()) << scalar_result.status().ToString(); + if (!scalar_result.ok()) { + row << ""; + continue; + } + row << scalar_to_test_string(scalar_result.ValueOrDie()); + } + rows.push_back(row.str()); + } + + std::sort(rows.begin(), rows.end()); + + std::ostringstream out; + for (size_t i = 0; i < column_names.size(); ++i) { + if (i > 
0) { + out << " | "; + } + out << column_names[i]; + } + for (const auto& row : rows) { + out << '\n' << row; + } + return out.str(); +} + +void create_user(const std::shared_ptr& db, const std::string& name, + int64_t age) { + db->create_node("users", {{"name", Value{name}}, {"age", Value{age}}}) + .ValueOrDie(); +} + +void create_company(const std::shared_ptr& db, + const std::string& name, int64_t size) { + db->create_node("companies", {{"name", Value{name}}, {"size", Value{size}}}) + .ValueOrDie(); +} + +} // namespace + +class JoinWhereTest : public ::testing::Test { + protected: + void SetUp() override { + auto db_path = "join_where_test_db_" + std::to_string(now_millis()); + auto config = make_config() + .with_db_path(db_path) + .with_shard_capacity(1000) + .with_chunk_size(1000) + .build(); + + db_ = std::make_shared(config); + db_->get_schema_registry() + ->create("users", create_users_schema()) + .ValueOrDie(); + db_->get_schema_registry() + ->create("companies", create_companies_schema()) + .ValueOrDie(); + + create_user(db_, "alex", 25); // users(0) + create_user(db_, "bob", 31); // users(1) + create_user(db_, "jeff", 33); // users(2) + + create_company(db_, "google", 3000); // companies(0) + create_company(db_, "acme", 1200); // companies(1) + create_company(db_, "meta", 900); // companies(2) + + db_->connect(0, "works-at", 0).ValueOrDie(); // alex -> google + db_->connect(1, "works-at", 1).ValueOrDie(); // bob -> acme + + db_->connect(0, "friend", 1).ValueOrDie(); // alex -> bob + db_->connect(0, "friend", 2).ValueOrDie(); // alex -> jeff + } + + void expect_query_output(const Query& query, const std::string& query_text, + const std::string& expected_table) { + auto result = db_->query(query); + ASSERT_TRUE(result.ok()) << result.status().ToString(); + auto actual = table_to_test_string(result.ValueOrDie()->table()); + EXPECT_EQ(actual, expected_table) << "Query:\n" << query_text; + } + + std::shared_ptr db_; +}; + +/* +TundraQL query: +MATCH 
(u:users)-[:works-at INNER]->(c:companies) +WHERE c.name = "google" +SELECT u.name, c.name; + +Expected output table: +u.name | c.name +alex | google +*/ +TEST_F(JoinWhereTest, InnerJoinTargetWhereFiltersMatchedRows) { + const std::string query_text = + "MATCH (u:users)-[:works-at INNER]->(c:companies)\n" + "WHERE c.name = \"google\"\n" + "SELECT u.name, c.name;"; + const std::string expected_table = + "u.name | c.name\n" + "alex | google"; + + auto query = + Query::match("u:users") + .traverse("u", "works-at", "c:companies", TraverseType::Inner) + .where("c.name", CompareOp::Eq, Value("google"s)) + .select({"u.name", "c.name"}) + .build(); + + expect_query_output(query, query_text, expected_table); +} + +/* +TundraQL query: +MATCH (u:users)-[:works-at INNER]->(c:companies) +WHERE u.name = "alex" +SELECT u.name, c.name; + +Expected output table: +u.name | c.name +alex | google +*/ +TEST_F(JoinWhereTest, InnerJoinSourceWhereFiltersBeforeTraverse) { + const std::string query_text = + "MATCH (u:users)-[:works-at INNER]->(c:companies)\n" + "WHERE u.name = \"alex\"\n" + "SELECT u.name, c.name;"; + const std::string expected_table = + "u.name | c.name\n" + "alex | google"; + + auto query = + Query::match("u:users") + .traverse("u", "works-at", "c:companies", TraverseType::Inner) + .where("u.name", CompareOp::Eq, Value("alex"s)) + .select({"u.name", "c.name"}) + .build(); + + expect_query_output(query, query_text, expected_table); +} + +/* +TundraQL query: +MATCH (u:users)-[:works-at INNER]->(c:companies) +WHERE u.name = "alex" AND c.name = "google" +SELECT u.name, c.name; + +Expected output table: +u.name | c.name +alex | google +*/ +TEST_F(JoinWhereTest, InnerJoinSourceAndTargetWhereMatchesSingleRow) { + const std::string query_text = + "MATCH (u:users)-[:works-at INNER]->(c:companies)\n" + "WHERE u.name = \"alex\" AND c.name = \"google\"\n" + "SELECT u.name, c.name;"; + const std::string expected_table = + "u.name | c.name\n" + "alex | google"; + + auto query = + 
Query::match("u:users") + .traverse("u", "works-at", "c:companies", TraverseType::Inner) + .where("u.name", CompareOp::Eq, Value("alex"s)) + .and_where("c.name", CompareOp::Eq, Value("google"s)) + .select({"u.name", "c.name"}) + .build(); + + expect_query_output(query, query_text, expected_table); +} + +/* +TundraQL query: +MATCH (u:users)-[:works-at LEFT]->(c:companies) +WHERE u.name = "jeff" +SELECT u.name, c.name; + +Expected output table: +u.name | c.name +jeff | NULL +*/ +TEST_F(JoinWhereTest, LeftJoinSourceWhereKeepsNullExtendedSourceRow) { + const std::string query_text = + "MATCH (u:users)-[:works-at LEFT]->(c:companies)\n" + "WHERE u.name = \"jeff\"\n" + "SELECT u.name, c.name;"; + const std::string expected_table = + "u.name | c.name\n" + "jeff | NULL"; + + auto query = Query::match("u:users") + .traverse("u", "works-at", "c:companies", TraverseType::Left) + .where("u.name", CompareOp::Eq, Value("jeff"s)) + .select({"u.name", "c.name"}) + .build(); + + expect_query_output(query, query_text, expected_table); +} + +/* +TundraQL query: +MATCH (u:users)-[:works-at LEFT]->(c:companies) +WHERE u.name = "alex" AND c.name = "google" +SELECT u.name, c.name; + +Expected output table: +u.name | c.name +alex | google +*/ +TEST_F(JoinWhereTest, LeftJoinSourceAndTargetWhereMatchesQualifiedRow) { + const std::string query_text = + "MATCH (u:users)-[:works-at LEFT]->(c:companies)\n" + "WHERE u.name = \"alex\" AND c.name = \"google\"\n" + "SELECT u.name, c.name;"; + const std::string expected_table = + "u.name | c.name\n" + "alex | google"; + + auto query = Query::match("u:users") + .traverse("u", "works-at", "c:companies", TraverseType::Left) + .where("u.name", CompareOp::Eq, Value("alex"s)) + .and_where("c.name", CompareOp::Eq, Value("google"s)) + .select({"u.name", "c.name"}) + .build(); + + expect_query_output(query, query_text, expected_table); +} + +/* +TundraQL query: +MATCH (u:users)-[:works-at LEFT]->(c:companies) +WHERE c.name = "google" OR u.name = "jeff" 
+SELECT u.name, c.name; + +Expected output table: +u.name | c.name +alex | google +jeff | NULL +*/ +TEST_F(JoinWhereTest, LeftJoinMixedAliasOrRemainsResidual) { + const std::string query_text = + "MATCH (u:users)-[:works-at LEFT]->(c:companies)\n" + "WHERE c.name = \"google\" OR u.name = \"jeff\"\n" + "SELECT u.name, c.name;"; + const std::string expected_table = + "u.name | c.name\n" + "alex | google\n" + "jeff | NULL"; + + auto query = Query::match("u:users") + .traverse("u", "works-at", "c:companies", TraverseType::Left) + .where("c.name", CompareOp::Eq, Value("google"s)) + .or_where("u.name", CompareOp::Eq, Value("jeff"s)) + .select({"u.name", "c.name"}) + .build(); + + expect_query_output(query, query_text, expected_table); +} + +/* +TundraQL query: +MATCH (u:users)-[:works-at LEFT]->(c:companies) +WHERE c.name = "google" +SELECT u.name, c.name; + +Expected output table: +u.name | c.name +alex | google +*/ +TEST_F(JoinWhereTest, + DISABLED_LeftJoinTargetWhereShouldFilterOutNullExtendedRows) { + const std::string query_text = + "MATCH (u:users)-[:works-at LEFT]->(c:companies)\n" + "WHERE c.name = \"google\"\n" + "SELECT u.name, c.name;"; + const std::string expected_table = + "u.name | c.name\n" + "alex | google"; + + auto query = Query::match("u:users") + .traverse("u", "works-at", "c:companies", TraverseType::Left) + .where("c.name", CompareOp::Eq, Value("google"s)) + .select({"u.name", "c.name"}) + .build(); + + expect_query_output(query, query_text, expected_table); +} + +/* +TundraQL query: +MATCH (u:users)-[:works-at RIGHT]->(c:companies) +WHERE u.name = "alex" +SELECT u.name, c.name; + +Expected output table: +u.name | c.name +alex | google +*/ +TEST_F(JoinWhereTest, DISABLED_RightJoinSourceWhereDropsNullSourceRows) { + const std::string query_text = + "MATCH (u:users)-[:works-at RIGHT]->(c:companies)\n" + "WHERE u.name = \"alex\"\n" + "SELECT u.name, c.name;"; + const std::string expected_table = + "u.name | c.name\n" + "alex | google"; + + auto query = 
+ Query::match("u:users") + .traverse("u", "works-at", "c:companies", TraverseType::Right) + .where("u.name", CompareOp::Eq, Value("alex"s)) + .select({"u.name", "c.name"}) + .build(); + + expect_query_output(query, query_text, expected_table); +} + +/* +TundraQL query: +MATCH (u:users)-[:works-at RIGHT]->(c:companies) +WHERE c.name = "meta" +SELECT u.name, c.name; + +Expected output table: +u.name | c.name +NULL | meta +*/ +TEST_F(JoinWhereTest, RightJoinTargetWhereKeepsUnmatchedTargetRow) { + const std::string query_text = + "MATCH (u:users)-[:works-at RIGHT]->(c:companies)\n" + "WHERE c.name = \"meta\"\n" + "SELECT u.name, c.name;"; + const std::string expected_table = + "u.name | c.name\n" + "NULL | meta"; + + auto query = + Query::match("u:users") + .traverse("u", "works-at", "c:companies", TraverseType::Right) + .where("c.name", CompareOp::Eq, Value("meta"s)) + .select({"u.name", "c.name"}) + .build(); + + expect_query_output(query, query_text, expected_table); +} + +/* +TundraQL query: +MATCH (u:users)-[:works-at FULL]->(c:companies) +WHERE u.name = "jeff" +SELECT u.name, c.name; + +Expected output table: +u.name | c.name +jeff | NULL +*/ +TEST_F(JoinWhereTest, DISABLED_FullJoinSourceWhereKeepsUnmatchedSourceRow) { + const std::string query_text = + "MATCH (u:users)-[:works-at FULL]->(c:companies)\n" + "WHERE u.name = \"jeff\"\n" + "SELECT u.name, c.name;"; + const std::string expected_table = + "u.name | c.name\n" + "jeff | NULL"; + + auto query = Query::match("u:users") + .traverse("u", "works-at", "c:companies", TraverseType::Full) + .where("u.name", CompareOp::Eq, Value("jeff"s)) + .select({"u.name", "c.name"}) + .build(); + + expect_query_output(query, query_text, expected_table); +} + +/* +TundraQL query: +MATCH (u:users)-[:works-at FULL]->(c:companies) +WHERE c.name = "meta" +SELECT u.name, c.name; + +Expected output table: +u.name | c.name +NULL | meta +*/ +TEST_F(JoinWhereTest, DISABLED_FullJoinTargetWhereKeepsUnmatchedTargetRow) { + const 
std::string query_text = + "MATCH (u:users)-[:works-at FULL]->(c:companies)\n" + "WHERE c.name = \"meta\"\n" + "SELECT u.name, c.name;"; + const std::string expected_table = + "u.name | c.name\n" + "NULL | meta"; + + auto query = Query::match("u:users") + .traverse("u", "works-at", "c:companies", TraverseType::Full) + .where("c.name", CompareOp::Eq, Value("meta"s)) + .select({"u.name", "c.name"}) + .build(); + + expect_query_output(query, query_text, expected_table); +} + +/* +TundraQL query: +MATCH (u:users)-[:friend INNER]->(f:users)-[:works-at INNER]->(c:companies) +WHERE f.name = "bob" AND c.name = "acme" +SELECT u.name, f.name, c.name; + +Expected output table: +u.name | f.name | c.name +alex | bob | acme +*/ +TEST_F(JoinWhereTest, TwoHopInnerJoinMiddleAndTargetWhere) { + const std::string query_text = + "MATCH (u:users)-[:friend INNER]->(f:users)-[:works-at INNER]->" + "(c:companies)\n" + "WHERE f.name = \"bob\" AND c.name = \"acme\"\n" + "SELECT u.name, f.name, c.name;"; + const std::string expected_table = + "u.name | f.name | c.name\n" + "alex | bob | acme"; + + auto query = + Query::match("u:users") + .traverse("u", "friend", "f:users", TraverseType::Inner) + .traverse("f", "works-at", "c:companies", TraverseType::Inner) + .where("f.name", CompareOp::Eq, Value("bob"s)) + .and_where("c.name", CompareOp::Eq, Value("acme"s)) + .select({"u.name", "f.name", "c.name"}) + .build(); + + expect_query_output(query, query_text, expected_table); +} + +/* +TundraQL query: +MATCH (u:users)-[:friend INNER]->(f:users)-[:works-at LEFT]->(c:companies) +WHERE f.name = "jeff" +SELECT u.name, f.name, c.name; + +Expected output table: +u.name | f.name | c.name +alex | jeff | NULL +*/ +TEST_F(JoinWhereTest, + DISABLED_TwoHopLeftJoinMiddleWhereKeepsNullExtendedTargetRow) { + const std::string query_text = + "MATCH (u:users)-[:friend INNER]->(f:users)-[:works-at LEFT]->" + "(c:companies)\n" + "WHERE f.name = \"jeff\"\n" + "SELECT u.name, f.name, c.name;"; + const std::string 
expected_table = + "u.name | f.name | c.name\n" + "alex | jeff | NULL"; + + auto query = Query::match("u:users") + .traverse("u", "friend", "f:users", TraverseType::Inner) + .traverse("f", "works-at", "c:companies", TraverseType::Left) + .where("f.name", CompareOp::Eq, Value("jeff"s)) + .select({"u.name", "f.name", "c.name"}) + .build(); + + expect_query_output(query, query_text, expected_table); +} + +/* +TundraQL query: +MATCH (u:users)-[:friend INNER]->(f:users)-[:works-at LEFT]->(c:companies) +WHERE u.name = "alex" AND f.name = "bob" AND c.name = "acme" +SELECT u.name, f.name, c.name; + +Expected output table: +u.name | f.name | c.name +alex | bob | acme +*/ +TEST_F(JoinWhereTest, TwoHopLeftJoinRootMiddleAndTargetWhere) { + const std::string query_text = + "MATCH (u:users)-[:friend INNER]->(f:users)-[:works-at LEFT]->" + "(c:companies)\n" + "WHERE u.name = \"alex\" AND f.name = \"bob\" AND c.name = \"acme\"\n" + "SELECT u.name, f.name, c.name;"; + const std::string expected_table = + "u.name | f.name | c.name\n" + "alex | bob | acme"; + + auto query = Query::match("u:users") + .traverse("u", "friend", "f:users", TraverseType::Inner) + .traverse("f", "works-at", "c:companies", TraverseType::Left) + .where("u.name", CompareOp::Eq, Value("alex"s)) + .and_where("f.name", CompareOp::Eq, Value("bob"s)) + .and_where("c.name", CompareOp::Eq, Value("acme"s)) + .select({"u.name", "f.name", "c.name"}) + .build(); + + expect_query_output(query, query_text, expected_table); +} + +} // namespace tundradb diff --git a/tests/where_planner_test.cpp b/tests/where_planner_test.cpp new file mode 100644 index 0000000..6e3e18c --- /dev/null +++ b/tests/where_planner_test.cpp @@ -0,0 +1,157 @@ +#include "query/where_planner.hpp" + +#include + +#include +#include +#include + +#include "core/edge_store.hpp" +#include "query/execution.hpp" +#include "schema/schema.hpp" + +using namespace std::string_literals; +using namespace tundradb; + +namespace tundradb { + +class WherePlannerTest : 
public ::testing::Test { + protected: + void SetUp() override { + registry_ = std::make_shared(); + + auto user_res = registry_->create( + "User", arrow::schema({arrow::field("x", arrow::int32())})); + ASSERT_TRUE(user_res.ok()) << user_res.status().ToString(); + + auto company_res = registry_->create( + "Company", arrow::schema({arrow::field("z", arrow::int32()), + arrow::field("country", arrow::utf8())})); + ASSERT_TRUE(company_res.ok()) << company_res.status().ToString(); + + auto region_res = registry_->create( + "Region", arrow::schema({arrow::field("kind", arrow::utf8())})); + ASSERT_TRUE(region_res.ok()) << region_res.status().ToString(); + + edge_store_ = std::make_shared(0); + + auto works_at_res = edge_store_->register_edge_schema( + "WORKS_AT", {std::make_shared("y", ValueType::INT32)}); + ASSERT_TRUE(works_at_res.ok()) << works_at_res.status().ToString(); + + auto located_in_res = edge_store_->register_edge_schema( + "LOCATED_IN", {std::make_shared("weight", ValueType::INT32)}); + ASSERT_TRUE(located_in_res.ok()) << located_in_res.status().ToString(); + } + + void prepare_state(const Query& query, QueryState& state) const { + state.edge_store = edge_store_; + + auto status = prepare_query(query, state); + EXPECT_TRUE(status.ok()) << status.ToString(); + } + + std::shared_ptr registry_; + std::shared_ptr edge_store_; +}; + +TEST_F(WherePlannerTest, SplitsAndAcrossRootTargetAndEdge) { + auto query = Query::match("u:User") + .traverse("u", "WORKS_AT", "c:Company", TraverseType::Inner, + std::optional{"e"}) + .where("u.x", CompareOp::Eq, Value(int32_t(1))) + .and_where("e.y", CompareOp::Eq, Value(int32_t(2))) + .and_where("c.z", CompareOp::Eq, Value(int32_t(3))) + .build(); + + QueryState state(registry_); + prepare_state(query, state); + auto plan_res = build_where_plan(query, state); + ASSERT_TRUE(plan_res.ok()) << plan_res.status().ToString(); + const auto& plan = plan_res.ValueOrDie(); + + ASSERT_EQ(plan.root_filters.size(), 1u); + 
EXPECT_EQ(plan.root_filters[0].source_clause_index, 1u); + EXPECT_EQ(plan.root_filters[0].expr->extract_first_variable(), "u"); + + ASSERT_EQ(plan.traverse_filters.size(), 1u); + ASSERT_EQ(plan.traverse_filters[0].target_filters.size(), 1u); + ASSERT_EQ(plan.traverse_filters[0].edge_filters.size(), 1u); + EXPECT_EQ(plan.traverse_filters[0].target_filters[0].source_clause_index, 1u); + EXPECT_EQ(plan.traverse_filters[0].edge_filters[0].source_clause_index, 1u); + EXPECT_EQ( + plan.traverse_filters[0].target_filters[0].expr->extract_first_variable(), + "c"); + EXPECT_EQ( + plan.traverse_filters[0].edge_filters[0].expr->extract_first_variable(), + "e"); + + ASSERT_EQ(plan.residual_by_clause.size(), 2u); + EXPECT_EQ(plan.residual_by_clause[1], nullptr); +} + +TEST_F(WherePlannerTest, PullsLaterAliasFiltersBackToEarliestTraverseSlot) { + auto query = Query::match("u:User") + .traverse("u", "WORKS_AT", "c:Company") + .where("c.z", CompareOp::Eq, Value(int32_t(3))) + .traverse("c", "LOCATED_IN", "r:Region") + .where("c.country", CompareOp::Eq, Value("US"s)) + .and_where("r.kind", CompareOp::Eq, Value("hq"s)) + .build(); + + QueryState state(registry_); + prepare_state(query, state); + auto plan_res = build_where_plan(query, state); + ASSERT_TRUE(plan_res.ok()) << plan_res.status().ToString(); + const auto& plan = plan_res.ValueOrDie(); + + ASSERT_EQ(plan.traverse_filters.size(), 2u); + ASSERT_EQ(plan.traverse_filters[0].target_filters.size(), 2u); + EXPECT_EQ(plan.traverse_filters[0].target_filters[0].source_clause_index, 1u); + EXPECT_EQ(plan.traverse_filters[0].target_filters[1].source_clause_index, 3u); + EXPECT_EQ( + plan.traverse_filters[0].target_filters[0].expr->extract_first_variable(), + "c"); + EXPECT_EQ( + plan.traverse_filters[0].target_filters[1].expr->extract_first_variable(), + "c"); + + ASSERT_EQ(plan.traverse_filters[1].target_filters.size(), 1u); + EXPECT_EQ(plan.traverse_filters[1].target_filters[0].source_clause_index, 3u); + EXPECT_EQ( + 
plan.traverse_filters[1].target_filters[0].expr->extract_first_variable(), + "r"); + + ASSERT_EQ(plan.residual_by_clause.size(), 4u); + EXPECT_EQ(plan.residual_by_clause[1], nullptr); + EXPECT_EQ(plan.residual_by_clause[3], nullptr); +} + +TEST_F(WherePlannerTest, KeepsMixedAliasOrAsResidual) { + auto expr = LogicalExpr::or_expr( + std::make_shared("u.x", CompareOp::Eq, Value(int32_t(1))), + std::make_shared("c.z", CompareOp::Eq, + Value(int32_t(3)))); + + auto query = Query::match("u:User") + .traverse("u", "WORKS_AT", "c:Company") + .where_logical_expr(expr) + .build(); + + QueryState state(registry_); + prepare_state(query, state); + auto plan_res = build_where_plan(query, state); + ASSERT_TRUE(plan_res.ok()) << plan_res.status().ToString(); + const auto& plan = plan_res.ValueOrDie(); + + EXPECT_TRUE(plan.root_filters.empty()); + ASSERT_EQ(plan.traverse_filters.size(), 1u); + EXPECT_TRUE(plan.traverse_filters[0].target_filters.empty()); + EXPECT_TRUE(plan.traverse_filters[0].edge_filters.empty()); + + ASSERT_EQ(plan.residual_by_clause.size(), 2u); + ASSERT_NE(plan.residual_by_clause[1], nullptr); + EXPECT_EQ(plan.residual_by_clause[1]->get_all_variables().size(), 2u); +} + +} // namespace tundradb From 74f36942d283a9a1b32898f23b180d3d12063aa7 Mon Sep 17 00:00:00 2001 From: dmgcodevil Date: Tue, 21 Apr 2026 08:58:08 -0400 Subject: [PATCH 3/7] experiment with pushdowns 2 --- include/main/database.hpp | 3 +- include/query/execution.hpp | 2 + include/query/query.hpp | 6 +++ src/main/database.cpp | 23 +++++++-- src/query/query_bootstrap.cpp | 46 +++++++++++++++++- src/query/result_builder.cpp | 2 + src/query/traverse_executor.cpp | 80 ++++++++++++++++++++++++------- src/query/where_planner.cpp | 84 +++++++++++++++++++++++++++++++++ tests/join_test.cpp | 2 + tests/join_where_test.cpp | 26 +++++++--- tests/where_expression_test.cpp | 9 ++-- 11 files changed, 249 insertions(+), 34 deletions(-) diff --git a/include/main/database.hpp b/include/main/database.hpp index 
fa9bf7b..acc6d30 100644 --- a/include/main/database.hpp +++ b/include/main/database.hpp @@ -196,7 +196,8 @@ class Database { /** Execute a single TRAVERSE clause, updating query_state in-place. */ [[nodiscard]] arrow::Status execute_traverse( const std::shared_ptr &traverse, QueryState &query_state, - const Query &query, size_t clause_index, QueryResult &result) const; + const Query &query, size_t clause_index, size_t traverse_index, + QueryResult &result) const; /** Apply a single-variable WHERE filter, or defer to post_where. */ [[nodiscard]] arrow::Status apply_where_filter( diff --git a/include/query/execution.hpp b/include/query/execution.hpp index d10996e..9178567 100644 --- a/include/query/execution.hpp +++ b/include/query/execution.hpp @@ -19,6 +19,7 @@ #include #include "query/query.hpp" +#include "query/where_planner.hpp" #include "schema/schema.hpp" namespace tundradb { @@ -450,6 +451,7 @@ struct QueryState { SchemaRef root; ///< Root schema for query execution. std::vector traversals; ///< Traverse clauses in query order. + std::optional where_plan; ///< Planned WHERE execution. std::shared_ptr schema_registry; ///< Node schema registry. std::shared_ptr node_manager; ///< Node storage. diff --git a/include/query/query.hpp b/include/query/query.hpp index 70e2539..bb15d9a 100644 --- a/include/query/query.hpp +++ b/include/query/query.hpp @@ -694,8 +694,14 @@ struct QueryExecutionStats { int num_edges_traversed = 0; int num_where_clauses_inlined = 0; int num_where_clauses_post_processed = 0; + int num_where_predicates_pushed_to_root = 0; + int num_where_predicates_pushed_to_traverse = 0; + int num_where_predicates_deferred = 0; std::vector inlined_conditions; // For debugging std::vector post_processed_conditions; // For debugging + std::vector root_pushdown_conditions; + std::vector traverse_pushdown_conditions; + std::vector deferred_conditions; }; /** @brief Holds the output Arrow table and execution statistics from a query. 
diff --git a/src/main/database.cpp b/src/main/database.cpp index 0458b2a..c7a8a13 100644 --- a/src/main/database.cpp +++ b/src/main/database.cpp @@ -222,18 +222,31 @@ arrow::Result>> Database::execute_clauses(const Query& query, QueryState& query_state, QueryResult& result) const { std::vector> post_where; + size_t traverse_index = 0; for (size_t i = 0; i < query.clauses().size(); ++i) { auto clause = query.clauses()[i]; switch (clause->type()) { - case Clause::Type::WHERE: - ARROW_RETURN_NOT_OK( - apply_where_filter(std::dynamic_pointer_cast(clause), - query_state, post_where)); + case Clause::Type::WHERE: { + if (query.inline_where() && query_state.where_plan.has_value()) { + const auto residual = query_state.where_plan->residual_by_clause[i]; + if (residual) { + post_where.push_back(residual); + auto& stats = result.mutable_execution_stats(); + stats.num_where_predicates_deferred++; + stats.deferred_conditions.push_back(residual->toString()); + } + } else { + ARROW_RETURN_NOT_OK( + apply_where_filter(std::dynamic_pointer_cast(clause), + query_state, post_where)); + } break; + } case Clause::Type::TRAVERSE: ARROW_RETURN_NOT_OK( execute_traverse(std::static_pointer_cast(clause), - query_state, query, i, result)); + query_state, query, i, traverse_index, result)); + ++traverse_index; break; default: return arrow::Status::NotImplemented( diff --git a/src/query/query_bootstrap.cpp b/src/query/query_bootstrap.cpp index 22002bb..8d34ad8 100644 --- a/src/query/query_bootstrap.cpp +++ b/src/query/query_bootstrap.cpp @@ -1,8 +1,35 @@ #include "main/database.hpp" #include "query/temporal_context.hpp" +#include "query/where_planner.hpp" namespace tundradb { +namespace { + +std::vector> extract_predicates( + const std::vector& predicates) { + std::vector> exprs; + exprs.reserve(predicates.size()); + for (const auto& predicate : predicates) { + exprs.push_back(predicate.expr); + } + return exprs; +} + +void record_root_pushdowns(QueryResult& result, + const std::vector& 
predicates) { + auto& stats = result.mutable_execution_stats(); + stats.num_where_predicates_pushed_to_root += predicates.size(); + stats.num_where_clauses_inlined += predicates.size(); + for (const auto& predicate : predicates) { + const auto text = predicate.expr->toString(); + stats.inlined_conditions.push_back(text); + stats.root_pushdown_conditions.push_back(text); + } +} + +} // namespace + /// Prepare the per-query execution state from the root clause and optional /// temporal snapshot before clause execution begins. arrow::Status Database::init_query_state(const Query& query, @@ -36,7 +63,12 @@ arrow::Status Database::init_query_state(const Query& query, ARROW_RETURN_NOT_OK( query_state.compute_fully_qualified_names(query.root()).status()); - return prepare_query(query, query_state); + ARROW_RETURN_NOT_OK(prepare_query(query, query_state)); + if (query.inline_where()) { + ARROW_ASSIGN_OR_RAISE(query_state.where_plan, + build_where_plan(query, query_state)); + } + return arrow::Status::OK(); } /// Inline any WHERE expressions that can be applied directly to the root alias @@ -44,6 +76,18 @@ arrow::Status Database::init_query_state(const Query& query, arrow::Status Database::inline_root_where(const Query& query, QueryState& query_state, QueryResult& result) const { + if (query.inline_where() && query_state.where_plan.has_value()) { + const auto& root_filters = query_state.where_plan->root_filters; + if (root_filters.empty()) { + return arrow::Status::OK(); + } + + record_root_pushdowns(result, root_filters); + return inline_where(query.root(), query_state.tables[query.root().value()], + query_state, extract_predicates(root_filters)) + .status(); + } + auto where_exps = get_where_to_inline(query.root().value(), 0, query.clauses()); result.mutable_execution_stats().num_where_clauses_inlined += diff --git a/src/query/result_builder.cpp b/src/query/result_builder.cpp index 9afbfa0..5595cbe 100644 --- a/src/query/result_builder.cpp +++ 
b/src/query/result_builder.cpp @@ -790,6 +790,8 @@ arrow::Result> Database::build_result_table( for (const auto& expr : post_where) { result.mutable_execution_stats().num_where_clauses_post_processed++; + result.mutable_execution_stats().post_processed_conditions.push_back( + expr->toString()); IF_DEBUG_ENABLED { log_debug("post process where: {}", expr->toString()); } ARROW_ASSIGN_OR_RAISE(table, filter(table, *expr, false)); } diff --git a/src/query/traverse_executor.cpp b/src/query/traverse_executor.cpp index f0d9098..b9e5e3b 100644 --- a/src/query/traverse_executor.cpp +++ b/src/query/traverse_executor.cpp @@ -3,11 +3,38 @@ namespace tundradb { +namespace { + +std::vector> extract_predicates( + const std::vector& predicates) { + std::vector> exprs; + exprs.reserve(predicates.size()); + for (const auto& predicate : predicates) { + exprs.push_back(predicate.expr); + } + return exprs; +} + +void record_traverse_pushdowns( + QueryResult& result, const std::vector& predicates) { + auto& stats = result.mutable_execution_stats(); + stats.num_where_predicates_pushed_to_traverse += predicates.size(); + stats.num_where_clauses_inlined += predicates.size(); + for (const auto& predicate : predicates) { + const auto text = predicate.expr->toString(); + stats.inlined_conditions.push_back(text); + stats.traverse_pushdown_conditions.push_back(text); + } +} + +} // namespace + /// Execute one TRAVERSE clause by expanding the hop, applying the configured /// join semantics, and refreshing the affected alias tables in QueryState. 
arrow::Status Database::execute_traverse( const std::shared_ptr& traverse, QueryState& query_state, - const Query& query, size_t clause_index, QueryResult& result) const { + const Query& query, size_t clause_index, size_t traverse_index, + QueryResult& result) const { ARROW_ASSIGN_OR_RAISE(const auto source_schema, query_state.resolve_schema(traverse->source())); ARROW_ASSIGN_OR_RAISE(const auto target_schema, @@ -23,18 +50,34 @@ arrow::Status Database::execute_traverse( std::vector> where_clauses; std::vector> edge_where_clauses; - if (query.inline_where()) { - where_clauses = get_where_to_inline(traverse->target().value(), - clause_index + 1, query.clauses()); - } - if (traverse->edge_alias().has_value()) { - edge_where_clauses = get_where_to_inline(traverse->edge_alias().value(), - clause_index + 1, query.clauses()); + if (query.inline_where() && query_state.where_plan.has_value()) { + const auto& where_plan = *query_state.where_plan; + if (traverse_index >= where_plan.traverse_filters.size()) { + return arrow::Status::Invalid("Missing WHERE traverse plan for index ", + traverse_index); + } + + const auto& traverse_plan = where_plan.traverse_filters[traverse_index]; + where_clauses = extract_predicates(traverse_plan.target_filters); + edge_where_clauses = extract_predicates(traverse_plan.edge_filters); + record_traverse_pushdowns(result, traverse_plan.target_filters); + record_traverse_pushdowns(result, traverse_plan.edge_filters); + for (const auto& wc : where_clauses) wc->set_inlined(true); + for (const auto& wc : edge_where_clauses) wc->set_inlined(true); + } else { + if (query.inline_where()) { + where_clauses = get_where_to_inline(traverse->target().value(), + clause_index + 1, query.clauses()); + } + if (traverse->edge_alias().has_value()) { + edge_where_clauses = get_where_to_inline( + traverse->edge_alias().value(), clause_index + 1, query.clauses()); + } + for (const auto& wc : where_clauses) wc->set_inlined(true); + for (const auto& wc : 
edge_where_clauses) wc->set_inlined(true); + result.mutable_execution_stats().num_where_clauses_inlined += + where_clauses.size() + edge_where_clauses.size(); } - for (const auto& wc : where_clauses) wc->set_inlined(true); - for (const auto& wc : edge_where_clauses) wc->set_inlined(true); - result.mutable_execution_stats().num_where_clauses_inlined += - where_clauses.size() + edge_where_clauses.size(); IF_DEBUG_ENABLED { log_debug("Processing TRAVERSE {}-({})->{}", traverse->source().toString(), @@ -61,11 +104,14 @@ arrow::Status Database::execute_traverse( llvm::DenseSet all_target_ids; if (traverse->traverse_type() == TraverseType::Right || traverse->traverse_type() == TraverseType::Full) { - all_target_ids = - get_ids_from_table( - get_table(target_schema, query_state.temporal_context.get()) - .ValueOrDie()) - .ValueOrDie(); + ARROW_ASSIGN_OR_RAISE( + auto all_target_table, + get_table(target_schema, query_state.temporal_context.get())); + for (const auto& predicate : where_clauses) { + ARROW_ASSIGN_OR_RAISE(all_target_table, + filter(all_target_table, *predicate, true)); + } + ARROW_ASSIGN_OR_RAISE(all_target_ids, get_ids_from_table(all_target_table)); } const bool is_self_join = source_schema == target_schema; diff --git a/src/query/where_planner.cpp b/src/query/where_planner.cpp index d429d44..5653a48 100644 --- a/src/query/where_planner.cpp +++ b/src/query/where_planner.cpp @@ -34,6 +34,81 @@ struct AliasActivation { AliasKind kind; }; +/** + * @brief Record the clause index for each prepared traverse in query order. + * + * The planner needs this mapping because a predicate is only pushable if it + * can be consumed before the original WHERE clause position without crossing a + * traverse that changes nullability for that alias. 
+ */ +std::vector build_traverse_clause_indices(const Query& query) { + std::vector indices; + indices.reserve(query.clauses().size()); + for (size_t clause_index = 0; clause_index < query.clauses().size(); + ++clause_index) { + if (query.clauses()[clause_index]->type() == Clause::Type::TRAVERSE) { + indices.push_back(clause_index); + } + } + return indices; +} + +/** + * @brief Return whether this traverse can null-extend the given alias. + * + * Single-alias predicates are only safe to consume early while the alias is + * guaranteed to stay materialized. Once an outer join can produce NULLs for + * that alias, consuming the predicate before the join changes post-WHERE + * semantics into join-condition semantics. + */ +bool alias_becomes_nullable_during_traverse(const std::string& alias, + const AliasKind alias_kind, + const Traverse& traverse) { + switch (alias_kind) { + case AliasKind::Node: + if (alias == traverse.source().value()) { + return traverse.traverse_type() == TraverseType::Right || + traverse.traverse_type() == TraverseType::Full; + } + if (alias == traverse.target().value()) { + return traverse.traverse_type() == TraverseType::Left || + traverse.traverse_type() == TraverseType::Full; + } + return false; + case AliasKind::Edge: + return traverse.edge_alias().has_value() && + alias == traverse.edge_alias().value() && + traverse.traverse_type() != TraverseType::Inner; + } + return false; +} + +/** + * @brief Check whether a planned fragment may be consumed before a WHERE. + * + * We walk from the alias activation site up to the WHERE clause and reject + * pushdown if any traverse along the way can null-extend that alias. In that + * case the fragment stays residual so final row filtering preserves the + * user-visible post-join semantics. 
+ */ +bool can_consume_pushdown_before_clause( + const std::string& alias, const AliasActivation& activation, + size_t where_clause_index, const QueryState& query_state, + const std::vector& traverse_clause_indices) { + const size_t start_traverse_index = activation.traverse_index.value_or(0); + for (size_t traverse_index = start_traverse_index; + traverse_index < query_state.traversals.size(); ++traverse_index) { + if (traverse_clause_indices[traverse_index] >= where_clause_index) { + break; + } + if (alias_becomes_nullable_during_traverse( + alias, activation.kind, query_state.traversals[traverse_index])) { + return false; + } + } + return true; +} + std::shared_ptr combine_with_and(std::shared_ptr left, std::shared_ptr right) { if (!left) return right; @@ -147,6 +222,7 @@ arrow::Result build_where_plan( plan.residual_by_clause.resize(query.clauses().size()); auto activation = build_alias_activation_map(query, query_state); + const auto traverse_clause_indices = build_traverse_clause_indices(query); for (size_t clause_index = 0; clause_index < query.clauses().size(); ++clause_index) { @@ -167,6 +243,14 @@ arrow::Result build_where_plan( "Alias '", alias, "' is not registered for WHERE pushdown"); } + if (!can_consume_pushdown_before_clause(alias, it->second, clause_index, + query_state, + traverse_clause_indices)) { + parts.residual = + combine_with_and(std::move(parts.residual), std::move(expr)); + continue; + } + ARROW_RETURN_NOT_OK(append_pushdown( plan, it->second, PlannedPredicate{clause_index, std::move(expr)})); } diff --git a/tests/join_test.cpp b/tests/join_test.cpp index 3c501ec..e32340d 100644 --- a/tests/join_test.cpp +++ b/tests/join_test.cpp @@ -474,6 +474,7 @@ TEST(JoinTest, MultiPathToSameTarget) { .where( "c2.id", CompareOp::Eq, Value((int64_t)0)) // Filter for friend's company (also IBM ID 0) + .inline_where() .build(); auto query_result = db->query(query); @@ -700,6 +701,7 @@ TEST(JoinTest, LeftJoinTargetWhereFiltersFinalRows) { .traverse("u", 
"friend", "f:users", TraverseType::Inner) .traverse("f", "works-at", "c:companies", TraverseType::Left) .where("c.name", CompareOp::Eq, Value("google")) + .inline_where() .build(); auto query_result = db->query(query); diff --git a/tests/join_where_test.cpp b/tests/join_where_test.cpp index 1ab3511..721bc9c 100644 --- a/tests/join_where_test.cpp +++ b/tests/join_where_test.cpp @@ -163,6 +163,7 @@ TEST_F(JoinWhereTest, InnerJoinTargetWhereFiltersMatchedRows) { .traverse("u", "works-at", "c:companies", TraverseType::Inner) .where("c.name", CompareOp::Eq, Value("google"s)) .select({"u.name", "c.name"}) + .inline_where() .build(); expect_query_output(query, query_text, expected_table); @@ -192,6 +193,7 @@ TEST_F(JoinWhereTest, InnerJoinSourceWhereFiltersBeforeTraverse) { .traverse("u", "works-at", "c:companies", TraverseType::Inner) .where("u.name", CompareOp::Eq, Value("alex"s)) .select({"u.name", "c.name"}) + .inline_where() .build(); expect_query_output(query, query_text, expected_table); @@ -222,6 +224,7 @@ TEST_F(JoinWhereTest, InnerJoinSourceAndTargetWhereMatchesSingleRow) { .where("u.name", CompareOp::Eq, Value("alex"s)) .and_where("c.name", CompareOp::Eq, Value("google"s)) .select({"u.name", "c.name"}) + .inline_where() .build(); expect_query_output(query, query_text, expected_table); @@ -250,6 +253,7 @@ TEST_F(JoinWhereTest, LeftJoinSourceWhereKeepsNullExtendedSourceRow) { .traverse("u", "works-at", "c:companies", TraverseType::Left) .where("u.name", CompareOp::Eq, Value("jeff"s)) .select({"u.name", "c.name"}) + .inline_where() .build(); expect_query_output(query, query_text, expected_table); @@ -279,6 +283,7 @@ TEST_F(JoinWhereTest, LeftJoinSourceAndTargetWhereMatchesQualifiedRow) { .where("u.name", CompareOp::Eq, Value("alex"s)) .and_where("c.name", CompareOp::Eq, Value("google"s)) .select({"u.name", "c.name"}) + .inline_where() .build(); expect_query_output(query, query_text, expected_table); @@ -310,6 +315,7 @@ TEST_F(JoinWhereTest, 
LeftJoinMixedAliasOrRemainsResidual) { .where("c.name", CompareOp::Eq, Value("google"s)) .or_where("u.name", CompareOp::Eq, Value("jeff"s)) .select({"u.name", "c.name"}) + .inline_where() .build(); expect_query_output(query, query_text, expected_table); @@ -325,8 +331,7 @@ Expected output table: u.name | c.name alex | google */ -TEST_F(JoinWhereTest, - DISABLED_LeftJoinTargetWhereShouldFilterOutNullExtendedRows) { +TEST_F(JoinWhereTest, LeftJoinTargetWhereShouldFilterOutNullExtendedRows) { const std::string query_text = "MATCH (u:users)-[:works-at LEFT]->(c:companies)\n" "WHERE c.name = \"google\"\n" @@ -339,6 +344,7 @@ TEST_F(JoinWhereTest, .traverse("u", "works-at", "c:companies", TraverseType::Left) .where("c.name", CompareOp::Eq, Value("google"s)) .select({"u.name", "c.name"}) + .inline_where() .build(); expect_query_output(query, query_text, expected_table); @@ -354,7 +360,7 @@ Expected output table: u.name | c.name alex | google */ -TEST_F(JoinWhereTest, DISABLED_RightJoinSourceWhereDropsNullSourceRows) { +TEST_F(JoinWhereTest, RightJoinSourceWhereDropsNullSourceRows) { const std::string query_text = "MATCH (u:users)-[:works-at RIGHT]->(c:companies)\n" "WHERE u.name = \"alex\"\n" @@ -368,6 +374,7 @@ TEST_F(JoinWhereTest, DISABLED_RightJoinSourceWhereDropsNullSourceRows) { .traverse("u", "works-at", "c:companies", TraverseType::Right) .where("u.name", CompareOp::Eq, Value("alex"s)) .select({"u.name", "c.name"}) + .inline_where() .build(); expect_query_output(query, query_text, expected_table); @@ -397,6 +404,7 @@ TEST_F(JoinWhereTest, RightJoinTargetWhereKeepsUnmatchedTargetRow) { .traverse("u", "works-at", "c:companies", TraverseType::Right) .where("c.name", CompareOp::Eq, Value("meta"s)) .select({"u.name", "c.name"}) + .inline_where() .build(); expect_query_output(query, query_text, expected_table); @@ -412,7 +420,7 @@ Expected output table: u.name | c.name jeff | NULL */ -TEST_F(JoinWhereTest, DISABLED_FullJoinSourceWhereKeepsUnmatchedSourceRow) { 
+TEST_F(JoinWhereTest, FullJoinSourceWhereKeepsUnmatchedSourceRow) { const std::string query_text = "MATCH (u:users)-[:works-at FULL]->(c:companies)\n" "WHERE u.name = \"jeff\"\n" @@ -425,6 +433,7 @@ TEST_F(JoinWhereTest, DISABLED_FullJoinSourceWhereKeepsUnmatchedSourceRow) { .traverse("u", "works-at", "c:companies", TraverseType::Full) .where("u.name", CompareOp::Eq, Value("jeff"s)) .select({"u.name", "c.name"}) + .inline_where() .build(); expect_query_output(query, query_text, expected_table); @@ -440,7 +449,7 @@ Expected output table: u.name | c.name NULL | meta */ -TEST_F(JoinWhereTest, DISABLED_FullJoinTargetWhereKeepsUnmatchedTargetRow) { +TEST_F(JoinWhereTest, FullJoinTargetWhereKeepsUnmatchedTargetRow) { const std::string query_text = "MATCH (u:users)-[:works-at FULL]->(c:companies)\n" "WHERE c.name = \"meta\"\n" @@ -453,6 +462,7 @@ TEST_F(JoinWhereTest, DISABLED_FullJoinTargetWhereKeepsUnmatchedTargetRow) { .traverse("u", "works-at", "c:companies", TraverseType::Full) .where("c.name", CompareOp::Eq, Value("meta"s)) .select({"u.name", "c.name"}) + .inline_where() .build(); expect_query_output(query, query_text, expected_table); @@ -485,6 +495,7 @@ TEST_F(JoinWhereTest, TwoHopInnerJoinMiddleAndTargetWhere) { .where("f.name", CompareOp::Eq, Value("bob"s)) .and_where("c.name", CompareOp::Eq, Value("acme"s)) .select({"u.name", "f.name", "c.name"}) + .inline_where() .build(); expect_query_output(query, query_text, expected_table); @@ -500,8 +511,7 @@ Expected output table: u.name | f.name | c.name alex | jeff | NULL */ -TEST_F(JoinWhereTest, - DISABLED_TwoHopLeftJoinMiddleWhereKeepsNullExtendedTargetRow) { +TEST_F(JoinWhereTest, TwoHopLeftJoinMiddleWhereKeepsNullExtendedTargetRow) { const std::string query_text = "MATCH (u:users)-[:friend INNER]->(f:users)-[:works-at LEFT]->" "(c:companies)\n" @@ -516,6 +526,7 @@ TEST_F(JoinWhereTest, .traverse("f", "works-at", "c:companies", TraverseType::Left) .where("f.name", CompareOp::Eq, Value("jeff"s)) .select({"u.name", 
"f.name", "c.name"}) + .inline_where() .build(); expect_query_output(query, query_text, expected_table); @@ -548,6 +559,7 @@ TEST_F(JoinWhereTest, TwoHopLeftJoinRootMiddleAndTargetWhere) { .and_where("f.name", CompareOp::Eq, Value("bob"s)) .and_where("c.name", CompareOp::Eq, Value("acme"s)) .select({"u.name", "f.name", "c.name"}) + .inline_where() .build(); expect_query_output(query, query_text, expected_table); diff --git a/tests/where_expression_test.cpp b/tests/where_expression_test.cpp index 9f23c6e..6a61335 100644 --- a/tests/where_expression_test.cpp +++ b/tests/where_expression_test.cpp @@ -601,8 +601,11 @@ TEST_F(WhereExpressionTest, TraversalWhereCombinations3) { ASSERT_EQ(table->num_rows(), 0); const auto& stats = result.ValueOrDie()->execution_stats(); - EXPECT_EQ(stats.num_where_clauses_inlined, 2); - EXPECT_EQ(stats.num_where_clauses_post_processed, 1); + EXPECT_EQ(stats.num_where_clauses_inlined, 4); + EXPECT_EQ(stats.num_where_clauses_post_processed, 0); + EXPECT_EQ(stats.num_where_predicates_pushed_to_root, 2); + EXPECT_EQ(stats.num_where_predicates_pushed_to_traverse, 2); + EXPECT_EQ(stats.num_where_predicates_deferred, 0); } TEST_F(WhereExpressionTest, QueryMaterializesMapColumn) { @@ -677,4 +680,4 @@ TEST_F(WhereExpressionTest, QueryFiltersByMapProperty) { EXPECT_EQ(names[0], "Anna"); } -} // namespace tundradb \ No newline at end of file +} // namespace tundradb From beabc12cb8b6727c933150ee4d92826347981ec0 Mon Sep 17 00:00:00 2001 From: dmgcodevil Date: Tue, 21 Apr 2026 17:12:12 -0400 Subject: [PATCH 4/7] experiment with pushdowns 3 --- include/query/execution.hpp | 3 ++- include/query/query.hpp | 4 ++++ include/query/where_planner.hpp | 42 ++++++++++++++++++++++++++++++--- src/query/execution.cpp | 7 ++++-- src/query/query_bootstrap.cpp | 21 ++++++++++------- src/query/traverse_executor.cpp | 21 ++++++++++------- src/query/where_planner.cpp | 10 ++++++-- tests/join_where_test.cpp | 10 +++++++- tests/where_planner_test.cpp | 37 
+++++++++++++++++++++++++++++ 9 files changed, 129 insertions(+), 26 deletions(-) diff --git a/include/query/execution.hpp b/include/query/execution.hpp index 9178567..8c4d826 100644 --- a/include/query/execution.hpp +++ b/include/query/execution.hpp @@ -726,7 +726,8 @@ std::vector> get_where_to_inline( arrow::Result> inline_where( const SchemaRef& ref, std::shared_ptr table, QueryState& query_state, - const std::vector>& where_exprs); + const std::vector>& where_exprs, + bool mark_inlined = true); /** * @brief Prepares a query for execution: registers aliases, resolves fields, diff --git a/include/query/query.hpp b/include/query/query.hpp index bb15d9a..42fe0f4 100644 --- a/include/query/query.hpp +++ b/include/query/query.hpp @@ -696,11 +696,15 @@ struct QueryExecutionStats { int num_where_clauses_post_processed = 0; int num_where_predicates_pushed_to_root = 0; int num_where_predicates_pushed_to_traverse = 0; + int num_where_predicates_prefiltered_at_root = 0; + int num_where_predicates_prefiltered_at_traverse = 0; int num_where_predicates_deferred = 0; std::vector inlined_conditions; // For debugging std::vector post_processed_conditions; // For debugging std::vector root_pushdown_conditions; std::vector traverse_pushdown_conditions; + std::vector root_prefilter_conditions; + std::vector traverse_prefilter_conditions; std::vector deferred_conditions; }; diff --git a/include/query/where_planner.hpp b/include/query/where_planner.hpp index c68d0a7..340869f 100644 --- a/include/query/where_planner.hpp +++ b/include/query/where_planner.hpp @@ -11,7 +11,27 @@ namespace tundradb { struct QueryState; /** - * @brief One predicate fragment assigned to a concrete execution phase. + * @brief Metadata describing how a planned predicate participates in the plan. + * + * This enum does not decide where or whether execution applies a predicate. 
+ * Execution is driven by the plan shape itself: + * - predicates present in `root_filters` / `traverse_filters` are applied early + * - predicates present in `residual_by_clause` are applied later + * + * `mode` exists to make the planner output explicit and to keep execution + * statistics honest. + * + * `Consume` means the fragment appears only in the early phase of the plan. + * `PrefilterOnly` means the fragment appears in an early phase and is also + * retained in `residual_by_clause`. + */ +enum class PlannedPredicateMode { + Consume, + PrefilterOnly, +}; + +/** + * @brief One predicate fragment scheduled by the WHERE planner. * * The planner preserves the original query clause position so execution can * keep user-written order when multiple predicates are pulled back into the @@ -23,11 +43,16 @@ struct QueryState; struct PlannedPredicate { size_t source_clause_index; ///< Original WHERE clause position in Query. std::shared_ptr expr; + ///< Descriptive planner metadata; not the source of execution truth. + PlannedPredicateMode mode = PlannedPredicateMode::Consume; }; /** * @brief Predicates that can be applied while executing one traverse hop. * + * Each predicate carries a @c PlannedPredicateMode describing whether the + * planner also retained that same fragment in `residual_by_clause`. + * * `target_filters` apply to the hop's target node alias. * `edge_filters` apply to the hop's optional edge alias. */ @@ -44,6 +69,13 @@ struct TraverseWherePlan { * - `traverse_filters[i]` run while executing `query_state.traversals[i]`. * - `residual_by_clause[i]` is appended when visiting clause `i` in the * normal clause loop and applied later on the denormalized result table. 
+ * + * A fragment may therefore appear: + * - only in `root_filters` / `traverse_filters` + * - both in an early filter vector and in `residual_by_clause` + * + * `PlannedPredicateMode` documents which of those two layouts the planner + * chose, but execution still follows the containers above. */ struct WhereExecutionPlan { std::vector root_filters; @@ -59,10 +91,14 @@ struct WhereExecutionPlan { * - All aliases referenced by WHERE expressions are registered in * `query_state`. * - * Safe split rules: - * - A subtree that references exactly one alias is pushable as-is. + * Planning rules: + * - A subtree that references exactly one alias is a pushdown candidate. * - `AND` is decomposed recursively. * - Mixed-alias `OR` and alias-to-alias comparisons remain residual. + * - If the alias stays non-nullable up to the WHERE clause, the fragment is + * planned as `Consume`. + * - If the alias may become nullable before the WHERE clause, the fragment is + * planned as `PrefilterOnly` and also retained as residual. 
*/ arrow::Result build_where_plan( const Query& query, const QueryState& query_state); diff --git a/src/query/execution.cpp b/src/query/execution.cpp index 2d2b3aa..36b3621 100644 --- a/src/query/execution.cpp +++ b/src/query/execution.cpp @@ -683,7 +683,8 @@ std::vector> get_where_to_inline( arrow::Result> inline_where( const SchemaRef& ref, std::shared_ptr table, QueryState& query_state, - const std::vector>& where_exprs) { + const std::vector>& where_exprs, + bool mark_inlined) { auto curr_table = std::move(table); for (const auto& exp : where_exprs) { IF_DEBUG_ENABLED { log_debug("inline where '{}'", exp->toString()); } @@ -696,7 +697,9 @@ arrow::Result> inline_where( } ARROW_RETURN_NOT_OK(query_state.update_table(result.ValueOrDie(), ref)); curr_table = result.ValueOrDie(); - exp->set_inlined(true); + if (mark_inlined) { + exp->set_inlined(true); + } } return curr_table; } diff --git a/src/query/query_bootstrap.cpp b/src/query/query_bootstrap.cpp index 8d34ad8..0cbfedd 100644 --- a/src/query/query_bootstrap.cpp +++ b/src/query/query_bootstrap.cpp @@ -16,15 +16,20 @@ std::vector> extract_predicates( return exprs; } -void record_root_pushdowns(QueryResult& result, - const std::vector& predicates) { +void record_root_planned_predicates( + QueryResult& result, const std::vector& predicates) { auto& stats = result.mutable_execution_stats(); - stats.num_where_predicates_pushed_to_root += predicates.size(); - stats.num_where_clauses_inlined += predicates.size(); for (const auto& predicate : predicates) { const auto text = predicate.expr->toString(); - stats.inlined_conditions.push_back(text); - stats.root_pushdown_conditions.push_back(text); + if (predicate.mode == PlannedPredicateMode::Consume) { + stats.num_where_predicates_pushed_to_root++; + stats.num_where_clauses_inlined++; + stats.inlined_conditions.push_back(text); + stats.root_pushdown_conditions.push_back(text); + } else { + stats.num_where_predicates_prefiltered_at_root++; + 
stats.root_prefilter_conditions.push_back(text); + } } } @@ -82,9 +87,9 @@ arrow::Status Database::inline_root_where(const Query& query, return arrow::Status::OK(); } - record_root_pushdowns(result, root_filters); + record_root_planned_predicates(result, root_filters); return inline_where(query.root(), query_state.tables[query.root().value()], - query_state, extract_predicates(root_filters)) + query_state, extract_predicates(root_filters), false) .status(); } diff --git a/src/query/traverse_executor.cpp b/src/query/traverse_executor.cpp index b9e5e3b..acf37d8 100644 --- a/src/query/traverse_executor.cpp +++ b/src/query/traverse_executor.cpp @@ -15,15 +15,20 @@ std::vector> extract_predicates( return exprs; } -void record_traverse_pushdowns( +void record_traverse_planned_predicates( QueryResult& result, const std::vector& predicates) { auto& stats = result.mutable_execution_stats(); - stats.num_where_predicates_pushed_to_traverse += predicates.size(); - stats.num_where_clauses_inlined += predicates.size(); for (const auto& predicate : predicates) { const auto text = predicate.expr->toString(); - stats.inlined_conditions.push_back(text); - stats.traverse_pushdown_conditions.push_back(text); + if (predicate.mode == PlannedPredicateMode::Consume) { + stats.num_where_predicates_pushed_to_traverse++; + stats.num_where_clauses_inlined++; + stats.inlined_conditions.push_back(text); + stats.traverse_pushdown_conditions.push_back(text); + } else { + stats.num_where_predicates_prefiltered_at_traverse++; + stats.traverse_prefilter_conditions.push_back(text); + } } } @@ -60,10 +65,8 @@ arrow::Status Database::execute_traverse( const auto& traverse_plan = where_plan.traverse_filters[traverse_index]; where_clauses = extract_predicates(traverse_plan.target_filters); edge_where_clauses = extract_predicates(traverse_plan.edge_filters); - record_traverse_pushdowns(result, traverse_plan.target_filters); - record_traverse_pushdowns(result, traverse_plan.edge_filters); - for (const 
auto& wc : where_clauses) wc->set_inlined(true); - for (const auto& wc : edge_where_clauses) wc->set_inlined(true); + record_traverse_planned_predicates(result, traverse_plan.target_filters); + record_traverse_planned_predicates(result, traverse_plan.edge_filters); } else { if (query.inline_where()) { where_clauses = get_where_to_inline(traverse->target().value(), diff --git a/src/query/where_planner.cpp b/src/query/where_planner.cpp index 5653a48..24b1ea7 100644 --- a/src/query/where_planner.cpp +++ b/src/query/where_planner.cpp @@ -246,13 +246,19 @@ arrow::Result build_where_plan( if (!can_consume_pushdown_before_clause(alias, it->second, clause_index, query_state, traverse_clause_indices)) { + ARROW_RETURN_NOT_OK(append_pushdown( + plan, it->second, + PlannedPredicate{clause_index, expr, + PlannedPredicateMode::PrefilterOnly})); parts.residual = combine_with_and(std::move(parts.residual), std::move(expr)); continue; } - ARROW_RETURN_NOT_OK(append_pushdown( - plan, it->second, PlannedPredicate{clause_index, std::move(expr)})); + ARROW_RETURN_NOT_OK( + append_pushdown(plan, it->second, + PlannedPredicate{clause_index, std::move(expr), + PlannedPredicateMode::Consume})); } plan.residual_by_clause[clause_index] = std::move(parts.residual); } diff --git a/tests/join_where_test.cpp b/tests/join_where_test.cpp index 721bc9c..7fa9e2e 100644 --- a/tests/join_where_test.cpp +++ b/tests/join_where_test.cpp @@ -347,7 +347,15 @@ TEST_F(JoinWhereTest, LeftJoinTargetWhereShouldFilterOutNullExtendedRows) { .inline_where() .build(); - expect_query_output(query, query_text, expected_table); + auto result = db_->query(query); + ASSERT_TRUE(result.ok()) << result.status().ToString(); + auto actual = table_to_test_string(result.ValueOrDie()->table()); + EXPECT_EQ(actual, expected_table) << "Query:\n" << query_text; + + const auto& stats = result.ValueOrDie()->execution_stats(); + EXPECT_EQ(stats.num_where_predicates_pushed_to_traverse, 0); + 
EXPECT_EQ(stats.num_where_predicates_prefiltered_at_traverse, 1); + EXPECT_EQ(stats.num_where_predicates_deferred, 1); } /* diff --git a/tests/where_planner_test.cpp b/tests/where_planner_test.cpp index 6e3e18c..aea5ae2 100644 --- a/tests/where_planner_test.cpp +++ b/tests/where_planner_test.cpp @@ -73,12 +73,17 @@ TEST_F(WherePlannerTest, SplitsAndAcrossRootTargetAndEdge) { ASSERT_EQ(plan.root_filters.size(), 1u); EXPECT_EQ(plan.root_filters[0].source_clause_index, 1u); EXPECT_EQ(plan.root_filters[0].expr->extract_first_variable(), "u"); + EXPECT_EQ(plan.root_filters[0].mode, PlannedPredicateMode::Consume); ASSERT_EQ(plan.traverse_filters.size(), 1u); ASSERT_EQ(plan.traverse_filters[0].target_filters.size(), 1u); ASSERT_EQ(plan.traverse_filters[0].edge_filters.size(), 1u); EXPECT_EQ(plan.traverse_filters[0].target_filters[0].source_clause_index, 1u); EXPECT_EQ(plan.traverse_filters[0].edge_filters[0].source_clause_index, 1u); + EXPECT_EQ(plan.traverse_filters[0].target_filters[0].mode, + PlannedPredicateMode::Consume); + EXPECT_EQ(plan.traverse_filters[0].edge_filters[0].mode, + PlannedPredicateMode::Consume); EXPECT_EQ( plan.traverse_filters[0].target_filters[0].expr->extract_first_variable(), "c"); @@ -109,6 +114,10 @@ TEST_F(WherePlannerTest, PullsLaterAliasFiltersBackToEarliestTraverseSlot) { ASSERT_EQ(plan.traverse_filters[0].target_filters.size(), 2u); EXPECT_EQ(plan.traverse_filters[0].target_filters[0].source_clause_index, 1u); EXPECT_EQ(plan.traverse_filters[0].target_filters[1].source_clause_index, 3u); + EXPECT_EQ(plan.traverse_filters[0].target_filters[0].mode, + PlannedPredicateMode::Consume); + EXPECT_EQ(plan.traverse_filters[0].target_filters[1].mode, + PlannedPredicateMode::Consume); EXPECT_EQ( plan.traverse_filters[0].target_filters[0].expr->extract_first_variable(), "c"); @@ -118,6 +127,8 @@ TEST_F(WherePlannerTest, PullsLaterAliasFiltersBackToEarliestTraverseSlot) { ASSERT_EQ(plan.traverse_filters[1].target_filters.size(), 1u); 
EXPECT_EQ(plan.traverse_filters[1].target_filters[0].source_clause_index, 3u); + EXPECT_EQ(plan.traverse_filters[1].target_filters[0].mode, + PlannedPredicateMode::Consume); EXPECT_EQ( plan.traverse_filters[1].target_filters[0].expr->extract_first_variable(), "r"); @@ -154,4 +165,30 @@ TEST_F(WherePlannerTest, KeepsMixedAliasOrAsResidual) { EXPECT_EQ(plan.residual_by_clause[1]->get_all_variables().size(), 2u); } +TEST_F(WherePlannerTest, PrefiltersNullableLeftTargetAndKeepsResidual) { + auto query = Query::match("u:User") + .traverse("u", "WORKS_AT", "c:Company", TraverseType::Left) + .where("c.z", CompareOp::Eq, Value(int32_t(3))) + .traverse("c", "LOCATED_IN", "r:Region") + .build(); + + QueryState state(registry_); + prepare_state(query, state); + auto plan_res = build_where_plan(query, state); + ASSERT_TRUE(plan_res.ok()) << plan_res.status().ToString(); + const auto& plan = plan_res.ValueOrDie(); + + ASSERT_EQ(plan.traverse_filters.size(), 2u); + ASSERT_EQ(plan.traverse_filters[0].target_filters.size(), 1u); + EXPECT_EQ(plan.traverse_filters[0].target_filters[0].mode, + PlannedPredicateMode::PrefilterOnly); + EXPECT_EQ( + plan.traverse_filters[0].target_filters[0].expr->extract_first_variable(), + "c"); + + ASSERT_EQ(plan.residual_by_clause.size(), 3u); + ASSERT_NE(plan.residual_by_clause[1], nullptr); + EXPECT_EQ(plan.residual_by_clause[1]->extract_first_variable(), "c"); +} + } // namespace tundradb From 0df03c790af4ca1c784378a2c8574a02b0b79dd5 Mon Sep 17 00:00:00 2001 From: dmgcodevil Date: Tue, 21 Apr 2026 21:00:11 -0400 Subject: [PATCH 5/7] experiment with pushdowns 4 --- include/main/database.hpp | 35 +++++- src/main/database.cpp | 29 ++++- src/query/filter_executor.cpp | 187 +++++++++++++++++++++----------- src/query/query_bootstrap.cpp | 72 ++++++------ src/query/traverse_executor.cpp | 35 ++---- 5 files changed, 229 insertions(+), 129 deletions(-) diff --git a/include/main/database.hpp b/include/main/database.hpp index acc6d30..b428682 100644 --- 
a/include/main/database.hpp +++ b/include/main/database.hpp @@ -199,10 +199,37 @@ class Database { const Query &query, size_t clause_index, size_t traverse_index, QueryResult &result) const; - /** Apply a single-variable WHERE filter, or defer to post_where. */ - [[nodiscard]] arrow::Status apply_where_filter( - const std::shared_ptr &where, QueryState &query_state, - std::vector> &post_where) const; + /** High-level action chosen for one WHERE clause in legacy execution mode. */ + struct WhereDisposition { + enum class Kind { + Skip, + Defer, + ApplyToAlias, + }; + + Kind kind = Kind::Skip; + std::string alias; + }; + + enum class PlannedPredicateSite { + Root, + Traverse, + }; + + /** Classify a WHERE clause as skipped, deferred, or directly applicable. */ + [[nodiscard]] arrow::Result classify_where_filter( + const std::shared_ptr &where, + const QueryState &query_state) const; + + /** Record planner-driven WHERE stats for one execution site. */ + void record_planned_predicates( + QueryResult &result, const std::vector &predicates, + PlannedPredicateSite site) const; + + /** Apply a single-alias WHERE clause to an already materialized alias. */ + [[nodiscard]] arrow::Status apply_alias_where( + const std::shared_ptr &where, const std::string &alias, + QueryState &query_state) const; /** Build the final output table: denormalize, populate rows, apply * deferred WHERE filters, and project via SELECT. 
*/ diff --git a/src/main/database.cpp b/src/main/database.cpp index c7a8a13..adf9920 100644 --- a/src/main/database.cpp +++ b/src/main/database.cpp @@ -227,18 +227,35 @@ Database::execute_clauses(const Query& query, QueryState& query_state, auto clause = query.clauses()[i]; switch (clause->type()) { case Clause::Type::WHERE: { - if (query.inline_where() && query_state.where_plan.has_value()) { - const auto residual = query_state.where_plan->residual_by_clause[i]; - if (residual) { + if (query.inline_where()) { + // -------------------------------------------------- + if (!query_state.where_plan.has_value()) { + return arrow::Status::Invalid( + "Missing WHERE plan during inline clause execution"); + } + // -------------------------------------------------- + if (const auto residual = + query_state.where_plan->residual_by_clause[i]) { post_where.push_back(residual); auto& stats = result.mutable_execution_stats(); stats.num_where_predicates_deferred++; stats.deferred_conditions.push_back(residual->toString()); } } else { - ARROW_RETURN_NOT_OK( - apply_where_filter(std::dynamic_pointer_cast(clause), - query_state, post_where)); + auto where = std::dynamic_pointer_cast(clause); + ARROW_ASSIGN_OR_RAISE(const auto disposition, + classify_where_filter(where, query_state)); + switch (disposition.kind) { + case WhereDisposition::Kind::Skip: + break; + case WhereDisposition::Kind::Defer: + post_where.push_back(where); + break; + case WhereDisposition::Kind::ApplyToAlias: + ARROW_RETURN_NOT_OK( + apply_alias_where(where, disposition.alias, query_state)); + break; + } } break; } diff --git a/src/query/filter_executor.cpp b/src/query/filter_executor.cpp index 50ba4cd..a37a5d1 100644 --- a/src/query/filter_executor.cpp +++ b/src/query/filter_executor.cpp @@ -5,98 +5,157 @@ namespace tundradb { -/// Apply a WHERE clause immediately when it targets one materialized alias, or -/// defer it for post-processing when it spans multiple aliases. 
-arrow::Status Database::apply_where_filter( - const std::shared_ptr& where, QueryState& query_state, - std::vector>& post_where) const { +namespace { + +/** + * @brief Apply an Arrow-native filter to one materialized alias table. + * + * This is the fast path for single-alias predicates that Arrow can evaluate + * directly. The caller decides whether to fall back to row-by-row evaluation + * if this returns an error for a predicate that requires row evaluation. + */ +arrow::Result> filter_alias_table( + const std::shared_ptr& table, + const std::shared_ptr& where) { + return filter(table, *where, true); +} + +/** + * @brief Build a boolean mask for row-evaluated filtering of one alias table. + * + * The mask is produced by reloading each live node for @p alias from storage + * and evaluating @p where with the expression engine's row path. The returned + * mask aligns with the current Arrow table rows and is later passed to + * `arrow::compute::Filter(...)`. + */ +arrow::Result> build_row_eval_mask( + const std::shared_ptr& table, + const std::shared_ptr& where, const std::string& alias, + const QueryState& query_state, + const std::shared_ptr& node_manager) { + ARROW_ASSIGN_OR_RAISE(const auto resolved_schema, + query_state.resolve_schema(SchemaRef::parse(alias))); + + llvm::DenseSet keep_ids; + for (const auto id : query_state.ids().at(alias)) { + auto node_res = node_manager->get_node(resolved_schema, id); + if (!node_res.ok()) continue; + ARROW_ASSIGN_OR_RAISE(const bool matches, + where->matches(node_res.ValueOrDie())); + if (matches) { + keep_ids.insert(id); + } + } + + auto id_column = table->GetColumnByName("id"); + if (!id_column) { + return arrow::Status::Invalid("Could not find 'id' column for variable '", + alias, "'"); + } + + arrow::BooleanBuilder mask_builder; + for (int ci = 0; ci < id_column->num_chunks(); ++ci) { + auto ids = + std::static_pointer_cast(id_column->chunk(ci)); + for (int64_t irow = 0; irow < ids->length(); ++irow) { + if 
(ids->IsNull(irow)) { + ARROW_RETURN_NOT_OK(mask_builder.Append(false)); + } else { + ARROW_RETURN_NOT_OK( + mask_builder.Append(keep_ids.contains(ids->Value(irow)))); + } + } + } + + std::shared_ptr mask_array; + ARROW_RETURN_NOT_OK(mask_builder.Finish(&mask_array)); + return mask_array; +} + +/** + * @brief Filter one alias table using row-by-row expression evaluation. + * + * This fallback path is used for predicates that cannot be evaluated by Arrow + * directly, such as nested field access. It builds a boolean mask with + * @ref build_row_eval_mask and then applies that mask to the current table. + */ +arrow::Result> filter_alias_table_with_row_eval( + const std::shared_ptr& table, + const std::shared_ptr& where, const std::string& alias, + const QueryState& query_state, + const std::shared_ptr& node_manager) { + ARROW_ASSIGN_OR_RAISE( + auto mask_array, + build_row_eval_mask(table, where, alias, query_state, node_manager)); + ARROW_ASSIGN_OR_RAISE( + auto filtered_datum, + arrow::compute::Filter(arrow::Datum(table), arrow::Datum(mask_array))); + return filtered_datum.table(); +} + +} // namespace + +/// Decide whether a WHERE clause should be skipped, deferred, or applied to an +/// already materialized alias table in legacy execution mode. 
+arrow::Result Database::classify_where_filter( + const std::shared_ptr& where, + const QueryState& query_state) const { if (where->inlined()) { IF_DEBUG_ENABLED { log_debug("where '{}' is inlined, skip", where->toString()); } - return arrow::Status::OK(); + return WhereDisposition{WhereDisposition::Kind::Skip, ""}; } - auto variables = where->get_all_variables(); + + const auto& variables = where->get_all_variables(); if (variables.empty()) { return arrow::Status::Invalid( "where clause field must have variable " "., actual={}", where->toString()); } + if (variables.size() != 1) { IF_DEBUG_ENABLED { - log_debug("Add compound WHERE expression: '{}' to post process", - where->toString()); + log_debug("Defer compound WHERE expression: '{}'", where->toString()); + } + return WhereDisposition{WhereDisposition::Kind::Defer, ""}; + } + + const std::string alias = *variables.begin(); + if (!query_state.tables.contains(alias)) { + if (!query_state.aliases().contains(alias)) { + return arrow::Status::Invalid("Unknown variable '{}'", alias); } - post_where.emplace_back(where); - return arrow::Status::OK(); + return WhereDisposition{WhereDisposition::Kind::Defer, alias}; } + return WhereDisposition{WhereDisposition::Kind::ApplyToAlias, alias}; +} + +/// Apply a single-alias WHERE clause to one materialized alias table. 
+arrow::Status Database::apply_alias_where( + const std::shared_ptr& where, const std::string& alias, + QueryState& query_state) const { IF_DEBUG_ENABLED { log_debug("Processing WHERE clause: '{}'", where->toString()); } - std::string variable = *variables.begin(); - if (!query_state.tables.contains(variable)) { - if (!query_state.aliases().contains(variable)) { - return arrow::Status::Invalid("Unknown variable '{}'", variable); - } - post_where.emplace_back(where); - return arrow::Status::OK(); - } - auto table = query_state.tables.at(variable); + const auto table = query_state.tables.at(alias); arrow::Result> filtered_table_result = - filter(table, *where, true); - if (!filtered_table_result.ok() && where->requires_row_eval()) { - ARROW_ASSIGN_OR_RAISE( - const auto resolved_schema, - query_state.resolve_schema(SchemaRef::parse(variable))); - - llvm::DenseSet keep_ids; - for (const auto id : query_state.ids()[variable]) { - auto node_res = node_manager_->get_node(resolved_schema, id); - if (!node_res.ok()) continue; - ARROW_ASSIGN_OR_RAISE(const bool matches, - where->matches(node_res.ValueOrDie())); - if (matches) { - keep_ids.insert(id); - } - } + where->requires_row_eval() + ? 
filter_alias_table_with_row_eval(table, where, alias, query_state, + node_manager_) + : filter_alias_table(table, where); - auto id_column = table->GetColumnByName("id"); - if (!id_column) { - return arrow::Status::Invalid("Could not find 'id' column for variable '", - variable, "'"); - } - - arrow::BooleanBuilder mask_builder; - for (int ci = 0; ci < id_column->num_chunks(); ++ci) { - auto ids = - std::static_pointer_cast(id_column->chunk(ci)); - for (int64_t irow = 0; irow < ids->length(); ++irow) { - if (ids->IsNull(irow)) { - ARROW_RETURN_NOT_OK(mask_builder.Append(false)); - } else { - ARROW_RETURN_NOT_OK( - mask_builder.Append(keep_ids.contains(ids->Value(irow)))); - } - } - } - - std::shared_ptr mask_array; - ARROW_RETURN_NOT_OK(mask_builder.Finish(&mask_array)); - ARROW_ASSIGN_OR_RAISE( - auto filtered_datum, - arrow::compute::Filter(arrow::Datum(table), arrow::Datum(mask_array))); - filtered_table_result = filtered_datum.table(); - } if (!filtered_table_result.ok()) { log_error("Failed to process where: '{}', error: {}", where->toString(), filtered_table_result.status().ToString()); return filtered_table_result.status(); } + ARROW_RETURN_NOT_OK(query_state.update_table( - filtered_table_result.ValueOrDie(), SchemaRef::parse(variable))); + filtered_table_result.ValueOrDie(), SchemaRef::parse(alias))); return arrow::Status::OK(); } diff --git a/src/query/query_bootstrap.cpp b/src/query/query_bootstrap.cpp index 0cbfedd..da038e7 100644 --- a/src/query/query_bootstrap.cpp +++ b/src/query/query_bootstrap.cpp @@ -16,23 +16,6 @@ std::vector> extract_predicates( return exprs; } -void record_root_planned_predicates( - QueryResult& result, const std::vector& predicates) { - auto& stats = result.mutable_execution_stats(); - for (const auto& predicate : predicates) { - const auto text = predicate.expr->toString(); - if (predicate.mode == PlannedPredicateMode::Consume) { - stats.num_where_predicates_pushed_to_root++; - stats.num_where_clauses_inlined++; - 
stats.inlined_conditions.push_back(text); - stats.root_pushdown_conditions.push_back(text); - } else { - stats.num_where_predicates_prefiltered_at_root++; - stats.root_prefilter_conditions.push_back(text); - } - } -} - } // namespace /// Prepare the per-query execution state from the root clause and optional @@ -76,29 +59,56 @@ arrow::Status Database::init_query_state(const Query& query, return arrow::Status::OK(); } +void Database::record_planned_predicates( + QueryResult& result, const std::vector& predicates, + PlannedPredicateSite site) const { + auto& stats = result.mutable_execution_stats(); + for (const auto& predicate : predicates) { + const auto text = predicate.expr->toString(); + if (predicate.mode == PlannedPredicateMode::Consume) { + stats.num_where_clauses_inlined++; + stats.inlined_conditions.push_back(text); + if (site == PlannedPredicateSite::Root) { + stats.num_where_predicates_pushed_to_root++; + stats.root_pushdown_conditions.push_back(text); + } else { + stats.num_where_predicates_pushed_to_traverse++; + stats.traverse_pushdown_conditions.push_back(text); + } + } else { + if (site == PlannedPredicateSite::Root) { + stats.num_where_predicates_prefiltered_at_root++; + stats.root_prefilter_conditions.push_back(text); + } else { + stats.num_where_predicates_prefiltered_at_traverse++; + stats.traverse_prefilter_conditions.push_back(text); + } + } + } +} + /// Inline any WHERE expressions that can be applied directly to the root alias /// before later clauses run. 
arrow::Status Database::inline_root_where(const Query& query, QueryState& query_state, QueryResult& result) const { - if (query.inline_where() && query_state.where_plan.has_value()) { - const auto& root_filters = query_state.where_plan->root_filters; - if (root_filters.empty()) { - return arrow::Status::OK(); - } + if (!query.inline_where()) { + return arrow::Status::OK(); + } + + if (!query_state.where_plan.has_value()) { + return arrow::Status::Invalid( + "Missing WHERE plan for inline root filtering"); + } - record_root_planned_predicates(result, root_filters); - return inline_where(query.root(), query_state.tables[query.root().value()], - query_state, extract_predicates(root_filters), false) - .status(); + const auto& root_filters = query_state.where_plan->root_filters; + if (root_filters.empty()) { + return arrow::Status::OK(); } - auto where_exps = - get_where_to_inline(query.root().value(), 0, query.clauses()); - result.mutable_execution_stats().num_where_clauses_inlined += - where_exps.size(); + record_planned_predicates(result, root_filters, PlannedPredicateSite::Root); return inline_where(query.root(), query_state.tables[query.root().value()], - query_state, where_exps) + query_state, extract_predicates(root_filters), false) .status(); } diff --git a/src/query/traverse_executor.cpp b/src/query/traverse_executor.cpp index acf37d8..be0ca5b 100644 --- a/src/query/traverse_executor.cpp +++ b/src/query/traverse_executor.cpp @@ -15,23 +15,6 @@ std::vector> extract_predicates( return exprs; } -void record_traverse_planned_predicates( - QueryResult& result, const std::vector& predicates) { - auto& stats = result.mutable_execution_stats(); - for (const auto& predicate : predicates) { - const auto text = predicate.expr->toString(); - if (predicate.mode == PlannedPredicateMode::Consume) { - stats.num_where_predicates_pushed_to_traverse++; - stats.num_where_clauses_inlined++; - stats.inlined_conditions.push_back(text); - 
stats.traverse_pushdown_conditions.push_back(text); - } else { - stats.num_where_predicates_prefiltered_at_traverse++; - stats.traverse_prefilter_conditions.push_back(text); - } - } -} - } // namespace /// Execute one TRAVERSE clause by expanding the hop, applying the configured @@ -55,7 +38,11 @@ arrow::Status Database::execute_traverse( std::vector> where_clauses; std::vector> edge_where_clauses; - if (query.inline_where() && query_state.where_plan.has_value()) { + if (query.inline_where()) { + if (!query_state.where_plan.has_value()) { + return arrow::Status::Invalid("Missing WHERE plan for inline traverse"); + } + const auto& where_plan = *query_state.where_plan; if (traverse_index >= where_plan.traverse_filters.size()) { return arrow::Status::Invalid("Missing WHERE traverse plan for index ", @@ -65,13 +52,13 @@ arrow::Status Database::execute_traverse( const auto& traverse_plan = where_plan.traverse_filters[traverse_index]; where_clauses = extract_predicates(traverse_plan.target_filters); edge_where_clauses = extract_predicates(traverse_plan.edge_filters); - record_traverse_planned_predicates(result, traverse_plan.target_filters); - record_traverse_planned_predicates(result, traverse_plan.edge_filters); + record_planned_predicates(result, traverse_plan.target_filters, + PlannedPredicateSite::Traverse); + record_planned_predicates(result, traverse_plan.edge_filters, + PlannedPredicateSite::Traverse); } else { - if (query.inline_where()) { - where_clauses = get_where_to_inline(traverse->target().value(), - clause_index + 1, query.clauses()); - } + where_clauses = get_where_to_inline(traverse->target().value(), + clause_index + 1, query.clauses()); if (traverse->edge_alias().has_value()) { edge_where_clauses = get_where_to_inline( traverse->edge_alias().value(), clause_index + 1, query.clauses()); From 132f31630bed038b2a6e58122da1bd92128b0e01 Mon Sep 17 00:00:00 2001 From: dmgcodevil Date: Tue, 21 Apr 2026 21:24:17 -0400 Subject: [PATCH 6/7] experiment with 
pushdowns 5 --- include/main/database.hpp | 10 ------- include/query/query.hpp | 49 ++++++++++++++++++++++++++------- include/query/where_planner.hpp | 20 -------------- src/main/database.cpp | 1 - src/query/query_bootstrap.cpp | 34 ++++------------------- src/query/traverse_executor.cpp | 15 +++++++--- tests/join_where_test.cpp | 18 ++++++++++-- tests/where_expression_test.cpp | 29 +++++++++++++++++-- 8 files changed, 96 insertions(+), 80 deletions(-) diff --git a/include/main/database.hpp b/include/main/database.hpp index b428682..ae37591 100644 --- a/include/main/database.hpp +++ b/include/main/database.hpp @@ -211,21 +211,11 @@ class Database { std::string alias; }; - enum class PlannedPredicateSite { - Root, - Traverse, - }; - /** Classify a WHERE clause as skipped, deferred, or directly applicable. */ [[nodiscard]] arrow::Result classify_where_filter( const std::shared_ptr &where, const QueryState &query_state) const; - /** Record planner-driven WHERE stats for one execution site. */ - void record_planned_predicates( - QueryResult &result, const std::vector &predicates, - PlannedPredicateSite site) const; - /** Apply a single-alias WHERE clause to an already materialized alias. */ [[nodiscard]] arrow::Status apply_alias_where( const std::shared_ptr &where, const std::string &alias, diff --git a/include/query/query.hpp b/include/query/query.hpp index 42fe0f4..867d941 100644 --- a/include/query/query.hpp +++ b/include/query/query.hpp @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -688,24 +689,52 @@ class Query { }; }; -/** @brief Counters collected during query execution for diagnostics. */ +/** @brief How a planned predicate behaves relative to residual filtering. */ +enum class PlannedPredicateMode { + Consume, + PrefilterOnly, +}; + +/** @brief The execution phase where a predicate was applied early. 
*/ +enum class PlannedPredicateSite { + Root, + Traverse, +}; + +/** @brief Diagnostic snapshot of one planned predicate application. */ +struct PlannedPredicateStat { + std::string condition; + PlannedPredicateMode mode = PlannedPredicateMode::Consume; +}; + +/** @brief Counters and predicate traces collected during query execution. */ struct QueryExecutionStats { int num_nodes_processed = 0; int num_edges_traversed = 0; int num_where_clauses_inlined = 0; int num_where_clauses_post_processed = 0; - int num_where_predicates_pushed_to_root = 0; - int num_where_predicates_pushed_to_traverse = 0; - int num_where_predicates_prefiltered_at_root = 0; - int num_where_predicates_prefiltered_at_traverse = 0; - int num_where_predicates_deferred = 0; std::vector inlined_conditions; // For debugging std::vector post_processed_conditions; // For debugging - std::vector root_pushdown_conditions; - std::vector traverse_pushdown_conditions; - std::vector root_prefilter_conditions; - std::vector traverse_prefilter_conditions; + std::map> + planned_conditions; std::vector deferred_conditions; + + /** + * Record one early predicate application in the execution stats. + * + * This is diagnostics metadata only; actual execution behavior is still + * defined by the presence of predicates in the where plan and residual list. + */ + void record_planned_predicate(PlannedPredicateSite site, + std::string condition, + PlannedPredicateMode mode) { + planned_conditions[site].push_back( + PlannedPredicateStat{.condition = condition, .mode = mode}); + if (mode == PlannedPredicateMode::Consume) { + num_where_clauses_inlined++; + inlined_conditions.push_back(planned_conditions[site].back().condition); + } + } }; /** @brief Holds the output Arrow table and execution statistics from a query. 
diff --git a/include/query/where_planner.hpp b/include/query/where_planner.hpp index 340869f..770244d 100644 --- a/include/query/where_planner.hpp +++ b/include/query/where_planner.hpp @@ -10,26 +10,6 @@ namespace tundradb { struct QueryState; -/** - * @brief Metadata describing how a planned predicate participates in the plan. - * - * This enum does not decide where or whether execution applies a predicate. - * Execution is driven by the plan shape itself: - * - predicates present in `root_filters` / `traverse_filters` are applied early - * - predicates present in `residual_by_clause` are applied later - * - * `mode` exists to make the planner output explicit and to keep execution - * statistics honest. - * - * `Consume` means the fragment appears only in the early phase of the plan. - * `PrefilterOnly` means the fragment appears in an early phase and is also - * retained in `residual_by_clause`. - */ -enum class PlannedPredicateMode { - Consume, - PrefilterOnly, -}; - /** * @brief One predicate fragment scheduled by the WHERE planner. 
* diff --git a/src/main/database.cpp b/src/main/database.cpp index adf9920..f4d014c 100644 --- a/src/main/database.cpp +++ b/src/main/database.cpp @@ -238,7 +238,6 @@ Database::execute_clauses(const Query& query, QueryState& query_state, query_state.where_plan->residual_by_clause[i]) { post_where.push_back(residual); auto& stats = result.mutable_execution_stats(); - stats.num_where_predicates_deferred++; stats.deferred_conditions.push_back(residual->toString()); } } else { diff --git a/src/query/query_bootstrap.cpp b/src/query/query_bootstrap.cpp index da038e7..7c632db 100644 --- a/src/query/query_bootstrap.cpp +++ b/src/query/query_bootstrap.cpp @@ -59,34 +59,6 @@ arrow::Status Database::init_query_state(const Query& query, return arrow::Status::OK(); } -void Database::record_planned_predicates( - QueryResult& result, const std::vector& predicates, - PlannedPredicateSite site) const { - auto& stats = result.mutable_execution_stats(); - for (const auto& predicate : predicates) { - const auto text = predicate.expr->toString(); - if (predicate.mode == PlannedPredicateMode::Consume) { - stats.num_where_clauses_inlined++; - stats.inlined_conditions.push_back(text); - if (site == PlannedPredicateSite::Root) { - stats.num_where_predicates_pushed_to_root++; - stats.root_pushdown_conditions.push_back(text); - } else { - stats.num_where_predicates_pushed_to_traverse++; - stats.traverse_pushdown_conditions.push_back(text); - } - } else { - if (site == PlannedPredicateSite::Root) { - stats.num_where_predicates_prefiltered_at_root++; - stats.root_prefilter_conditions.push_back(text); - } else { - stats.num_where_predicates_prefiltered_at_traverse++; - stats.traverse_prefilter_conditions.push_back(text); - } - } - } -} - /// Inline any WHERE expressions that can be applied directly to the root alias /// before later clauses run. 
arrow::Status Database::inline_root_where(const Query& query, @@ -106,7 +78,11 @@ arrow::Status Database::inline_root_where(const Query& query, return arrow::Status::OK(); } - record_planned_predicates(result, root_filters, PlannedPredicateSite::Root); + auto& stats = result.mutable_execution_stats(); + for (const auto& predicate : root_filters) { + stats.record_planned_predicate(PlannedPredicateSite::Root, + predicate.expr->toString(), predicate.mode); + } return inline_where(query.root(), query_state.tables[query.root().value()], query_state, extract_predicates(root_filters), false) .status(); diff --git a/src/query/traverse_executor.cpp b/src/query/traverse_executor.cpp index be0ca5b..5d479ec 100644 --- a/src/query/traverse_executor.cpp +++ b/src/query/traverse_executor.cpp @@ -52,10 +52,17 @@ arrow::Status Database::execute_traverse( const auto& traverse_plan = where_plan.traverse_filters[traverse_index]; where_clauses = extract_predicates(traverse_plan.target_filters); edge_where_clauses = extract_predicates(traverse_plan.edge_filters); - record_planned_predicates(result, traverse_plan.target_filters, - PlannedPredicateSite::Traverse); - record_planned_predicates(result, traverse_plan.edge_filters, - PlannedPredicateSite::Traverse); + auto& stats = result.mutable_execution_stats(); + for (const auto& predicate : traverse_plan.target_filters) { + stats.record_planned_predicate(PlannedPredicateSite::Traverse, + predicate.expr->toString(), + predicate.mode); + } + for (const auto& predicate : traverse_plan.edge_filters) { + stats.record_planned_predicate(PlannedPredicateSite::Traverse, + predicate.expr->toString(), + predicate.mode); + } } else { where_clauses = get_where_to_inline(traverse->target().value(), clause_index + 1, query.clauses()); diff --git a/tests/join_where_test.cpp b/tests/join_where_test.cpp index 7fa9e2e..9fa108e 100644 --- a/tests/join_where_test.cpp +++ b/tests/join_where_test.cpp @@ -353,9 +353,21 @@ TEST_F(JoinWhereTest, 
LeftJoinTargetWhereShouldFilterOutNullExtendedRows) { EXPECT_EQ(actual, expected_table) << "Query:\n" << query_text; const auto& stats = result.ValueOrDie()->execution_stats(); - EXPECT_EQ(stats.num_where_predicates_pushed_to_traverse, 0); - EXPECT_EQ(stats.num_where_predicates_prefiltered_at_traverse, 1); - EXPECT_EQ(stats.num_where_predicates_deferred, 1); + const auto it = stats.planned_conditions.find(PlannedPredicateSite::Traverse); + ASSERT_NE(it, stats.planned_conditions.end()); + EXPECT_EQ(std::ranges::count_if(it->second.begin(), it->second.end(), + [](const PlannedPredicateStat& stat) { + return stat.mode == + PlannedPredicateMode::Consume; + }), + 0); + EXPECT_EQ(std::ranges::count_if(it->second.begin(), it->second.end(), + [](const PlannedPredicateStat& stat) { + return stat.mode == + PlannedPredicateMode::PrefilterOnly; + }), + 1); + EXPECT_EQ(stats.deferred_conditions.size(), 1u); } /* diff --git a/tests/where_expression_test.cpp b/tests/where_expression_test.cpp index 6a61335..9196388 100644 --- a/tests/where_expression_test.cpp +++ b/tests/where_expression_test.cpp @@ -1,6 +1,7 @@ #include #include +#include #include #include #include @@ -20,6 +21,24 @@ using namespace std::string_literals; using namespace tundradb; namespace tundradb { + +namespace { + +size_t count_planned_predicates(const QueryExecutionStats& stats, + PlannedPredicateSite site, + PlannedPredicateMode mode) { + const auto it = stats.planned_conditions.find(site); + if (it == stats.planned_conditions.end()) { + return 0; + } + + return static_cast(std::ranges::count_if( + it->second, + [mode](const PlannedPredicateStat& stat) { return stat.mode == mode; })); +} + +} // namespace + class WhereExpressionTest : public ::testing::Test { protected: void SetUp() override { @@ -603,9 +622,13 @@ TEST_F(WhereExpressionTest, TraversalWhereCombinations3) { const auto& stats = result.ValueOrDie()->execution_stats(); EXPECT_EQ(stats.num_where_clauses_inlined, 4); 
EXPECT_EQ(stats.num_where_clauses_post_processed, 0); - EXPECT_EQ(stats.num_where_predicates_pushed_to_root, 2); - EXPECT_EQ(stats.num_where_predicates_pushed_to_traverse, 2); - EXPECT_EQ(stats.num_where_predicates_deferred, 0); + EXPECT_EQ(count_planned_predicates(stats, PlannedPredicateSite::Root, + PlannedPredicateMode::Consume), + 2u); + EXPECT_EQ(count_planned_predicates(stats, PlannedPredicateSite::Traverse, + PlannedPredicateMode::Consume), + 2u); + EXPECT_TRUE(stats.deferred_conditions.empty()); } TEST_F(WhereExpressionTest, QueryMaterializesMapColumn) { From dbe2bf69173b85acca398d2f4171f36c3c285c27 Mon Sep 17 00:00:00 2001 From: dmgcodevil Date: Tue, 21 Apr 2026 21:31:59 -0400 Subject: [PATCH 7/7] experiment with pushdowns 6 --- src/query/result_builder.cpp | 340 +---------------------------------- 1 file changed, 4 insertions(+), 336 deletions(-) diff --git a/src/query/result_builder.cpp b/src/query/result_builder.cpp index 5595cbe..85f9b10 100644 --- a/src/query/result_builder.cpp +++ b/src/query/result_builder.cpp @@ -22,332 +22,7 @@ namespace tundradb { -namespace { - -/// Temporary feature flag for the experimental binding-based materializer. -/// -/// Keep this disabled by default so query execution uses the legacy row builder -/// until the binding path covers the full join/WHERE matrix and can replace the -/// old implementation outright. -static constexpr bool kEnableBindingMaterialization = false; - -/// One partially-materialized result row plus the alias bindings that produced -/// it. -/// -/// `row` stores the actual field values accumulated so far. -/// `node_ids` records node alias -> bound node id, or `nullopt` for a LEFT -/// null-extension. -/// `edge_ids` records edge alias -> bound edge id, or `nullopt` when the edge -/// side of a LEFT traversal is absent. 
-struct BindingRow { - std::shared_ptr row; - std::unordered_map> node_ids; - std::unordered_map> edge_ids; -}; - -/// Returns true when every traversal in the query can use the new -/// binding-based materializer. -/// -/// At the moment that path handles INNER and LEFT joins directly. RIGHT and -/// FULL still fall back to the legacy BFS/schema-based builder. -bool supports_binding_materialization(const std::vector& traverses) { - if (!kEnableBindingMaterialization) { - return false; - } - return std::all_of(traverses.begin(), traverses.end(), [](const auto& t) { - return t.traverse_type() == TraverseType::Inner || - t.traverse_type() == TraverseType::Left; - }); -} - -/// Deep-copy a binding row so one input row can branch into multiple output -/// rows when a traversal fans out to several matches. -auto clone_binding_row(const BindingRow& binding) -> BindingRow { - BindingRow copy; - copy.row = std::make_shared(*binding.row); - copy.node_ids = binding.node_ids; - copy.edge_ids = binding.edge_ids; - return copy; -} - -/// Materialize one bound node alias into the row's cell storage. -/// -/// If the alias has projected fields in the final schema, this loads the node -/// by id and writes its values into the row. Otherwise it is a cheap no-op. -auto fill_node_cells(const SchemaRef& ref, int64_t node_id, BindingRow& binding, - const QueryState& query_state) -> arrow::Status { - const auto idx_it = query_state.schema_field_indices().find(ref.value()); - if (idx_it == query_state.schema_field_indices().end()) { - return arrow::Status::OK(); - } - - ARROW_ASSIGN_OR_RAISE(const auto schema_name, - query_state.resolve_schema(ref)); - ARROW_ASSIGN_OR_RAISE(const auto node, query_state.node_manager->get_node( - schema_name, node_id)); - binding.row->set_cell_from_node(idx_it->second, node, - query_state.temporal_context.get()); - return arrow::Status::OK(); -} - -/// Materialize one bound edge alias into the row's cell storage. 
-/// -/// If the edge alias contributes projected fields to the final schema, this -/// loads the edge by id and writes its properties into the row. Otherwise it is -/// a cheap no-op. -auto fill_edge_cells(const std::string& edge_alias, int64_t edge_id, - BindingRow& binding, const QueryState& query_state) - -> arrow::Status { - if (!query_state.edge_store) { - return arrow::Status::Invalid("Edge store not available"); - } - - const auto idx_it = query_state.schema_field_indices().find(edge_alias); - if (idx_it == query_state.schema_field_indices().end()) { - return arrow::Status::OK(); - } - - ARROW_ASSIGN_OR_RAISE(const auto edge_schema, - query_state.get_schema_for_alias(edge_alias)); - ARROW_ASSIGN_OR_RAISE(const auto edge, query_state.edge_store->get(edge_id)); - binding.row->set_cell_from_edge(idx_it->second, edge, edge_schema->fields(), - query_state.temporal_context.get()); - return arrow::Status::OK(); -} - -/// Bind one node alias for the current row. -/// -/// Behavior: -/// - if the alias is unbound, record the supplied id (or NULL binding) -/// - if the alias is already bound, succeed only when the new id matches -/// - when a concrete node id is accepted, materialize that node's fields into -/// the row immediately -auto bind_node_alias(BindingRow& binding, const SchemaRef& ref, - const std::optional& node_id, - const QueryState& query_state) -> arrow::Result { - const auto [it, inserted] = - binding.node_ids.try_emplace(ref.value(), node_id); - if (!inserted) { - return it->second == node_id; - } - - if (!node_id.has_value()) { - return true; - } - - ARROW_RETURN_NOT_OK(fill_node_cells(ref, *node_id, binding, query_state)); - binding.row->id = binding.row->id >= 0 ? binding.row->id : *node_id; - return true; -} - -/// Bind one edge alias for the current row. -/// -/// This mirrors `bind_node_alias(...)`, but for optional edge aliases attached -/// to a TRAVERSE. 
Accepted concrete edge ids are materialized into the row -/// immediately so later stages do not need to revisit the edge store. -auto bind_edge_alias(BindingRow& binding, const std::string& edge_alias, - const std::optional& edge_id, - const QueryState& query_state) -> arrow::Result { - const auto [it, inserted] = binding.edge_ids.try_emplace(edge_alias, edge_id); - if (!inserted) { - return it->second == edge_id; - } - - if (!edge_id.has_value()) { - return true; - } - - ARROW_RETURN_NOT_OK( - fill_edge_cells(edge_alias, *edge_id, binding, query_state)); - return true; -} - -/// Returns true when a recorded graph connection belongs to the given TRAVERSE -/// clause. -/// -/// Matching checks the full traverse identity: source alias, target alias, -/// edge type, and optional edge alias. -bool connection_matches_traverse(const GraphConnection& conn, - const Traverse& traverse) { - return conn.source.value() == traverse.source().value() && - conn.target.value() == traverse.target().value() && - conn.edge_type == traverse.edge_type() && - conn.edge_alias == traverse.edge_alias(); -} - -/// Return the currently valid connections for one bound row traversing one -/// TRAVERSE clause. -/// -/// Starting from the row's already-bound source alias, this filters the stored -/// graph connections down to those that: -/// - match the exact TRAVERSE clause -/// - still point to a live target id in `query_state.ids()[target_alias]` -/// -/// The second rule lets alias-local filtering shrink the acceptable target set -/// before row expansion runs. 
-auto get_live_connections_for_traverse(const BindingRow& binding, - const Traverse& traverse, - const QueryState& query_state) - -> llvm::SmallVector { - llvm::SmallVector matches; - - const auto source_it = binding.node_ids.find(traverse.source().value()); - if (source_it == binding.node_ids.end() || !source_it->second.has_value()) { - return matches; - } - - const int64_t source_id = *source_it->second; - if (!query_state.connections().contains(traverse.source().value())) { - return matches; - } - - const auto& by_source = - query_state.connections().at(traverse.source().value()); - if (!by_source.contains(source_id)) { - return matches; - } - - const auto target_ids_it = query_state.ids().find(traverse.target().value()); - if (target_ids_it == query_state.ids().end()) { - return matches; - } - - const auto& target_ids = target_ids_it->second; - for (const auto& conn : by_source.at(source_id)) { - if (!connection_matches_traverse(conn, traverse)) { - continue; - } - if (!target_ids.contains(conn.target_id)) { - continue; - } - matches.push_back(conn); - } - return matches; -} - -/// Expand one bound row through a single TRAVERSE clause. -/// -/// Semantics: -/// - `INNER`: one output row per surviving connection; zero connections drops -/// the row -/// - `LEFT`: one output row per surviving connection; zero connections keeps -/// the row and binds the introduced target/edge aliases to NULL -/// -/// This is the core of the binding-based materialization model: join behavior -/// is applied exactly where the TRAVERSE is processed instead of being inferred -/// later from graph connectivity. 
-auto expand_traverse_binding(const BindingRow& binding, - const Traverse& traverse, - const QueryState& query_state) - -> arrow::Result> { - std::vector expanded_rows; - auto live_connections = - get_live_connections_for_traverse(binding, traverse, query_state); - - if (live_connections.empty()) { - if (traverse.traverse_type() == TraverseType::Inner) { - return expanded_rows; - } - - auto null_extended = clone_binding_row(binding); - ARROW_ASSIGN_OR_RAISE(const bool target_ok, - bind_node_alias(null_extended, traverse.target(), - std::nullopt, query_state)); - if (!target_ok) { - return expanded_rows; - } - if (traverse.edge_alias().has_value()) { - ARROW_ASSIGN_OR_RAISE( - const bool edge_ok, - bind_edge_alias(null_extended, traverse.edge_alias().value(), - std::nullopt, query_state)); - if (!edge_ok) { - return expanded_rows; - } - } - expanded_rows.push_back(std::move(null_extended)); - return expanded_rows; - } - - expanded_rows.reserve(live_connections.size()); - for (const auto& conn : live_connections) { - auto next = clone_binding_row(binding); - ARROW_ASSIGN_OR_RAISE( - const bool target_ok, - bind_node_alias(next, traverse.target(), conn.target_id, query_state)); - if (!target_ok) { - continue; - } - if (traverse.edge_alias().has_value()) { - ARROW_ASSIGN_OR_RAISE(const bool edge_ok, - bind_edge_alias(next, traverse.edge_alias().value(), - conn.edge_id, query_state)); - if (!edge_ok) { - continue; - } - } - expanded_rows.push_back(std::move(next)); - } - return expanded_rows; -} - -/// Build result rows by expanding bound aliases in clause order. -/// -/// This keeps join semantics local to each TRAVERSE: -/// - INNER drops rows with no matching connection. -/// - LEFT preserves the existing row and null-extends the new target/edge. -/// -/// Unlike the legacy BFS/schema-based builder, this does not infer join -/// behavior from missing target IDs after the fact. 
-auto populate_rows_by_bindings( - const QueryState& query_state, const std::vector& traverses, - const std::shared_ptr& output_schema) - -> arrow::Result>>> { - auto rows = std::make_shared>>(); - const auto root_ids_it = query_state.ids().find(query_state.root.value()); - if (root_ids_it == query_state.ids().end()) { - return rows; - } - - std::vector bindings; - bindings.reserve(root_ids_it->second.size()); - for (const auto root_id : root_ids_it->second) { - BindingRow binding{.row = std::make_shared( - create_empty_row_from_schema(output_schema))}; - ARROW_ASSIGN_OR_RAISE( - const bool root_ok, - bind_node_alias(binding, query_state.root, root_id, query_state)); - if (!root_ok) { - continue; - } - bindings.push_back(std::move(binding)); - } - - for (const auto& traverse : traverses) { - std::vector next_bindings; - for (const auto& binding : bindings) { - ARROW_ASSIGN_OR_RAISE(auto expanded, expand_traverse_binding( - binding, traverse, query_state)); - next_bindings.insert(next_bindings.end(), - std::make_move_iterator(expanded.begin()), - std::make_move_iterator(expanded.end())); - } - bindings = std::move(next_bindings); - if (bindings.empty()) { - break; - } - } - - rows->reserve(bindings.size()); - int64_t row_id = 0; - for (auto& binding : bindings) { - binding.row->id = row_id++; - rows->push_back(std::move(binding.row)); - } - return rows; -} - -} // namespace +namespace {} // namespace /// Starting from one root node, walk the prepared query graph and emit the /// denormalized row variants reachable from that node. 
@@ -775,16 +450,9 @@ arrow::Result> Database::build_result_table( build_denormalized_schema(query_state)); IF_DEBUG_ENABLED { log_debug("output_schema={}", output_schema->ToString()); } - std::shared_ptr>> rows; - if (supports_binding_materialization(query_state.traversals)) { - ARROW_ASSIGN_OR_RAISE( - rows, populate_rows_by_bindings(query_state, query_state.traversals, - output_schema)); - } else { - ARROW_ASSIGN_OR_RAISE(rows, - populate_rows(query.execution_config(), query_state, - query_state.traversals, output_schema)); - } + ARROW_ASSIGN_OR_RAISE(auto rows, + populate_rows(query.execution_config(), query_state, + query_state.traversals, output_schema)); ARROW_ASSIGN_OR_RAISE(auto table, create_table_from_rows(rows, output_schema));