From 418e209384566169ad570c4eeaeaf1adc4833891 Mon Sep 17 00:00:00 2001 From: doylet Date: Thu, 3 Apr 2025 16:25:11 +1100 Subject: [PATCH 01/50] Add --skip-bootstrap to avoid contacting of seed nodes on startup (useful for private networks) --- oxenss/daemon/command_line.cpp | 5 +++++ oxenss/daemon/command_line.h | 1 + oxenss/daemon/oxen-storage.cpp | 7 ++++++- oxenss/snode/service_node.cpp | 4 +++- oxenss/snode/service_node.h | 4 +++- 5 files changed, 18 insertions(+), 3 deletions(-) diff --git a/oxenss/daemon/command_line.cpp b/oxenss/daemon/command_line.cpp index 540008988..e43896eba 100644 --- a/oxenss/daemon/command_line.cpp +++ b/oxenss/daemon/command_line.cpp @@ -175,6 +175,11 @@ parse_result parse_cli_args(int argc, char* argv[]) { "--force-start", options.force_start, "Ignore the initialisation ready check (primarily for debugging)."); + cli.add_flag( + "--skip-bootstrap-nodes", + options.skip_bootstrap, + "Skip the contacting of bootstrap seed nodes on startup (primarily for private node " + "networks)"); cli.add_option( "--stats-access-key", options.stats_access_keys, diff --git a/oxenss/daemon/command_line.h b/oxenss/daemon/command_line.h index 4dd64b089..2f3285833 100644 --- a/oxenss/daemon/command_line.h +++ b/oxenss/daemon/command_line.h @@ -12,6 +12,7 @@ struct command_line_options { uint16_t https_port = 22021; uint16_t omq_quic_port = 22020; std::string oxend_omq_rpc; // Defaults to ipc://$HOME/.oxen/[testnet/]oxend.sock + bool skip_bootstrap = false; bool force_start = false; bool testnet = false; std::string log_level = "info"; diff --git a/oxenss/daemon/oxen-storage.cpp b/oxenss/daemon/oxen-storage.cpp index 6318e1fe5..370880194 100644 --- a/oxenss/daemon/oxen-storage.cpp +++ b/oxenss/daemon/oxen-storage.cpp @@ -153,7 +153,12 @@ int main(int argc, char* argv[]) { auto& oxenmq_server = *oxenmq_server_ptr; snode::ServiceNode service_node{ - l_keys, me, oxenmq_server, options.data_dir, options.force_start}; + l_keys, + me, + oxenmq_server, + 
options.data_dir, + options.force_start, + options.skip_bootstrap}; rpc::RequestHandler request_handler{service_node, channel_encryption, ed_keys.sec}; diff --git a/oxenss/snode/service_node.cpp b/oxenss/snode/service_node.cpp index 8cb8d1990..c9c8bf79b 100644 --- a/oxenss/snode/service_node.cpp +++ b/oxenss/snode/service_node.cpp @@ -50,8 +50,10 @@ ServiceNode::ServiceNode( const contact& contact, server::OMQ& omq_server, const std::filesystem::path& db_location, - const bool force_start) : + bool force_start, + bool skip_bootstrap) : force_start_{force_start}, + skip_bootstrap_{skip_bootstrap}, db_{std::make_unique(db_location)}, our_keys_{keys}, our_contact_{contact}, diff --git a/oxenss/snode/service_node.h b/oxenss/snode/service_node.h index 3c8d76b74..3a09328ef 100644 --- a/oxenss/snode/service_node.h +++ b/oxenss/snode/service_node.h @@ -79,6 +79,7 @@ class ServiceNode { bool active_ = false; std::atomic got_first_response_ = false; bool force_start_ = false; + bool skip_bootstrap_ = false; std::atomic shutting_down_ = false; hf_revision hardfork_ = {0, 0}; uint64_t block_height_ = 0; @@ -170,7 +171,8 @@ class ServiceNode { const contact& contact, server::OMQ& omq_server, const std::filesystem::path& db_location, - bool force_start); + bool force_start, + bool skip_bootstrap); Database& get_db() { return *db_; } const Database& get_db() const { return *db_; } From 7ceff8acd38fe8de28204dfb62ee960fd16e0ce7 Mon Sep 17 00:00:00 2001 From: doylet Date: Mon, 28 Apr 2025 16:02:14 +0900 Subject: [PATCH 02/50] Respect the skip_bootstrap_ flag --- oxenss/snode/service_node.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/oxenss/snode/service_node.cpp b/oxenss/snode/service_node.cpp index c9c8bf79b..f2657df4e 100644 --- a/oxenss/snode/service_node.cpp +++ b/oxenss/snode/service_node.cpp @@ -703,8 +703,9 @@ void ServiceNode::process_snodes_update(std::string_view data) { auto [total, contactable] = network_.contacts.counts(); auto missing = 
total - contactable; - if (total >= (oxenss::is_mainnet ? 100 : 10) && - missing <= MISSING_PUBKEY_THRESHOLD::num * total / MISSING_PUBKEY_THRESHOLD::den) { + if (skip_bootstrap_ || + (total >= (oxenss::is_mainnet ? 100 : 10) && + missing <= MISSING_PUBKEY_THRESHOLD::num * total / MISSING_PUBKEY_THRESHOLD::den)) { log::info( logcat, "Initialized from oxend with {}/{} contactable service nodes", From 8d18d4b53784cc09d9a90c98c79b1303b16f388a Mon Sep 17 00:00:00 2001 From: doylet Date: Fri, 16 May 2025 16:14:58 +1000 Subject: [PATCH 03/50] Get rid of db getter and setter by making it public We don't do anything in particular in the setter or getter that might warrant having those functions (like a lock or something) so we can remove the indirection and call the member directly. --- oxenss/rpc/request_handler.cpp | 53 +++++++++++++++++----------------- oxenss/snode/service_node.cpp | 41 +++++++++++++------------- oxenss/snode/service_node.h | 3 +- 3 files changed, 48 insertions(+), 49 deletions(-) diff --git a/oxenss/rpc/request_handler.cpp b/oxenss/rpc/request_handler.cpp index fdd0d8af6..68559c470 100644 --- a/oxenss/rpc/request_handler.cpp +++ b/oxenss/rpc/request_handler.cpp @@ -565,7 +565,7 @@ void RequestHandler::process_client_req(rpc::store&& req, std::function msgs; bool more = false; try { - std::tie(msgs, more) = service_node_.get_db().retrieve( + std::tie(msgs, more) = service_node_.db->retrieve( req.pubkey, req.msg_namespace, req.last_hash.value_or(""), @@ -870,7 +870,7 @@ void RequestHandler::process_client_req( Response{http::NOT_ACCEPTABLE, "delete_all timestamp too far from current time"sv}); } if (!verify_signature( - service_node_.get_db(), + *service_node_.db, req.pubkey, req.pubkey_ed25519, req.subaccount, @@ -896,7 +896,7 @@ void RequestHandler::process_client_req( handle_action_all_ns( mine, "deleted", - service_node_.get_db().delete_all(req.pubkey), + service_node_.db->delete_all(req.pubkey), req.b64, ed25519_sk_, req.pubkey.prefixed_hex(), @@ 
-906,8 +906,7 @@ void RequestHandler::process_client_req( handle_action_one_ns( mine, "deleted", - service_node_.get_db().delete_all( - req.pubkey, std::get(req.msg_namespace)), + service_node_.db->delete_all(req.pubkey, std::get(req.msg_namespace)), req.b64, ed25519_sk_, req.pubkey.prefixed_hex(), @@ -928,7 +927,7 @@ void RequestHandler::process_client_req(rpc::delete_msgs&& req, std::functionresult["swarm"][service_node_.own_address().pubkey_ed25519.hex()] : res->result; - auto deleted = service_node_.get_db().delete_by_hash(req.pubkey, req.messages); + auto deleted = service_node_.db->delete_by_hash(req.pubkey, req.messages); std::sort(deleted.begin(), deleted.end()); auto sig = create_signature(ed25519_sk_, req.pubkey.prefixed_hex(), req.messages, deleted); mine["deleted"] = std::move(deleted); @@ -1002,7 +1001,7 @@ void RequestHandler::process_client_req( } if (!verify_signature( - service_node_.get_db(), + *service_node_.db, req.pubkey, req.pubkey_ed25519, std::nullopt, // no subaccount allowed @@ -1024,7 +1023,7 @@ void RequestHandler::process_client_req( ? res->result["swarm"][service_node_.own_address().pubkey_ed25519.hex()] : res->result; - service_node_.get_db().revoke_subaccounts(req.pubkey, req.revoke); + service_node_.db->revoke_subaccounts(req.pubkey, req.revoke); auto sig = create_signature(ed25519_sk_, req.pubkey.prefixed_hex(), req.timestamp, req.revoke); mine["signature"] = req.b64 ? oxenc::to_base64(sig.begin(), sig.end()) : util::view_guts(sig); if (req.recurse) @@ -1055,7 +1054,7 @@ void RequestHandler::process_client_req( } if (!verify_signature( - service_node_.get_db(), + *service_node_.db, req.pubkey, req.pubkey_ed25519, std::nullopt, // no subaccount allowed @@ -1077,7 +1076,7 @@ void RequestHandler::process_client_req( ? 
res->result["swarm"][service_node_.own_address().pubkey_ed25519.hex()] : res->result; - mine["count"] = service_node_.get_db().unrevoke_subaccounts(req.pubkey, req.unrevoke); + mine["count"] = service_node_.db->unrevoke_subaccounts(req.pubkey, req.unrevoke); auto sig = create_signature(ed25519_sk_, req.pubkey.prefixed_hex(), req.timestamp, req.unrevoke); mine["signature"] = req.b64 ? oxenc::to_base64(sig.begin(), sig.end()) : util::view_guts(sig); @@ -1106,7 +1105,7 @@ void RequestHandler::process_client_req( } if (!verify_signature( - service_node_.get_db(), + *service_node_.db, req.pubkey, std::nullopt, std::nullopt, // no subaccount allowed @@ -1122,7 +1121,7 @@ void RequestHandler::process_client_req( std::vector revoked_subaccounts; try { - revoked_subaccounts = service_node_.get_db().revoked_subaccounts(req.pubkey); + revoked_subaccounts = service_node_.db->revoked_subaccounts(req.pubkey); } catch (const std::exception& e) { auto msg = fmt::format( "Internal Server Error. Could not retrieve revoked_subaccounts for {}", @@ -1165,7 +1164,7 @@ void RequestHandler::process_client_req( } if (!verify_signature( - service_node_.get_db(), + *service_node_.db, req.pubkey, req.pubkey_ed25519, req.subaccount, @@ -1191,7 +1190,7 @@ void RequestHandler::process_client_req( handle_action_all_ns( mine, "deleted", - service_node_.get_db().delete_by_timestamp(req.pubkey, req.before), + service_node_.db->delete_by_timestamp(req.pubkey, req.before), req.b64, ed25519_sk_, req.pubkey.prefixed_hex(), @@ -1201,7 +1200,7 @@ void RequestHandler::process_client_req( handle_action_one_ns( mine, "deleted", - service_node_.get_db().delete_by_timestamp( + service_node_.db->delete_by_timestamp( req.pubkey, std::get(req.msg_namespace), req.before), req.b64, ed25519_sk_, @@ -1232,7 +1231,7 @@ void RequestHandler::process_client_req(rpc::expire_all&& req, std::functionupdate_all_expiries(req.pubkey, req.expiry), req.b64, ed25519_sk_, req.pubkey.prefixed_hex(), @@ -1267,7 +1266,7 @@ void 
RequestHandler::process_client_req(rpc::expire_all&& req, std::functionupdate_all_expiries( req.pubkey, std::get(req.msg_namespace), req.expiry), req.b64, ed25519_sk_, @@ -1309,7 +1308,7 @@ void RequestHandler::process_client_req(rpc::expire_msgs&& req, std::functionresult["swarm"][service_node_.own_address().pubkey_ed25519.hex()] : res->result; - auto updated = service_node_.get_db().update_expiry( + auto updated = service_node_.db->update_expiry( req.pubkey, req.messages, expiry, @@ -1373,7 +1372,7 @@ void RequestHandler::process_client_req(rpc::expire_msgs&& req, std::functionget_expiries(req.pubkey, unchanged_hashes); } std::vector updated_hash; @@ -1439,7 +1438,7 @@ void RequestHandler::process_client_req(rpc::get_expiries&& req, std::functionget_expiries(req.pubkey, req.messages); return cb(Response{http::OK, std::move(res)}); } @@ -1657,7 +1656,7 @@ void RequestHandler::process_client_req( Response RequestHandler::process_retrieve_all() { std::vector msgs; try { - msgs = service_node_.get_db().retrieve_all(); + msgs = service_node_.db->retrieve_all(); } catch (const std::exception& e) { return {http::INTERNAL_SERVER_ERROR, "could not retrieve all messages"s}; } diff --git a/oxenss/snode/service_node.cpp b/oxenss/snode/service_node.cpp index f2657df4e..10ba7de24 100644 --- a/oxenss/snode/service_node.cpp +++ b/oxenss/snode/service_node.cpp @@ -49,17 +49,17 @@ ServiceNode::ServiceNode( const crypto::legacy_keypair& keys, const contact& contact, server::OMQ& omq_server, - const std::filesystem::path& db_location, + const std::filesystem::path& dblocation, bool force_start, bool skip_bootstrap) : force_start_{force_start}, skip_bootstrap_{skip_bootstrap}, - db_{std::make_unique(db_location)}, our_keys_{keys}, our_contact_{contact}, network_{*omq_server}, omq_server_{omq_server}, - all_stats_{*omq_server} { + all_stats_{*omq_server}, + db{std::make_unique(dblocation)} { mq_servers_.push_back(&omq_server); log::info(logcat, "Requesting initial swarm state"); @@ 
-67,7 +67,7 @@ ServiceNode::ServiceNode( omq_server->add_timer( [this] { std::lock_guard l{sn_mutex_}; - db_->clean_expired(); + db->clean_expired(); }, Database::CLEANUP_PERIOD); @@ -351,10 +351,11 @@ bool ServiceNode::snode_ready(std::string* reason) { std::vector problems; if (!hf_at_least(STORAGE_SERVER_HARDFORK)) - problems.push_back(fmt::format( - "not yet on hardfork {}.{}", - STORAGE_SERVER_HARDFORK.first, - STORAGE_SERVER_HARDFORK.second)); + problems.push_back( + fmt::format( + "not yet on hardfork {}.{}", + STORAGE_SERVER_HARDFORK.first, + STORAGE_SERVER_HARDFORK.second)); if (syncing_) problems.push_back("not done syncing"); @@ -451,7 +452,7 @@ void ServiceNode::check_new_members() { } if (auto send_now = swarm_.extract_ready_members(); !send_now.empty()) { - auto msgs = db_->retrieve_all(); + auto msgs = db->retrieve_all(); log::debug( logcat, "Initiating swarm message dump ({} message) to new swarm member(s): {}", @@ -530,7 +531,7 @@ bool ServiceNode::process_store( all_stats_.bump_store_requests(); /// store in the database (if not already present) - const auto result = db_->store(msg, expiry); + const auto result = db->store(msg, expiry); if (new_msg) *new_msg = result == StoreResult::New; @@ -542,7 +543,7 @@ bool ServiceNode::process_store( void ServiceNode::save_bulk(const std::vector& msgs) { try { - db_->bulk_store(msgs); + db->bulk_store(msgs); } catch (const std::exception& e) { log::error(logcat, "failed to save batch to the database: {}", e.what()); return; @@ -974,7 +975,7 @@ void ServiceNode::bootstrap_swarms(const std::set& swarms) const { std::unordered_map pk_swarm_cache; std::unordered_map> to_relay; - std::vector all_msgs = db_->retrieve_all(); + std::vector all_msgs = db->retrieve_all(); log::debug(logcat, "We have {} messages", all_msgs.size()); for (auto& entry : all_msgs) { if (!entry.pubkey) { @@ -1080,7 +1081,7 @@ std::string ServiceNode::get_stats() const { val["height"] = block_height_; val["target_height"] = target_height_; - 
std::vector counts = db_->get_message_counts(); + std::vector counts = db->get_message_counts(); int64_t total = std::accumulate(counts.begin(), counts.end(), int64_t{0}); counts.erase( @@ -1131,12 +1132,12 @@ std::string ServiceNode::get_stats() const { val["account_msg_mean"] = total / (double)counts.size(); auto& ns_stats = (val["namespace_messages"] = nlohmann::json::object()); - for (auto& [ns, count] : db_->get_namespace_counts()) + for (auto& [ns, count] : db->get_namespace_counts()) ns_stats[fmt::format("{}", ns)] = count; - val["db_used"] = db_->get_used_bytes(); - val["db_total"] = db_->get_total_bytes(); - val["db_max"] = Database::SIZE_LIMIT; + val["dbused"] = db->get_used_bytes(); + val["dbtotal"] = db->get_total_bytes(); + val["dbmax"] = Database::SIZE_LIMIT; return val.dump(); } @@ -1165,9 +1166,9 @@ std::string ServiceNode::get_status_line() const { STORAGE_SERVER_VERSION_STRING, oxenss::is_mainnet ? "" : " (TESTNET)", syncing_ ? "; SYNCING" : "", - db_->get_message_count(), - util::get_human_readable_bytes(db_->get_used_bytes()), - db_->get_owner_count(), + db->get_message_count(), + util::get_human_readable_bytes(db->get_used_bytes()), + db->get_owner_count(), stats.client_store_requests, stats.client_retrieve_requests, stats.onion_requests, diff --git a/oxenss/snode/service_node.h b/oxenss/snode/service_node.h index 3a09328ef..70dd3cff3 100644 --- a/oxenss/snode/service_node.h +++ b/oxenss/snode/service_node.h @@ -174,8 +174,7 @@ class ServiceNode { bool force_start, bool skip_bootstrap); - Database& get_db() { return *db_; } - const Database& get_db() const { return *db_; } + std::unique_ptr db; const Network& network() { return network_; } From dae553a1d7d75be2fcee54e9b8d816a7ab3dc813 Mon Sep 17 00:00:00 2001 From: doylet Date: Fri, 16 May 2025 16:43:10 +1000 Subject: [PATCH 04/50] WIP: Return more extensive SN data ready payload --- oxenss/server/omq.cpp | 28 ++++++++++++++++++++-------- oxenss/server/omq.h | 13 +++++++++++++ 
oxenss/snode/service_node.cpp | 34 +++++++++++++++++++++++++++------- oxenss/snode/service_node.h | 1 - oxenss/storage/database.cpp | 15 +++++++++++++++ oxenss/storage/database.hpp | 3 +++ 6 files changed, 78 insertions(+), 16 deletions(-) diff --git a/oxenss/server/omq.cpp b/oxenss/server/omq.cpp index 182612937..170bb42f8 100644 --- a/oxenss/server/omq.cpp +++ b/oxenss/server/omq.cpp @@ -47,16 +47,28 @@ std::string OMQ::peer_lookup(std::string_view pubkey_bin) const { void OMQ::handle_sn_data_ready(oxenmq::Message& message) { log::debug(logcat, "[OMQ] handle sn.data_ready from: {}", message.conn.to_string()); - auto& xpk_str = message.conn.pubkey(); - if (xpk_str.size() != sizeof(crypto::x25519_pubkey)) - return message.send_reply("Remote not recognized as SN"); + SNDataReadyResponse response = {}; + if (response.status == SNDataReadyStatus::Nil) { + auto& xpk_str = message.conn.pubkey(); + if (xpk_str.size() != sizeof(crypto::x25519_pubkey)) { + response.status = SNDataReadyStatus::RemoteNotRecognizedAsSN; + } else { + crypto::x25519_pubkey xpk; + std::memcpy(xpk.data(), xpk_str.data(), sizeof(crypto::x25519_pubkey)); + if (!service_node_->is_swarm_peer(xpk)) + response.status = SNDataReadyStatus::SwarmMismatch; + } + } - crypto::x25519_pubkey xpk; - std::memcpy(xpk.data(), xpk_str.data(), sizeof(crypto::x25519_pubkey)); - if (!service_node_->is_swarm_peer(xpk)) - return message.send_reply("Swarm mismatch"); + if (response.status == SNDataReadyStatus::Nil) { + response.status = SNDataReadyStatus::OK; + response.newest_timestamp = service_node_->db->retrieve_newest_timestamp(); + } - message.send_reply("OK"); + oxenc::bt_dict_producer dict; + dict.append("s", static_cast(response.status)); + dict.append("t", response.newest_timestamp); + message.send_reply(dict.view()); } void OMQ::handle_sn_data(oxenmq::Message& message) { diff --git a/oxenss/server/omq.h b/oxenss/server/omq.h index 8c6fa1b49..d4106e97b 100644 --- a/oxenss/server/omq.h +++ b/oxenss/server/omq.h 
@@ -28,6 +28,19 @@ namespace snode { namespace oxenss::server { +enum class SNDataReadyStatus { + Nil, + RemoteNotRecognizedAsSN, + SwarmMismatch, + OK, + Count, +}; + +struct SNDataReadyResponse { + SNDataReadyStatus status; + uint64_t newest_timestamp; +}; + class OMQ : public MQBase { oxenmq::OxenMQ omq_; oxenmq::ConnectionID oxend_conn_; diff --git a/oxenss/snode/service_node.cpp b/oxenss/snode/service_node.cpp index 10ba7de24..472368475 100644 --- a/oxenss/snode/service_node.cpp +++ b/oxenss/snode/service_node.cpp @@ -428,19 +428,39 @@ void ServiceNode::check_new_members() { c->pubkey_x25519.view(), "sn.data_ready", [this, pk](bool success, std::vector data) { + + std::string error; + server::SNDataReadyResponse result = {}; if (data.empty()) { - success = false; - data.push_back("Empty reply"s); - } else if (data[0] != "OK"sv) { - success = false; + error = "Empty reply"; + } else { + oxenc::bt_dict_consumer d{data[0]}; + try { + uint32_t status_u32 = d.require("s"sv); + uint32_t last = static_cast(server::SNDataReadyStatus::Count); + if (status_u32 >= last) + error = "SN data ready status was OOB (received {})"_format(last); + else + result.status = static_cast(status_u32); + } catch (const std::exception& e) { + error = "SN data ready status was not a 4 byte unsigned integer"; + } + + try { + result.newest_timestamp = d.require("t"); + } catch (const std::exception& e) { + error = "SN data ready timestamp was not an 8 byte unsigned integer"; + } } - if (!success) { + + if (result.status == server::SNDataReadyStatus::Nil) { log::info( logcat, "Failed to connect to remote SS {} to initiate new " - "data transfer ({}); will retry soon", + "data transfer ({}: {}); will retry soon", pk, - fmt::join(data, ", ")); + fmt::join(data, ", "), + error); return; } log::debug( diff --git a/oxenss/snode/service_node.h b/oxenss/snode/service_node.h index 70dd3cff3..f8e86aed1 100644 --- a/oxenss/snode/service_node.h +++ b/oxenss/snode/service_node.h @@ -85,7 +85,6 @@ class 
ServiceNode { uint64_t block_height_ = 0; uint64_t target_height_ = 0; std::string block_hash_; - std::unique_ptr db_; std::weak_ptr http_; SnodeStatus status_ = SnodeStatus::UNKNOWN; diff --git a/oxenss/storage/database.cpp b/oxenss/storage/database.cpp index 3b91fb286..723ef24d1 100644 --- a/oxenss/storage/database.cpp +++ b/oxenss/storage/database.cpp @@ -661,6 +661,21 @@ std::optional Database::retrieve_by_hash(const std::string& msg_hash) { return get_message(*impl, st); } +uint64_t Database::retrieve_newest_timestamp() { + auto impl = get_impl(false); + auto st = impl->prepared_st( + "SELECT COALESCE((SELECT timestamp FROM owned_messages ORDER BY timestamp DESC LIMIT " + "1), 0);"); + uint64_t result = 0; + while (st->executeStep()) { + int64_t time = get(st); + assert(time >= 0); + if (time >= 0) + result = static_cast(time); + } + return result; +} + StoreResult Database::store(const message& msg, std::chrono::system_clock::time_point* expiry) { auto impl = get_impl(true); diff --git a/oxenss/storage/database.hpp b/oxenss/storage/database.hpp index 18c2fa78f..eea46d2e9 100644 --- a/oxenss/storage/database.hpp +++ b/oxenss/storage/database.hpp @@ -129,6 +129,9 @@ class Database { // pubkey or namespace! std::optional retrieve_by_hash(const std::string& msg_hash); + // Get the latest owned message's timestamp. Returns 0 if there are no messages in the DB + uint64_t retrieve_newest_timestamp(); + // Removes expired messages from the database; the `Database` instance owner should call // this periodically. void clean_expired(); From c545a1c942055df6ddd53e0cb1b54e9726463241 Mon Sep 17 00:00:00 2001 From: doylet Date: Tue, 20 May 2025 14:46:25 +1000 Subject: [PATCH 05/50] Move swarm logs into derivation function, serialise SNDataReady w/ helper function Keep the SNDataReady serialisation code compartmentalised into one function that handles both a read and write of the data structure. 
--- oxenss/server/omq.cpp | 50 +++++++++++++++++++++++++++--- oxenss/server/omq.h | 14 +++++++++ oxenss/snode/service_node.cpp | 34 ++++++--------------- oxenss/snode/swarm.cpp | 57 +++++++++++++++++------------------ oxenss/snode/swarm.h | 6 ++-- unit_test/swarm.cpp | 6 ++-- 6 files changed, 104 insertions(+), 63 deletions(-) diff --git a/oxenss/server/omq.cpp b/oxenss/server/omq.cpp index 170bb42f8..6e3ded8e6 100644 --- a/oxenss/server/omq.cpp +++ b/oxenss/server/omq.cpp @@ -29,6 +29,49 @@ namespace oxenss::server { static auto logcat = log::Cat("server"); +BTSerialiseResult sn_data_ready_response_serialise( + SNDataReadyResponse& item, + BTSerialise serialise, + std::string_view serialized_data) { + + BTSerialiseResult result = {}; + + constexpr std::string_view STATUS_KEY = "s"; + constexpr std::string_view TIMESTAMP_KEY = "t"; + if (serialise == BTSerialise::Write) { + assert(serialized_data.empty()); + oxenc::bt_dict_producer dict; + dict.append(STATUS_KEY, static_cast(item.status)); + dict.append(TIMESTAMP_KEY, item.newest_timestamp); + result.write_payload = dict.view(); + result.success = true; + } else { + oxenc::bt_dict_consumer d{serialized_data}; + SNDataReadyResponse response = {}; + try { + uint32_t status_u32 = d.require(STATUS_KEY); + uint32_t last = static_cast(SNDataReadyStatus::Count); + if (status_u32 >= last) + result.read_error = "SN data ready status was OOB (received {})"_format(last); + else + response.status = static_cast(status_u32); + } catch (const std::exception& e) { + result.read_error = "SN data ready status was not a 4 byte unsigned integer"; + } + + try { + response.newest_timestamp = d.require(TIMESTAMP_KEY); + } catch (const std::exception& e) { + result.read_error = "SN data ready timestamp was not an 8 byte unsigned integer"; + } + + result.success = result.read_error.empty(); + if (result.success) + item = std::move(response); + } + return result; +} + std::string OMQ::peer_lookup(std::string_view pubkey_bin) const { 
log::trace(logcat, "[OMQ] Peer Lookup"); @@ -65,10 +108,9 @@ void OMQ::handle_sn_data_ready(oxenmq::Message& message) { response.newest_timestamp = service_node_->db->retrieve_newest_timestamp(); } - oxenc::bt_dict_producer dict; - dict.append("s", static_cast(response.status)); - dict.append("t", response.newest_timestamp); - message.send_reply(dict.view()); + BTSerialiseResult write_result = sn_data_ready_response_serialise(response, BTSerialise::Write, ""); + assert(write_result.success); + message.send_reply(write_result.write_payload); } void OMQ::handle_sn_data(oxenmq::Message& message) { diff --git a/oxenss/server/omq.h b/oxenss/server/omq.h index d4106e97b..08d05c07b 100644 --- a/oxenss/server/omq.h +++ b/oxenss/server/omq.h @@ -41,6 +41,20 @@ struct SNDataReadyResponse { uint64_t newest_timestamp; }; +enum class BTSerialise { + Read, + Write, +}; + +struct BTSerialiseResult { + bool success; + std::string write_payload; + std::string read_error; +}; + +BTSerialiseResult sn_data_ready_response_serialise( + server::SNDataReadyResponse& item, BTSerialise serialise, std::string_view serialized_data); + class OMQ : public MQBase { oxenmq::OxenMQ omq_; oxenmq::ConnectionID oxend_conn_; diff --git a/oxenss/snode/service_node.cpp b/oxenss/snode/service_node.cpp index 472368475..9c279b6bb 100644 --- a/oxenss/snode/service_node.cpp +++ b/oxenss/snode/service_node.cpp @@ -428,39 +428,23 @@ void ServiceNode::check_new_members() { c->pubkey_x25519.view(), "sn.data_ready", [this, pk](bool success, std::vector data) { - - std::string error; - server::SNDataReadyResponse result = {}; + server::SNDataReadyResponse response = {}; + server::BTSerialiseResult read_result = {}; if (data.empty()) { - error = "Empty reply"; + read_result.read_error = "Empty reply"; } else { - oxenc::bt_dict_consumer d{data[0]}; - try { - uint32_t status_u32 = d.require("s"sv); - uint32_t last = static_cast(server::SNDataReadyStatus::Count); - if (status_u32 >= last) - error = "SN data ready 
status was OOB (received {})"_format(last); - else - result.status = static_cast(status_u32); - } catch (const std::exception& e) { - error = "SN data ready status was not a 4 byte unsigned integer"; - } - - try { - result.newest_timestamp = d.require("t"); - } catch (const std::exception& e) { - error = "SN data ready timestamp was not an 8 byte unsigned integer"; - } + read_result = server::sn_data_ready_response_serialise( + response, server::BTSerialise::Read, data[0]); } - if (result.status == server::SNDataReadyStatus::Nil) { + if (!read_result.success) { log::info( logcat, "Failed to connect to remote SS {} to initiate new " "data transfer ({}: {}); will retry soon", pk, fmt::join(data, ", "), - error); + read_result.read_error); return; } log::debug( @@ -573,7 +557,7 @@ void ServiceNode::save_bulk(const std::vector& msgs) { } void ServiceNode::on_bootstrap_update(block_update&& bu) { - swarm_.update_swarms(std::move(bu.swarms), bu.contacts); + swarm_.update_swarms(bu.height, std::move(bu.swarms), bu.contacts); target_height_ = std::max(target_height_, bu.height); } @@ -615,7 +599,7 @@ void ServiceNode::on_snodes_update(block_update&& bu) { active_ = true; } - auto events = swarm_.update_swarms(std::move(bu.swarms), bu.contacts); + auto events = swarm_.update_swarms(bu.height, std::move(bu.swarms), bu.contacts); if (const SnodeStatus status = events.our_swarm_id != INVALID_SWARM_ID ? SnodeStatus::ACTIVE : bu.decommed ? 
SnodeStatus::DECOMMISSIONED diff --git a/oxenss/snode/swarm.cpp b/oxenss/snode/swarm.cpp index e3a604f88..82f972c6e 100644 --- a/oxenss/snode/swarm.cpp +++ b/oxenss/snode/swarm.cpp @@ -16,7 +16,7 @@ static auto logswarm = log::Cat("swarm"); Swarm::~Swarm() = default; -SwarmEvents Swarm::derive_swarm_events(const swarms_t& swarms) const { +SwarmEvents Swarm::derive_swarm_events(uint64_t height, const swarms_t& swarms) const { SwarmEvents events{}; events.our_swarm_id = INVALID_SWARM_ID; @@ -31,14 +31,27 @@ SwarmEvents Swarm::derive_swarm_events(const swarms_t& swarms) const { const auto& new_swarm = events.our_swarm_id; const auto& old_swarm = cur_swarm_id_; - if (new_swarm == INVALID_SWARM_ID) + if (new_swarm == INVALID_SWARM_ID) { + if (cur_swarm_id_ != INVALID_SWARM_ID) + log::warning( + logswarm, + "Leaving swarm {:#018x}: we are no longer an active Service Node", + cur_swarm_id_); + else + log::debug(logswarm, "Still not an active Service Node"); + // We are not in any swarm (or have been kicked out); nothing to do return events; + } - if (old_swarm == INVALID_SWARM_ID) + if (old_swarm == INVALID_SWARM_ID) { + log::info(logcat, "Joined swarm {:#18x} (blk {:#018x})", new_swarm, height); // We were previously not in a swarm, which means we just got assigned to one and so we have // nothing to do (other snodes will also see this and push messages to us). + events.new_swarm_members = events.our_swarm_members; + events.new_swarm_members.erase(our_pk); return events; + } if (old_swarm != new_swarm) { // Moved to a new swarm @@ -58,6 +71,13 @@ SwarmEvents Swarm::derive_swarm_events(const swarms_t& swarms) const { // |.................########|########!!!!!!!!!!!!!!!!!| events.dissolved = true; } + log::info( + logcat, + "Changed from {:018x} {}to {:018x} (blk {})", + old_swarm, + new_swarm, + height, + events.dissolved ? 
"(dissolved) " : ""); // If our old swarm is still alive then that means we got moved out of it, and so there's // nothing for us to do because the remaining swarm members will continue to administer the @@ -105,36 +125,15 @@ SwarmEvents Swarm::derive_swarm_events(const swarms_t& swarms) const { } SwarmEvents Swarm::update_swarms( - swarms_t&& swarms, const std::map& new_contacts) { + uint64_t height, + swarms_t&& swarms, + const std::map& new_contacts) { std::lock_guard lock{network.mut_}; - auto events = derive_swarm_events(swarms); - - if (events.our_swarm_id == INVALID_SWARM_ID) { - if (cur_swarm_id_ != INVALID_SWARM_ID) - log::warning( - logswarm, - "Leaving swarm {:#018x}: we are no longer an active Service Node", - cur_swarm_id_); - else - log::debug(logswarm, "Still not an active Service Node"); - } else { - - if (cur_swarm_id_ == INVALID_SWARM_ID) - log::info(logswarm, "SN now active, joining swarm {:#018x}", events.our_swarm_id); - else if (cur_swarm_id_ != events.our_swarm_id) - log::info( - logswarm, - "SN moving from swarm {:#018x} to swarm {:#018x}", - cur_swarm_id_, - events.our_swarm_id); - - // The following only make sense if we are active, i.e. still in a swarm - - if (events.dissolved) - log::info(logswarm, "Our swarm ({:#018x}) got DISSOLVED!", cur_swarm_id_); + auto events = derive_swarm_events(height, swarms); + if (events.our_swarm_id != INVALID_SWARM_ID) { for (const auto& pk : events.new_swarm_members) { log::info(logswarm, "New SN joining our swarm: {}", pk); pending_new_members_.emplace(pk, std::chrono::steady_clock::now()); diff --git a/oxenss/snode/swarm.h b/oxenss/snode/swarm.h index 91dd0fafa..64b46f2a5 100644 --- a/oxenss/snode/swarm.h +++ b/oxenss/snode/swarm.h @@ -45,7 +45,7 @@ class Swarm { pending_new_members_; // Extract relevant information from incoming swarm composition. 
- SwarmEvents derive_swarm_events(const swarms_t& swarms) const; + SwarmEvents derive_swarm_events(uint64_t height, const swarms_t& swarms) const; public: Network& network; @@ -59,7 +59,9 @@ class Swarm { /// Update swarm state; this takes care of updating both this swarm itself, and propagates the /// general network swarm changes to the Network object (including contacts) as well. SwarmEvents update_swarms( - swarms_t&& swarms, const std::map& new_contacts); + uint64_t height, + swarms_t&& swarms, + const std::map& new_contacts); bool is_pubkey_for_us(const user_pubkey& pk) const; diff --git a/unit_test/swarm.cpp b/unit_test/swarm.cpp index b0cf30f22..0521a084a 100644 --- a/unit_test/swarm.cpp +++ b/unit_test/swarm.cpp @@ -53,7 +53,7 @@ TEST_CASE("service nodes - pubkey to swarm id") { swarms_t swarms; for (oxenss::snode::swarm_id_t s : {100, 200, 300, 399, 498, 596, 694}) swarms[s]; - swarm.update_swarms(swarms_t{swarms}, {}); + swarm.update_swarms(0, swarms_t{swarms}, {}); oxenss::user_pubkey pk; @@ -147,7 +147,7 @@ TEST_CASE("service nodes - pubkey to swarm id") { // *sigh*). oxenss::snode::swarm_id_t wrapped_swarm = (uint64_t)-20; swarms[wrapped_swarm]; - swarm.update_swarms(swarms_t{swarms}, {}); + swarm.update_swarms(0, swarms_t{swarms}, {}); REQUIRE(pk.load("050000000000000000000000000000000000000000000000000000000000000027")); CHECK(network.get_swarm_id_for(pk).value() == swarms.rbegin()->first); REQUIRE(pk.load("050000000000000000000000000000000000000000000000000000000000000028")); @@ -160,7 +160,7 @@ TEST_CASE("service nodes - pubkey to swarm id") { // as max-u64 away, rather than 1 away), and so the id always maps to the highest swarm (even // though 0xfff...fe maps to the lowest swarm; the first check here, then, would fail. 
swarms[0]; - swarm.update_swarms(swarms_t{swarms}, {}); + swarm.update_swarms(0, swarms_t{swarms}, {}); REQUIRE(pk.load("05000000000000000000000000000000000000000000000000ffffffffffffffff")); CHECK(network.get_swarm_id_for(pk).value() == 0); REQUIRE(pk.load("05000000000000000000000000000000000000000000000000fffffffffffffffe")); From 89ceced1a8a173a0eb21a5ca6bd367e58c6e8854 Mon Sep 17 00:00:00 2001 From: doylet Date: Mon, 2 Jun 2025 11:15:12 +1000 Subject: [PATCH 06/50] Timestamps in the DB are stored in millisecond precision --- oxenss/rpc/request_handler.cpp | 8 ++++---- oxenss/server/omq.cpp | 5 +++-- oxenss/server/omq.h | 2 +- oxenss/snode/service_node.cpp | 4 ++-- oxenss/snode/swarm.cpp | 23 +++++++++++++---------- oxenss/snode/swarm.h | 23 +++++++++++++++-------- oxenss/storage/database.cpp | 6 +++--- oxenss/storage/database.hpp | 2 +- 8 files changed, 42 insertions(+), 31 deletions(-) diff --git a/oxenss/rpc/request_handler.cpp b/oxenss/rpc/request_handler.cpp index 68559c470..9a008abce 100644 --- a/oxenss/rpc/request_handler.cpp +++ b/oxenss/rpc/request_handler.cpp @@ -434,13 +434,13 @@ static void distribute_command( res->pending += peers.size(); for (auto& peer : peers) { - auto ct = sn.contacts().find(peer); + auto ct = sn.contacts().find(peer.first); if (!ct || !*ct) { log::debug( logcat, "Not distributing {} to swarm peer {}: SN {}", cmd, - peer, + peer.first, ct ? 
"is non-contactable" : "not found"); res->pending--; continue; @@ -454,7 +454,7 @@ static void distribute_command( log::warning( logcat, "Response timeout from {} for forwarded command {}", - peer, + peer.first, cmd); bool good_result = success && parts.size() == 1; if (good_result) { @@ -465,7 +465,7 @@ static void distribute_command( logcat, "Received unparsable response to {} from {}: {}", cmd, - peer, + peer.first, e.what()); good_result = false; } diff --git a/oxenss/server/omq.cpp b/oxenss/server/omq.cpp index 6e3ded8e6..37a84ff5c 100644 --- a/oxenss/server/omq.cpp +++ b/oxenss/server/omq.cpp @@ -42,7 +42,7 @@ BTSerialiseResult sn_data_ready_response_serialise( assert(serialized_data.empty()); oxenc::bt_dict_producer dict; dict.append(STATUS_KEY, static_cast(item.status)); - dict.append(TIMESTAMP_KEY, item.newest_timestamp); + dict.append(TIMESTAMP_KEY, item.newest_timestamp.count()); result.write_payload = dict.view(); result.success = true; } else { @@ -60,7 +60,8 @@ BTSerialiseResult sn_data_ready_response_serialise( } try { - response.newest_timestamp = d.require(TIMESTAMP_KEY); + uint64_t newest_timestamp = d.require(TIMESTAMP_KEY); + response.newest_timestamp = std::chrono::milliseconds(newest_timestamp); } catch (const std::exception& e) { result.read_error = "SN data ready timestamp was not an 8 byte unsigned integer"; } diff --git a/oxenss/server/omq.h b/oxenss/server/omq.h index 08d05c07b..c76b64c2a 100644 --- a/oxenss/server/omq.h +++ b/oxenss/server/omq.h @@ -38,7 +38,7 @@ enum class SNDataReadyStatus { struct SNDataReadyResponse { SNDataReadyStatus status; - uint64_t newest_timestamp; + std::chrono::milliseconds newest_timestamp; }; enum class BTSerialise { diff --git a/oxenss/snode/service_node.cpp b/oxenss/snode/service_node.cpp index 9c279b6bb..89e58bbdc 100644 --- a/oxenss/snode/service_node.cpp +++ b/oxenss/snode/service_node.cpp @@ -419,7 +419,7 @@ void ServiceNode::check_new_members() { pk, fmt::join(NEW_SWARM_MEMBER_HANDSHAKE_VERSION, 
"."), fmt::join(c->version, ".")); - swarm_.set_member_ready(pk); + swarm_.set_member_ready(pk, std::nullopt); continue; } @@ -451,7 +451,7 @@ void ServiceNode::check_new_members() { logcat, "Successful contact made with swarm member {}, queuing a message push", pk); - swarm_.set_member_ready(pk); + swarm_.set_member_ready(pk, response.newest_timestamp); }); } diff --git a/oxenss/snode/swarm.cpp b/oxenss/snode/swarm.cpp index 82f972c6e..1e489796b 100644 --- a/oxenss/snode/swarm.cpp +++ b/oxenss/snode/swarm.cpp @@ -89,12 +89,9 @@ SwarmEvents Swarm::derive_swarm_events(uint64_t height, const swarms_t& swarms) /// --- WE are still in the same swarm if we reach here --- /// See if anyone joined our swarm: if so, we need to push messages to them: - std::set_difference( - events.our_swarm_members.begin(), - events.our_swarm_members.end(), - members_.begin(), - members_.end(), - std::inserter(events.new_swarm_members, events.new_swarm_members.end())); + for (auto it : events.our_swarm_members) + if (members_.count(it) == 0) + events.new_swarm_members.insert(it); events.new_swarm_members.erase(our_pk); // See if there are any new swarms, because if there are, we might need to push messages to them @@ -142,7 +139,9 @@ SwarmEvents Swarm::update_swarms( for (auto swarm : events.new_swarms) log::info(logswarm, "New network swarm: {}", swarm); - members_ = events.our_swarm_members; + members_.clear(); + for (auto it : events.our_swarm_members) + members_.try_emplace(it); } cur_swarm_id_ = events.our_swarm_id; @@ -157,13 +156,13 @@ bool Swarm::is_pubkey_for_us(const user_pubkey& pk) const { return maybe_swarm && cur_swarm_id_ == *maybe_swarm; } -std::set Swarm::members() const { +std::map Swarm::members() const { std::shared_lock lock{network.mut_}; return members_; } // Returns a copy of all the other members of this swarm, not including this node. 
-std::set Swarm::peers() const { +std::map Swarm::peers() const { auto peers = members(); peers.erase(our_pk); return peers; @@ -237,10 +236,14 @@ std::set Swarm::extract_ready_members() { return result; } -void Swarm::set_member_ready(const crypto::legacy_pubkey& pk) { +void Swarm::set_member_ready( + const crypto::legacy_pubkey& pk, std::optional last_synced_ts) { std::lock_guard lock{network.mut_}; if (auto it = pending_new_members_.find(pk); it != pending_new_members_.end()) it->second = std::nullopt; + if (last_synced_ts) + if (auto it = members_.find(pk); it != members_.end()) + it->second.newest_msg_timestamp = *last_synced_ts; } } // namespace oxenss::snode diff --git a/oxenss/snode/swarm.h b/oxenss/snode/swarm.h index 64b46f2a5..37deeb62d 100644 --- a/oxenss/snode/swarm.h +++ b/oxenss/snode/swarm.h @@ -35,8 +35,6 @@ constexpr auto NEW_SWARM_MEMBER_RETRY = 30s; class Swarm { swarm_id_t cur_swarm_id_ = INVALID_SWARM_ID; - std::set members_; // includes `our_pk`, when we are in a swarm. - // Pubkeys of new members into our swarm who we haven't yet established communications with; // once we do, we push all our swarm's messages to them. The value is the earliest timestamp at // which we should next try contacting them, or nullopt if we have confirmed contact and can now @@ -48,14 +46,21 @@ class Swarm { SwarmEvents derive_swarm_events(uint64_t height, const swarms_t& swarms) const; public: - Network& network; - const crypto::legacy_pubkey our_pk; - Swarm(Network& network, const crypto::legacy_pubkey& our_pk) : network{network}, our_pk{our_pk} {} ~Swarm(); + struct MemberState { + std::chrono::milliseconds newest_msg_timestamp; + }; + + std::map members_; // includes `our_pk`, when we are in a swarm. + + Network& network; + + const crypto::legacy_pubkey our_pk; + /// Update swarm state; this takes care of updating both this swarm itself, and propagates the /// general network swarm changes to the Network object (including contacts) as well. 
SwarmEvents update_swarms( @@ -66,10 +71,10 @@ class Swarm { bool is_pubkey_for_us(const user_pubkey& pk) const; // Returns a copy of all the members of this swarm, including this node. - std::set members() const; + std::map members() const; // Returns a copy of all the other members of this swarm, not including this node. - std::set peers() const; + std::map peers() const; // Returns true if the given pubkey is recognized as a member of this swarm. bool is_member(const crypto::legacy_pubkey& pk) const; @@ -85,7 +90,9 @@ class Swarm { // Marks a pending member as ready, so that it is returned by the next call to // `extract_ready_members()`, and is no longer returned by `extract_pending_members()`. - void set_member_ready(const crypto::legacy_pubkey& pk); + void set_member_ready( + const crypto::legacy_pubkey& pk, + std::optional last_synced_ts); // Extracts any "ready" members (that is, those that were pending and then marked ready with // `set_member_ready`), returning them and removing them from the pending members list. 
diff --git a/oxenss/storage/database.cpp b/oxenss/storage/database.cpp index 723ef24d1..9ad18917d 100644 --- a/oxenss/storage/database.cpp +++ b/oxenss/storage/database.cpp @@ -661,17 +661,17 @@ std::optional Database::retrieve_by_hash(const std::string& msg_hash) { return get_message(*impl, st); } -uint64_t Database::retrieve_newest_timestamp() { +std::chrono::milliseconds Database::retrieve_newest_timestamp() { auto impl = get_impl(false); auto st = impl->prepared_st( "SELECT COALESCE((SELECT timestamp FROM owned_messages ORDER BY timestamp DESC LIMIT " "1), 0);"); - uint64_t result = 0; + std::chrono::milliseconds result = {}; while (st->executeStep()) { int64_t time = get(st); assert(time >= 0); if (time >= 0) - result = static_cast(time); + result = std::chrono::milliseconds(static_cast(time)); } return result; } diff --git a/oxenss/storage/database.hpp b/oxenss/storage/database.hpp index eea46d2e9..72b1c6e90 100644 --- a/oxenss/storage/database.hpp +++ b/oxenss/storage/database.hpp @@ -130,7 +130,7 @@ class Database { std::optional retrieve_by_hash(const std::string& msg_hash); // Get the latest owned message's timestamp. Returns 0 if there are no messages in the DB - uint64_t retrieve_newest_timestamp(); + std::chrono::milliseconds retrieve_newest_timestamp(); // Removes expired messages from the database; the `Database` instance owner should call // this periodically. From 652232928fee8adbc0d42f18c2484bfb1c3670ee Mon Sep 17 00:00:00 2001 From: doylet Date: Mon, 2 Jun 2025 14:23:06 +1000 Subject: [PATCH 07/50] Represent pending/ready members w/ an enum This is preliminary work to prep for persisting the swarm member's state to disk to allow resuming from the last known state. 
Currently when a storage server starts up, it assumes that it's joining a new swarm/new members are joining its swarm and does a full message dump to those members instead of being able to know if a swarm member is new or we already knew about it and then, choose a synchronisation method that is more suited for that particular scenario. --- oxenss/snode/service_node.cpp | 8 ++-- oxenss/snode/swarm.cpp | 69 ++++++++++++++++++----------------- oxenss/snode/swarm.h | 35 +++++++++++------- 3 files changed, 61 insertions(+), 51 deletions(-) diff --git a/oxenss/snode/service_node.cpp b/oxenss/snode/service_node.cpp index 89e58bbdc..a6cc4c293 100644 --- a/oxenss/snode/service_node.cpp +++ b/oxenss/snode/service_node.cpp @@ -399,7 +399,7 @@ void ServiceNode::record_retrieve_request() { } void ServiceNode::check_new_members() { - for (const auto& pk : swarm_.extract_pending_members()) { + for (const auto& pk : swarm_.extract_contact_details_pending_members()) { auto c = network_.contacts.find(pk); if (!c || !*c) { // We don't have contact info, so don't do anything right now and this will get @@ -419,7 +419,7 @@ void ServiceNode::check_new_members() { pk, fmt::join(NEW_SWARM_MEMBER_HANDSHAKE_VERSION, "."), fmt::join(c->version, ".")); - swarm_.set_member_ready(pk, std::nullopt); + swarm_.set_member_contact_details_ready(pk, std::nullopt); continue; } @@ -451,11 +451,11 @@ void ServiceNode::check_new_members() { logcat, "Successful contact made with swarm member {}, queuing a message push", pk); - swarm_.set_member_ready(pk, response.newest_timestamp); + swarm_.set_member_contact_details_ready(pk, response.newest_timestamp); }); } - if (auto send_now = swarm_.extract_ready_members(); !send_now.empty()) { + if (auto send_now = swarm_.extract_contact_details_ready_members(); !send_now.empty()) { auto msgs = db->retrieve_all(); log::debug( logcat, diff --git a/oxenss/snode/swarm.cpp b/oxenss/snode/swarm.cpp index 1e489796b..abde22971 100644 --- a/oxenss/snode/swarm.cpp +++ 
b/oxenss/snode/swarm.cpp @@ -131,15 +131,21 @@ SwarmEvents Swarm::update_swarms( auto events = derive_swarm_events(height, swarms); if (events.our_swarm_id != INVALID_SWARM_ID) { - for (const auto& pk : events.new_swarm_members) { + for (const auto& pk : events.new_swarm_members) log::info(logswarm, "New SN joining our swarm: {}", pk); - pending_new_members_.emplace(pk, std::chrono::steady_clock::now()); - } for (auto swarm : events.new_swarms) log::info(logswarm, "New network swarm: {}", swarm); - members_.clear(); + // Remove members that are no longer in the swarm from our runtime state + for (auto it = members_.begin(); it != members_.end(); ) { + if (events.our_swarm_members.find(it->first) == events.our_swarm_members.end()) + it = members_.erase(it); + else + it++; + } + + // Add members from the swarm that are missing from our runtime state for (auto it : events.our_swarm_members) members_.try_emplace(it); } @@ -192,58 +198,53 @@ size_t Swarm::size() const { return members_.size(); } -std::set Swarm::extract_pending_members() { +std::set Swarm::extract_contact_details_pending_members() { std::lock_guard lock{network.mut_}; std::set result; auto now = std::chrono::steady_clock::now(); - for (auto it = pending_new_members_.begin(); it != pending_new_members_.end();) { - auto& [pk, when] = *it; - if (!members_.count(pk)) { - // No longer in our swarm - it = pending_new_members_.erase(it); + for (auto it = members_.begin(); it != members_.end(); it++) { + MemberState& state = it->second; + if (state.status != MemberStatus::ContactDetailsPending) continue; - } - - if (when && *when <= now) { - *when = now + NEW_SWARM_MEMBER_RETRY; + std::chrono::steady_clock::time_point& next_retry = + it->second.check_contact_info_next_retry; + if (now >= next_retry) { + next_retry = now + NEW_SWARM_MEMBER_RETRY; + const crypto::legacy_pubkey& pk = it->first; result.insert(pk); } - ++it; } return result; } -std::set Swarm::extract_ready_members() { +std::set 
Swarm::extract_contact_details_ready_members() { std::lock_guard lock{network.mut_}; std::set result; - for (auto it = pending_new_members_.begin(); it != pending_new_members_.end();) { - auto& [pk, when] = *it; - if (!members_.count(pk)) { - // No longer in our swarm - it = pending_new_members_.erase(it); - } else if (!when) { - // Found one that is marked ready, so steal it: - result.insert(pk); - it = pending_new_members_.erase(it); - } else { - ++it; - } + for (auto& it : members_) { + if (it.second.status != MemberStatus::ContactDetailsReady) + continue; + const crypto::legacy_pubkey& pk = it.first; + it.second.status = MemberStatus::Ready; + result.insert(pk); } return result; } -void Swarm::set_member_ready( +void Swarm::set_member_contact_details_ready( const crypto::legacy_pubkey& pk, std::optional last_synced_ts) { std::lock_guard lock{network.mut_}; - if (auto it = pending_new_members_.find(pk); it != pending_new_members_.end()) - it->second = std::nullopt; - if (last_synced_ts) - if (auto it = members_.find(pk); it != members_.end()) + + auto it = members_.find(pk); + assert(it != members_.end()); + + if (it != members_.end()) { + it->second.status = MemberStatus::ContactDetailsReady; + if (last_synced_ts) it->second.newest_msg_timestamp = *last_synced_ts; + } } - } // namespace oxenss::snode diff --git a/oxenss/snode/swarm.h b/oxenss/snode/swarm.h index 37deeb62d..59fd4885e 100644 --- a/oxenss/snode/swarm.h +++ b/oxenss/snode/swarm.h @@ -2,7 +2,6 @@ #include #include -#include #include "network.h" #include "oxenss/crypto/keys.h" @@ -35,13 +34,6 @@ constexpr auto NEW_SWARM_MEMBER_RETRY = 30s; class Swarm { swarm_id_t cur_swarm_id_ = INVALID_SWARM_ID; - // Pubkeys of new members into our swarm who we haven't yet established communications with; - // once we do, we push all our swarm's messages to them. 
The value is the earliest timestamp at - // which we should next try contacting them, or nullopt if we have confirmed contact and can now - // send the data. - std::unordered_map> - pending_new_members_; - // Extract relevant information from incoming swarm composition. SwarmEvents derive_swarm_events(uint64_t height, const swarms_t& swarms) const; @@ -51,8 +43,23 @@ class Swarm { ~Swarm(); + enum struct MemberStatus { + // Pubkeys of new members into our swarm who we haven't yet established communications with; + // once we do, we push all our swarm's messages to them. + ContactDetailsPending, + ContactDetailsReady, + Ready, + }; + struct MemberState { + MemberStatus status; std::chrono::milliseconds newest_msg_timestamp; + + // The earliest timestamp at which the swarm will check if they have received contact + // information for this member yet and can send them data. Only utilised when status is + // 'ContactDetailsPending' before transitioning to 'ContactDetailsReady' when the contact + // detail has been confirmed. + std::chrono::steady_clock::time_point check_contact_info_next_retry; }; std::map members_; // includes `our_pk`, when we are in a swarm. @@ -86,17 +93,19 @@ class Swarm { // Resets the timer and returns the pubkeys of any new swarm members that are due to be // contacted to push swarm messages to. - std::set extract_pending_members(); + std::set extract_contact_details_pending_members(); // Marks a pending member as ready, so that it is returned by the next call to - // `extract_ready_members()`, and is no longer returned by `extract_pending_members()`. - void set_member_ready( + // `extract_contact_details_ready_members()`, and is no longer returned by + // `extract_contract_details_pending_member()`. 
+ void set_member_contact_details_ready( const crypto::legacy_pubkey& pk, std::optional last_synced_ts); // Extracts any "ready" members (that is, those that were pending and then marked ready with - // `set_member_ready`), returning them and removing them from the pending members list. - std::set extract_ready_members(); + // `set_member_contact_details_ready`), returning them and transitioning them from the pending + // state. + std::set extract_contact_details_ready_members(); swarm_id_t our_swarm_id() const { std::shared_lock lock{network.mut_}; From 0b4532e6eaf384e7f86ad3103c173f37316c2eba Mon Sep 17 00:00:00 2001 From: doylet Date: Mon, 2 Jun 2025 16:09:58 +1000 Subject: [PATCH 08/50] Remove unused std::optional header --- oxenss/crypto/keys.h | 1 - oxenss/snode/reachability_testing.h | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/oxenss/crypto/keys.h b/oxenss/crypto/keys.h index bd0c073f0..d383eb8cd 100644 --- a/oxenss/crypto/keys.h +++ b/oxenss/crypto/keys.h @@ -3,7 +3,6 @@ #include #include #include -#include #include #include diff --git a/oxenss/snode/reachability_testing.h b/oxenss/snode/reachability_testing.h index 9a24165d5..eb958f8bb 100644 --- a/oxenss/snode/reachability_testing.h +++ b/oxenss/snode/reachability_testing.h @@ -7,6 +7,7 @@ #include #include #include +#include namespace oxenss::snode { From 8460381f9e0c58a03da3cf7089207de7bc31db1f Mon Sep 17 00:00:00 2001 From: doylet Date: Wed, 4 Jun 2025 17:28:49 +1000 Subject: [PATCH 09/50] Store network swarm/swarm state to disk to resume from Restoring the swarm state means that if the node is an active service node, it'll remember which swarm it was in so when the storage server gets restarted, it can correctly detect if a new SN is joining their swarm instead of assuming all the nodes have newly joined their swarm and consequently dumping their entire SQL db to them.
--- oxenss/common/serialize.h | 28 ++++++ oxenss/server/omq.h | 12 +-- oxenss/snode/network.h | 2 + oxenss/snode/service_node.cpp | 169 +++++++++++++++++++++++++++++++++- oxenss/snode/service_node.h | 12 +++ oxenss/snode/swarm.cpp | 2 +- oxenss/snode/swarm.h | 4 +- oxenss/storage/database.cpp | 28 ++++++ oxenss/storage/database.hpp | 3 + 9 files changed, 243 insertions(+), 17 deletions(-) create mode 100644 oxenss/common/serialize.h diff --git a/oxenss/common/serialize.h b/oxenss/common/serialize.h new file mode 100644 index 000000000..a4720a0dc --- /dev/null +++ b/oxenss/common/serialize.h @@ -0,0 +1,28 @@ +#pragma once + +#include +#include +#include + +namespace oxenss { +enum class BTSerialise { + Read, + Write, +}; + +struct BTSerialiseResult { + bool success; + std::string write_payload; + std::string read_error; +}; + +constexpr uint64_t FNV1A64_SEED = 14695981039346656037ULL; + +inline uint64_t fnv1a64_hasher(std::string_view bytes, uint64_t hash) { + for (size_t i = 0; i < bytes.size(); i++) + hash = (bytes[i] ^ hash) * 1099511628211 /*FNV Prime*/; + return hash; +} + +}; // namespace oxenss + diff --git a/oxenss/server/omq.h b/oxenss/server/omq.h index c76b64c2a..03f4e1512 100644 --- a/oxenss/server/omq.h +++ b/oxenss/server/omq.h @@ -10,6 +10,7 @@ #include #include #include +#include namespace oxenss { @@ -41,17 +42,6 @@ struct SNDataReadyResponse { std::chrono::milliseconds newest_timestamp; }; -enum class BTSerialise { - Read, - Write, -}; - -struct BTSerialiseResult { - bool success; - std::string write_payload; - std::string read_error; -}; - BTSerialiseResult sn_data_ready_response_serialise( server::SNDataReadyResponse& item, BTSerialise serialise, std::string_view serialized_data); diff --git a/oxenss/snode/network.h b/oxenss/snode/network.h index 4d0cfe18a..ddb2743c2 100644 --- a/oxenss/snode/network.h +++ b/oxenss/snode/network.h @@ -37,6 +37,8 @@ class Network { friend class Swarm; + friend class ServiceNode; + swarms_t::const_iterator 
_find_swarm_for(const user_pubkey& pk) const; // Cached value of the all_nodes_blob() return value. The cache is cleared whenever swarms or diff --git a/oxenss/snode/service_node.cpp b/oxenss/snode/service_node.cpp index a6cc4c293..638c665fa 100644 --- a/oxenss/snode/service_node.cpp +++ b/oxenss/snode/service_node.cpp @@ -45,6 +45,110 @@ constexpr auto OXEND_PING_INTERVAL = 30s; constexpr auto NEW_SWARM_MEMBER_INTERVAL = 10s; +SerialiseResult ServiceNode::serialize(BTSerialise serialise, std::string_view serialized_data) const +{ + SerialiseResult result = {}; + + constexpr std::string_view VERSION_KEY = "@"; + constexpr std::string_view NETWORK_SWARMS_KEY = "network.swarms"; + constexpr std::string_view SWARM_CUR_SWARM_ID = "swarm.cur_swarm_id"; + constexpr std::string_view SWARM_MEMBERS_KEY = "swarm.members"; + + uint32_t version = 0; + if (serialise == BTSerialise::Write) { + oxenc::bt_dict_producer d; + d.append(VERSION_KEY, version); + + { + oxenc::bt_list_producer network_swarm_list = d.append_list(NETWORK_SWARMS_KEY); + for (auto it : network_.swarms_) { + auto swarm = network_swarm_list.append_list(); + swarm.append(it.first); // swarm_id_t + + { // Append list of pubkeys for this swarm + for (const crypto::legacy_pubkey& pk : it.second) + swarm.append(pk.view()); + } + } + } + + d.append(SWARM_CUR_SWARM_ID, swarm_.cur_swarm_id_); + + { // Append list of _our_ swarm members + oxenc::bt_list_producer swarm_member_list = d.append_list(SWARM_MEMBERS_KEY); + for (auto it : swarm_.members_) + swarm_member_list.append(it.first); // pk + } + + result.bt_serialise.success = true; + result.bt_serialise.write_payload = d.view(); + } else { + if (serialized_data.size()) { + oxenc::bt_dict_consumer d{serialized_data}; + try { + version = d.require(VERSION_KEY); + } catch (const std::exception& e) { + result.bt_serialise.read_error = "Failed to parse version: {}"_format(e.what()); + } + + if (result.bt_serialise.read_error.empty()) { + try { // Network swarms + auto 
[key, network_swarm_list] = d.next_list_consumer(); + assert(key == NETWORK_SWARMS_KEY); + + while (!network_swarm_list.is_finished()) { + auto swarm = network_swarm_list.consume_list_consumer(); + uint64_t swarm_id = swarm.consume(); + + std::set& keys = result.network_swarms[swarm_id]; + while (!swarm.is_finished()) { + auto bytes = swarm.consume(); + keys.insert(keys.end(), crypto::legacy_pubkey::from_bytes(bytes)); + } + } + + } catch (const std::exception& e) { + result.bt_serialise.read_error = + "Failed to parse network swarms: {}"_format(e.what()); + } + } + + if (result.bt_serialise.read_error.empty()) { + try { + result.swarm_cur_swarm_id = d.require(SWARM_CUR_SWARM_ID); + } catch (const std::exception& e) { + result.bt_serialise.read_error = + "Failed to swarm's current swarm ID: {}"_format(e.what()); + } + } + + if (result.bt_serialise.read_error.empty()) { + try { // Swarm members + auto [key, list] = d.next_list_consumer(); + assert(key == SWARM_MEMBERS_KEY); + + while (!list.is_finished()) { + auto bytes = list.consume(); + result.swarm_members[crypto::legacy_pubkey::from_bytes(bytes)]; + } + } catch (const std::exception& e) { + result.bt_serialise.read_error = + "Failed to parse swarm members: {}"_format(e.what()); + } + } + } + result.bt_serialise.success = result.bt_serialise.read_error.empty(); + } + + return result; +} + +static uint64_t fnv1a64_hasher(std::string_view bytes, uint64_t hash) { + for (size_t i = 0; i < bytes.size(); i++) + hash = (bytes[i] ^ hash) * 1099511628211 /*FNV Prime*/; + return hash; +} + ServiceNode::ServiceNode( const crypto::legacy_keypair& keys, const contact& contact, @@ -62,7 +166,26 @@ ServiceNode::ServiceNode( db{std::make_unique(dblocation)} { mq_servers_.push_back(&omq_server); - log::info(logcat, "Requesting initial swarm state"); + std::string blob_data = db->runtime_state_sn_blob(BTSerialise::Read, ""); + SerialiseResult serialise_result = serialize(BTSerialise::Read, blob_data); + if 
(serialise_result.bt_serialise.success) { + last_serialize_hash = fnv1a64_hasher(blob_data, FNV1A64_SEED); + swarm_.members_ = std::move(serialise_result.swarm_members); + network_.swarms_ = std::move(serialise_result.network_swarms); + swarm_.cur_swarm_id_ = serialise_result.swarm_cur_swarm_id; + } else { + blob_data.clear(); + } + + log::info( + logcat, + "Loaded {} ({}) swarms from disk (#{:x}; in swarm {:x} w/ {} members). Requesting " + "initial swarm state", + network_.swarms_.size(), + util::get_human_readable_bytes(blob_data.size()), + last_serialize_hash, + swarm_.cur_swarm_id_, + swarm_.members_.size()); omq_server->add_timer( [this] { @@ -88,6 +211,7 @@ ServiceNode::ServiceNode( syncing_ = false; }, 1h); + } void ServiceNode::on_oxend_connected() { @@ -429,12 +553,12 @@ void ServiceNode::check_new_members() { "sn.data_ready", [this, pk](bool success, std::vector data) { server::SNDataReadyResponse response = {}; - server::BTSerialiseResult read_result = {}; + BTSerialiseResult read_result = {}; if (data.empty()) { read_result.read_error = "Empty reply"; } else { read_result = server::sn_data_ready_response_serialise( - response, server::BTSerialise::Read, data[0]); + response, BTSerialise::Read, data[0]); } if (!read_result.success) { @@ -559,6 +683,25 @@ void ServiceNode::save_bulk(const std::vector& msgs) { void ServiceNode::on_bootstrap_update(block_update&& bu) { swarm_.update_swarms(bu.height, std::move(bu.swarms), bu.contacts); target_height_ = std::max(target_height_, bu.height); + + snode::SerialiseResult serialise_result = serialize(BTSerialise::Write, ""); + if (serialise_result.bt_serialise.success) { + uint64_t hash = fnv1a64_hasher(serialise_result.bt_serialise.write_payload, FNV1A64_SEED); + if (last_serialize_hash != hash) { + log::info( + logcat, + "Swarm state dirtied at blk {}; #{:x} => #{:x}, saving {} to DB", + block_height_, + last_serialize_hash, + hash, + util::get_human_readable_bytes( + 
serialise_result.bt_serialise.write_payload.size())); + + last_serialize_hash = hash; + db->runtime_state_sn_blob( + BTSerialise::Write, serialise_result.bt_serialise.write_payload); + } + } } void ServiceNode::on_snodes_update(block_update&& bu) { @@ -601,6 +744,26 @@ void ServiceNode::on_snodes_update(block_update&& bu) { auto events = swarm_.update_swarms(bu.height, std::move(bu.swarms), bu.contacts); + // Serialise state to blob and store into DB if dirtied + snode::SerialiseResult serialise_result = serialize(BTSerialise::Write, ""); + if (serialise_result.bt_serialise.success) { + uint64_t hash = fnv1a64_hasher(serialise_result.bt_serialise.write_payload, FNV1A64_SEED); + if (last_serialize_hash != hash) { + log::info( + logcat, + "Swarm state dirtied at blk {}; #{:x} => #{:x}, saving {} to DB", + block_height_, + last_serialize_hash, + hash, + util::get_human_readable_bytes( + serialise_result.bt_serialise.write_payload.size())); + + last_serialize_hash = hash; + db->runtime_state_sn_blob( + BTSerialise::Write, serialise_result.bt_serialise.write_payload); + } + } + if (const SnodeStatus status = events.our_swarm_id != INVALID_SWARM_ID ? SnodeStatus::ACTIVE : bu.decommed ? 
SnodeStatus::DECOMMISSIONED : SnodeStatus::UNSTAKED; diff --git a/oxenss/snode/service_node.h b/oxenss/snode/service_node.h index f8e86aed1..cf8a8455e 100644 --- a/oxenss/snode/service_node.h +++ b/oxenss/snode/service_node.h @@ -12,6 +12,7 @@ #include #include +#include #include #include "network.h" #include "swarm.h" @@ -73,6 +74,13 @@ constexpr std::string_view to_string(SnodeStatus status) { return "Unknown"sv; } +struct SerialiseResult { + BTSerialiseResult bt_serialise; + std::map swarm_members; + swarms_t network_swarms; + swarm_id_t swarm_cur_swarm_id; +}; + /// All service node logic that is not network-specific class ServiceNode { bool syncing_ = true; @@ -173,6 +181,10 @@ class ServiceNode { bool force_start, bool skip_bootstrap); + SerialiseResult serialize(BTSerialise serialise, std::string_view serialized_data) const; + + uint64_t last_serialize_hash = 0; + std::unique_ptr db; const Network& network() { return network_; } diff --git a/oxenss/snode/swarm.cpp b/oxenss/snode/swarm.cpp index abde22971..85268c864 100644 --- a/oxenss/snode/swarm.cpp +++ b/oxenss/snode/swarm.cpp @@ -45,7 +45,7 @@ SwarmEvents Swarm::derive_swarm_events(uint64_t height, const swarms_t& swarms) } if (old_swarm == INVALID_SWARM_ID) { - log::info(logcat, "Joined swarm {:#18x} (blk {:#018x})", new_swarm, height); + log::info(logcat, "Joined swarm {:#18x} (blk {})", new_swarm, height); // We were previously not in a swarm, which means we just got assigned to one and so we have // nothing to do (other snodes will also see this and push messages to us). events.new_swarm_members = events.our_swarm_members; diff --git a/oxenss/snode/swarm.h b/oxenss/snode/swarm.h index 59fd4885e..9bcceb4f8 100644 --- a/oxenss/snode/swarm.h +++ b/oxenss/snode/swarm.h @@ -32,8 +32,6 @@ struct SwarmEvents { constexpr auto NEW_SWARM_MEMBER_RETRY = 30s; class Swarm { - swarm_id_t cur_swarm_id_ = INVALID_SWARM_ID; - // Extract relevant information from incoming swarm composition. 
SwarmEvents derive_swarm_events(uint64_t height, const swarms_t& swarms) const; @@ -62,6 +60,8 @@ class Swarm { std::chrono::steady_clock::time_point check_contact_info_next_retry; }; + swarm_id_t cur_swarm_id_ = INVALID_SWARM_ID; + std::map members_; // includes `our_pk`, when we are in a swarm. Network& network; diff --git a/oxenss/storage/database.cpp b/oxenss/storage/database.cpp index 9ad18917d..6613e4216 100644 --- a/oxenss/storage/database.cpp +++ b/oxenss/storage/database.cpp @@ -283,6 +283,8 @@ class DatabaseImpl { } void initialize_database() { + [[maybe_unused]] int32_t db_version = db.execAndGet("PRAGMA user_version").getInt(); + if (!db.tableExists("owners")) { create_schema(); } @@ -332,6 +334,15 @@ CREATE TRIGGER IF NOT EXISTS revoked_autoclean )"); } + if (!db.tableExists("runtime_state")) { + log::info(logcat, "Upgrading database schema: adding runtime_state"); + db.exec(R"( +CREATE TABLE runtime_state ( + sn_blob BLOB +); + )"); + } + views_triggers_indices(); log::info(logcat, "Database setup complete"); @@ -1206,4 +1217,21 @@ void oxenss::Database::test_suite_block_for(std::chrono::milliseconds duration) std::this_thread::sleep_for(duration); } +std::string Database::runtime_state_sn_blob(BTSerialise serialise, const std::string& write_blob) +{ + std::string result; + auto impl = get_impl(serialise == BTSerialise::Write); + if (serialise == BTSerialise::Read) { + auto stmt = impl->prepared_st("SELECT sn_blob FROM runtime_state LIMIT 1"); + auto maybe_result = exec_and_maybe_get(stmt); + if (maybe_result) + result = std::move(*maybe_result); + } else { + if (write_blob.size()) { + auto stmt = impl->prepared_st("REPLACE INTO runtime_state (sn_blob) VALUES (?)"); + exec_query(stmt, write_blob); + } + } + return result; +} } // namespace oxenss diff --git a/oxenss/storage/database.hpp b/oxenss/storage/database.hpp index 72b1c6e90..b341c5ef2 100644 --- a/oxenss/storage/database.hpp +++ b/oxenss/storage/database.hpp @@ -1,6 +1,7 @@ #pragma once 
#include +#include #include #include @@ -213,6 +214,8 @@ class Database { // found are not included). std::map get_expiries( const user_pubkey& pubkey, const std::vector& msg_hashes); + + std::string runtime_state_sn_blob(BTSerialise serialise, const std::string& write_blob); }; } // namespace oxenss From 7ee669cf0588873609e18c3985e927c4653549fe Mon Sep 17 00:00:00 2001 From: doylet Date: Thu, 5 Jun 2025 10:35:00 +1000 Subject: [PATCH 10/50] Update outdated comments --- oxenss/rpc/client_rpc_endpoints.h | 10 +++++----- oxenss/snode/swarm.cpp | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/oxenss/rpc/client_rpc_endpoints.h b/oxenss/rpc/client_rpc_endpoints.h index cc739c4fb..b5fd87936 100644 --- a/oxenss/rpc/client_rpc_endpoints.h +++ b/oxenss/rpc/client_rpc_endpoints.h @@ -541,9 +541,9 @@ struct delete_before final : recursive { }; /// Updates (shortens) the expiry of all stored messages, and broadcasts the update request to all -/// other swarm members. Note that this will not extend existing expiries, it will only shorten the -/// expiry of any messages that have expiries after the requested value. (To extend expiries of one -/// or more individual messages use the `expire` endpoint). +/// other swarm members. Note that this will not extend existing expiries, it will only shorten the +/// expiry of any messages that have expiries after the requested value. (To extend expiries of one +/// or more individual messages use the `expire_msgs` endpoint). /// /// Takes parameters of: /// - pubkey -- the pubkey whose messages shall have their expiries reduced, in hex (66) or bytes @@ -625,8 +625,8 @@ struct expire_all final : recursive { /// ("expire" || ShortenOrExtend || expiry || messages[0] || ... || messages[N]) /// where `expiry` is the expiry timestamp expressed as a string, for a single expiry, or the /// expiries concatenated together (expiry[0] || expiry[1] || ...) for multiple expiries. 
-/// `ShortenOrExtend` is string "shorten" if the shorten option is given (and true), "extend" if -/// `extend` is true, and empty otherwise. The signature must be base64 encoded (json) or bytes +/// `ShortenOrExtend` is the string "shorten" if the shorten option is given (and true), "extend" +/// if `extend` is true, and empty otherwise. The signature must be base64 encoded (json) or bytes /// (bt). /// /// Returns dict of: diff --git a/oxenss/snode/swarm.cpp b/oxenss/snode/swarm.cpp index 85268c864..f625941c2 100644 --- a/oxenss/snode/swarm.cpp +++ b/oxenss/snode/swarm.cpp @@ -46,8 +46,8 @@ SwarmEvents Swarm::derive_swarm_events(uint64_t height, const swarms_t& swarms) if (old_swarm == INVALID_SWARM_ID) { log::info(logcat, "Joined swarm {:#18x} (blk {})", new_swarm, height); - // We were previously not in a swarm, which means we just got assigned to one and so we have - // nothing to do (other snodes will also see this and push messages to us). + // We were previously not in a swarm, which means we just got assigned to one, we need to + // relay any of our messages belonging to the swarm events.new_swarm_members = events.our_swarm_members; events.new_swarm_members.erase(our_pk); return events; From 8ac41cfefcbb4cd5c68518b0b8f7182f0551ce90 Mon Sep 17 00:00:00 2001 From: doylet Date: Thu, 5 Jun 2025 16:45:42 +1000 Subject: [PATCH 11/50] Add tmp work-around for DB swarm dump on migration to DB v1 --- oxenss/rpc/request_handler.cpp | 1 + oxenss/snode/service_node.cpp | 4 ++-- oxenss/snode/service_node.h | 1 + oxenss/snode/swarm.cpp | 33 ++++++++++++++++++++++++++++----- oxenss/snode/swarm.h | 18 +++++++++++------- oxenss/storage/database.cpp | 4 ++-- oxenss/storage/database.hpp | 2 ++ 7 files changed, 47 insertions(+), 16 deletions(-) diff --git a/oxenss/rpc/request_handler.cpp b/oxenss/rpc/request_handler.cpp index 9a008abce..885bc0b48 100644 --- a/oxenss/rpc/request_handler.cpp +++ b/oxenss/rpc/request_handler.cpp @@ -445,6 +445,7 @@ static void distribute_command( 
res->pending--; continue; } + sn.omq_server()->request( ct->pubkey_x25519.view(), "sn.storage_cc", diff --git a/oxenss/snode/service_node.cpp b/oxenss/snode/service_node.cpp index 638c665fa..dece37ede 100644 --- a/oxenss/snode/service_node.cpp +++ b/oxenss/snode/service_node.cpp @@ -523,7 +523,7 @@ void ServiceNode::record_retrieve_request() { } void ServiceNode::check_new_members() { - for (const auto& pk : swarm_.extract_contact_details_pending_members()) { + for (const auto& pk : swarm_.extract_contact_pending_members()) { auto c = network_.contacts.find(pk); if (!c || !*c) { // We don't have contact info, so don't do anything right now and this will get @@ -579,7 +579,7 @@ void ServiceNode::check_new_members() { }); } - if (auto send_now = swarm_.extract_contact_details_ready_members(); !send_now.empty()) { + if (auto send_now = swarm_.extract_contacts_needing_db_dump(); !send_now.empty()) { auto msgs = db->retrieve_all(); log::debug( logcat, diff --git a/oxenss/snode/service_node.h b/oxenss/snode/service_node.h index cf8a8455e..bfa93d3e5 100644 --- a/oxenss/snode/service_node.h +++ b/oxenss/snode/service_node.h @@ -101,6 +101,7 @@ class ServiceNode { const contact our_contact_; Network network_; + Swarm swarm_{network_, our_keys_.pub}; server::OMQ& omq_server_; diff --git a/oxenss/snode/swarm.cpp b/oxenss/snode/swarm.cpp index f625941c2..137cae029 100644 --- a/oxenss/snode/swarm.cpp +++ b/oxenss/snode/swarm.cpp @@ -146,9 +146,29 @@ SwarmEvents Swarm::update_swarms( } // Add members from the swarm that are missing from our runtime state - for (auto it : events.our_swarm_members) - members_.try_emplace(it); + for (auto it : events.new_swarm_members) { + auto& pair = members_[it]; + + // TODO: Remove this after everyone migrates their DB version to v1. v1 is when we + // started making the nodes store the swarm list and their swarm members to the DB to + // persist on restart. 
+ // + // Before this, on startup they would consider all the nodes in the swarm they loaded + // from get_service_nodes as joining the swarm and perform a full message DB dump. + // Deploying this onto a live network would cause all the nodes to do a DB dump to each + // other the moment they upgraded. + // + // However after they upgrade and start persisting the swarm state to disk, from that + // point onwards they will correctly identify nodes that are leaving and joining their + // swarm and only do a message dump when necessary. + if (oxenss::tmp_init_db_version == 0) { + pair.new_swarm_member = false; // Prevent the swarm DB dump on newly migrated nodes + } else { + pair.new_swarm_member = true; + } + } } + oxenss::tmp_init_db_version = 1; // Disable after the first swarm update cur_swarm_id_ = events.our_swarm_id; @@ -198,7 +218,7 @@ size_t Swarm::size() const { return members_.size(); } -std::set Swarm::extract_contact_details_pending_members() { +std::set Swarm::extract_contact_pending_members() { std::lock_guard lock{network.mut_}; std::set result; @@ -219,7 +239,7 @@ std::set Swarm::extract_contact_details_pending_members() return result; } -std::set Swarm::extract_contact_details_ready_members() { +std::set Swarm::extract_contacts_needing_db_dump() { std::lock_guard lock{network.mut_}; std::set result; @@ -228,7 +248,10 @@ std::set Swarm::extract_contact_details_ready_members() { continue; const crypto::legacy_pubkey& pk = it.first; it.second.status = MemberStatus::Ready; - result.insert(pk); + if (it.second.new_swarm_member) { + it.second.new_swarm_member = false; + result.insert(pk); + } } return result; diff --git a/oxenss/snode/swarm.h b/oxenss/snode/swarm.h index 9bcceb4f8..5a3e3d451 100644 --- a/oxenss/snode/swarm.h +++ b/oxenss/snode/swarm.h @@ -53,6 +53,10 @@ class Swarm { MemberStatus status; std::chrono::milliseconds newest_msg_timestamp; + // Set if this member joined the swarm. 
They are assumed to not have any of the messages for + // the swarm yet so a full DB will be initiated + bool new_swarm_member; + // The earliest timestamp at which the swarm will check if they have received contact // information for this member yet and can send them data. Only utilised when status is // 'ContactDetailsPending' before transitioning to 'ContactDetailsReady' when the contact @@ -92,8 +96,13 @@ class Swarm { size_t size() const; // Resets the timer and returns the pubkeys of any new swarm members that are due to be - // contacted to push swarm messages to. - std::set extract_contact_details_pending_members(); + // contacted to establish liveness in prep for transitioning to a contact that we can push swarm + // messages to. + std::set extract_contact_pending_members(); + + // Returns the pubkeys of any new swarm members that have joined that we now have contact + // details for, mark them as ready and need a dump of the DB. + std::set extract_contacts_needing_db_dump(); // Marks a pending member as ready, so that it is returned by the next call to // `extract_contact_details_ready_members()`, and is no longer returned by @@ -102,11 +111,6 @@ class Swarm { const crypto::legacy_pubkey& pk, std::optional last_synced_ts); - // Extracts any "ready" members (that is, those that were pending and then marked ready with - // `set_member_contact_details_ready`), returning them and transitioning them from the pending - // state. 
- std::set extract_contact_details_ready_members(); - swarm_id_t our_swarm_id() const { std::shared_lock lock{network.mut_}; return cur_swarm_id_; diff --git a/oxenss/storage/database.cpp b/oxenss/storage/database.cpp index 6613e4216..4b7ede1b9 100644 --- a/oxenss/storage/database.cpp +++ b/oxenss/storage/database.cpp @@ -283,7 +283,7 @@ class DatabaseImpl { } void initialize_database() { - [[maybe_unused]] int32_t db_version = db.execAndGet("PRAGMA user_version").getInt(); + tmp_init_db_version = db.execAndGet("PRAGMA user_version").getInt(); if (!db.tableExists("owners")) { create_schema(); @@ -341,10 +341,10 @@ CREATE TABLE runtime_state ( sn_blob BLOB ); )"); + db.exec("PRAGMA user_version = 1;"); } views_triggers_indices(); - log::info(logcat, "Database setup complete"); } diff --git a/oxenss/storage/database.hpp b/oxenss/storage/database.hpp index b341c5ef2..b4d18665a 100644 --- a/oxenss/storage/database.hpp +++ b/oxenss/storage/database.hpp @@ -33,6 +33,8 @@ enum class StoreResult { Full, // Can't insert right now because the database is full. }; +inline std::atomic tmp_init_db_version = 0; + // Storage database class. class Database { std::stack> impl_pool_; From 91fa900c449a1f5f9465cea6ac60618da5e295b7 Mon Sep 17 00:00:00 2001 From: doylet Date: Thu, 12 Jun 2025 11:30:43 +1000 Subject: [PATCH 12/50] Infinite retries with exponential backoff When a node initiates a recursive swarm request, the initial node awaits the response from all other nodes before returning to the client. Child swarm nodes that fail to receive the request are stored into a retryable request queue to be re-attempted later. This queue is flushed every 3s by piggybacking onto the swarm member check function that is periodically invoked by OMQ.
--- oxenss/rpc/request_handler.cpp | 118 +++++++---- oxenss/rpc/request_handler.h | 20 ++ oxenss/snode/service_node.cpp | 344 +++++++++++++++++++++++++++------ oxenss/snode/service_node.h | 38 +++- oxenss/snode/swarm.cpp | 35 ++-- oxenss/snode/swarm.h | 71 +++---- 6 files changed, 480 insertions(+), 146 deletions(-) diff --git a/oxenss/rpc/request_handler.cpp b/oxenss/rpc/request_handler.cpp index 885bc0b48..6394fd08c 100644 --- a/oxenss/rpc/request_handler.cpp +++ b/oxenss/rpc/request_handler.cpp @@ -403,13 +403,16 @@ struct swarm_response { bool b64; nlohmann::json result; std::function cb; + std::vector retry_nodes; + std::string cmd; + std::string req_payload; }; // Replies to a swarm request via its callback; sends an http::OK unless all of the // swarm entries returned things with "failed" in them or in the case of a non-recursive request, // the top-level object has a "failed" in it then we send back an INTERNAL_SERVER_ERROR // along with the response. -void reply_or_fail(const std::shared_ptr& res) { +static void reply_or_fail(snode::ServiceNode& sn, const std::shared_ptr& res) { auto res_code = http::INTERNAL_SERVER_ERROR; if (auto swarm_obj = res->result.find("swarm"); swarm_obj != res->result.end()) { for (const auto& [sn_pkey, obj] : swarm_obj->items()) { @@ -423,13 +426,38 @@ void reply_or_fail(const std::shared_ptr& res) { } res->cb(Response{res_code, std::move(res->result)}); + + if (res->retry_nodes.size()) { + snode::RequestRetry retry = {}; + retry.nodes = std::move(res->retry_nodes); + retry.cmd = res->cmd; + retry.req_payload = std::move(res->req_payload); + sn.add_retryable_request(std::move(retry)); + } } -static void distribute_command( - snode::ServiceNode& sn, - std::shared_ptr& res, - std::string_view cmd, - const rpc::recursive& req) { +SNStorageCCResult interpret_sn_storage_cc_response_parts( + bool success, std::span parts) { + bool good_result = success && parts.size() == 1; + SNStorageCCResult result = {}; + if (good_result) { + 
result.status = SNStorageCCResultStatus::Good; + } else { + bool timeout = !success; + if (timeout) { + result.status = SNStorageCCResultStatus::Timeout; + } else if (parts.size() == 2) { + result.status = SNStorageCCResultStatus::ErrorCodeReason; + result.error_code = parts[0]; + result.error_reason = parts[1]; + } else { + result.status = SNStorageCCResultStatus::BadPeerResponse; + } + } + return result; +} + +static void distribute_command(snode::ServiceNode& sn, std::shared_ptr& res) { auto peers = sn.swarm().peers(); res->pending += peers.size(); @@ -439,36 +467,36 @@ static void distribute_command( log::debug( logcat, "Not distributing {} to swarm peer {}: SN {}", - cmd, + res->cmd, peer.first, ct ? "is non-contactable" : "not found"); res->pending--; + + snode::RequestRetryEntry entry = {}; + entry.key = peer.first; + entry.reason = snode::RetryReason::NON_CONTACTABLE; + res->retry_nodes.push_back(entry); continue; } sn.omq_server()->request( ct->pubkey_x25519.view(), "sn.storage_cc", - [res, peer, peer_ed = ct->pubkey_ed25519, cmd](bool success, auto parts) { + [res, peer, peer_ed = ct->pubkey_ed25519, &sn](bool success, auto parts) { json peer_result; - if (!success) - log::warning( - logcat, - "Response timeout from {} for forwarded command {}", - peer.first, - cmd); - bool good_result = success && parts.size() == 1; - if (good_result) { + SNStorageCCResult store_result = + interpret_sn_storage_cc_response_parts(success, parts); + if (store_result.status == SNStorageCCResultStatus::Good) { try { peer_result = bt_to_json(oxenc::bt_dict_consumer{parts[0]}); } catch (const std::exception& e) { log::warning( logcat, "Received unparsable response to {} from {}: {}", - cmd, + res->cmd, peer.first, e.what()); - good_result = false; + store_result.status = SNStorageCCResultStatus::BadPeerResponse; } } @@ -477,15 +505,30 @@ static void distribute_command( // If we're the last response then we reply: bool send_reply = --res->pending == 0; - if (!good_result) { + if 
(store_result.status != SNStorageCCResultStatus::Good) { peer_result = json{{"failed", true}}; - if (!success) + bool timeout = store_result.status == SNStorageCCResultStatus::Timeout; + if (timeout) { peer_result["timeout"] = true; - else if (parts.size() == 2) { - peer_result["code"] = parts[0]; - peer_result["reason"] = parts[1]; - } else + } else if (store_result.status == SNStorageCCResultStatus::ErrorCodeReason) { + peer_result["code"] = store_result.error_code; + peer_result["reason"] = store_result.error_reason; + } else { peer_result["bad_peer_response"] = true; + } + + log::debug( + logcat, + "Failure response from {} for forwarded command {} ({}): <{}>", + peer.first, + res->cmd, + timeout ? "will be retried" : "unretryable due to error", + peer_result.dump()); + + snode::RequestRetryEntry entry = {}; + entry.key = peer.first; + entry.reason = snode::RetryReason::FAILED_TO_SEND; + res->retry_nodes.push_back(entry); } else if (res->b64) { if (auto it = peer_result.find("signature"); it != peer_result.end() && it->is_string()) @@ -493,12 +536,11 @@ static void distribute_command( } res->result["swarm"][peer_ed.hex()] = std::move(peer_result); - if (send_reply) - reply_or_fail(res); + reply_or_fail(sn, res); }, - cmd, - bt_serialize(req.to_bt()), + res->cmd, + res->req_payload, oxenmq::send_option::request_timeout{5s}); } } @@ -510,11 +552,13 @@ std::pair, std::unique_lock> static res->cb = std::move(cb); res->pending = 1; res->b64 = req.b64; + res->cmd = RPC::names()[0]; + res->req_payload = bt_serialize(req.to_bt()); std::unique_lock lock{res->mutex, std::defer_lock}; if (req.recurse) { // Send it off to our peers right away, before we process it ourselves - distribute_command(sn, res, RPC::names()[0], req); + distribute_command(sn, res); lock.lock(); } return {std::move(res), std::move(lock)}; @@ -645,7 +689,7 @@ void RequestHandler::process_client_req(rpc::store&& req, std::functionpending == 0) - reply_or_fail(std::move(res)); + 
reply_or_fail(service_node_, std::move(res)); } void RequestHandler::process_client_req( @@ -918,7 +962,7 @@ void RequestHandler::process_client_req( add_misc_response_fields(res->result, service_node_, now); if (--res->pending == 0) - reply_or_fail(std::move(res)); + reply_or_fail(service_node_, std::move(res)); } void RequestHandler::process_client_req(rpc::delete_msgs&& req, std::function cb) { @@ -980,7 +1024,7 @@ void RequestHandler::process_client_req(rpc::delete_msgs&& req, std::functionresult, service_node_); if (--res->pending == 0) - reply_or_fail(std::move(res)); + reply_or_fail(service_node_, std::move(res)); } void RequestHandler::process_client_req( @@ -1031,7 +1075,7 @@ void RequestHandler::process_client_req( add_misc_response_fields(res->result, service_node_); if (--res->pending == 0) - reply_or_fail(std::move(res)); + reply_or_fail(service_node_, std::move(res)); } void RequestHandler::process_client_req( @@ -1085,7 +1129,7 @@ void RequestHandler::process_client_req( add_misc_response_fields(res->result, service_node_); if (--res->pending == 0) - reply_or_fail(std::move(res)); + reply_or_fail(service_node_, std::move(res)); } void RequestHandler::process_client_req( @@ -1212,7 +1256,7 @@ void RequestHandler::process_client_req( add_misc_response_fields(res->result, service_node_, now); if (--res->pending == 0) - reply_or_fail(std::move(res)); + reply_or_fail(service_node_, std::move(res)); } void RequestHandler::process_client_req(rpc::expire_all&& req, std::function cb) { @@ -1278,7 +1322,7 @@ void RequestHandler::process_client_req(rpc::expire_all&& req, std::functionresult, service_node_, now); if (--res->pending == 0) - reply_or_fail(std::move(res)); + reply_or_fail(service_node_, std::move(res)); } void RequestHandler::process_client_req(rpc::expire_msgs&& req, std::function cb) { @@ -1419,7 +1463,7 @@ void RequestHandler::process_client_req(rpc::expire_msgs&& req, std::functionresult, service_node_, now); if (--res->pending == 0) - 
reply_or_fail(std::move(res)); + reply_or_fail(service_node_, std::move(res)); } void RequestHandler::process_client_req(rpc::get_expiries&& req, std::function cb) { diff --git a/oxenss/rpc/request_handler.h b/oxenss/rpc/request_handler.h index dac913c5c..061eaf2bc 100644 --- a/oxenss/rpc/request_handler.h +++ b/oxenss/rpc/request_handler.h @@ -84,6 +84,21 @@ struct Response { status{status}, body{binary_response}, keepalive{keepalive} {} }; +enum class SNStorageCCResultStatus { + Good, + Timeout, + ErrorCodeReason, + BadPeerResponse, +}; + +// Helper struct that stores the decoded response of a 'sn.storage_cc' request to a storage server +// and consequently the possible replies/states that can be returned from this operation. +struct SNStorageCCResult { + SNStorageCCResultStatus status = {}; + std::string_view error_code; + std::string_view error_reason; +}; + // Views the string or string_view body inside a Response. Should only be called when the body // has already been verified to not contain a json object or binary blob. inline std::string_view view_body(const Response& r) { @@ -148,6 +163,11 @@ std::string compute_hash(Func hasher, const T&... args) { /// Computes a message hash using blake2b hash of various messages attributes. std::string computeMessageHash(const user_pubkey& pubkey, namespace_id ns, std::string_view data); +/// Interpret the result an OMQ request to the 'sn.storage_cc' endpoint, typically for recursive +/// swarm requests. 
+SNStorageCCResult interpret_sn_storage_cc_response_parts( + bool success, std::span parts); + struct OnionRequestMetadata { crypto::x25519_pubkey ephem_key; std::function cb; diff --git a/oxenss/snode/service_node.cpp b/oxenss/snode/service_node.cpp index dece37ede..3bb2cdcdc 100644 --- a/oxenss/snode/service_node.cpp +++ b/oxenss/snode/service_node.cpp @@ -2,6 +2,7 @@ #include "serialization.h" #include "sn_test.h" +#include #include #include #include @@ -43,7 +44,9 @@ using MISSING_PUBKEY_THRESHOLD = std::ratio<3, 100>; /// TODO: there should be config.h to store constants like these constexpr auto OXEND_PING_INTERVAL = 30s; -constexpr auto NEW_SWARM_MEMBER_INTERVAL = 10s; +// How often to trigger 'do_backlogged_msg_relay' which checks for 'data ready' handshakes from +// swarm members to distribute missing messages and/or retry recursive swarm requests that failed +constexpr auto DO_BACKLOGGED_MSG_RELAY_INTERVAL = 3s; SerialiseResult ServiceNode::serialize(BTSerialise serialise, std::string_view serialized_data) const { @@ -194,7 +197,7 @@ ServiceNode::ServiceNode( }, Database::CLEANUP_PERIOD); - omq_server->add_timer([this] { check_new_members(); }, NEW_SWARM_MEMBER_INTERVAL); + omq_server->add_timer([this] { do_msg_backlog_relay(); }, DO_BACKLOGGED_MSG_RELAY_INTERVAL); // We really want to make sure nodes don't get stuck in "syncing" mode, // so if we are still "syncing" after a long time, activate SN regardless @@ -330,6 +333,11 @@ static std::optional parse_swarm_update( return maybe_bu; } +void ServiceNode::add_retryable_request(RequestRetry&& item) { + std::unique_lock lock{retryable_requests_mutex}; + retryable_requests.emplace_back(item); +} + void ServiceNode::register_mq_server(server::MQBase* server) { mq_servers_.push_back(server); } @@ -489,7 +497,7 @@ bool ServiceNode::snode_ready(std::string* reason) { return problems.empty() || force_start_; } -bool ServiceNode::is_swarm_peer(const crypto::x25519_pubkey& xpk) { +std::optional 
ServiceNode::is_swarm_peer(const crypto::x25519_pubkey& xpk) { return swarm_.is_member(xpk); } @@ -522,71 +530,293 @@ void ServiceNode::record_retrieve_request() { all_stats_.bump_retrieve_requests(); } -void ServiceNode::check_new_members() { - for (const auto& pk : swarm_.extract_contact_pending_members()) { - auto c = network_.contacts.find(pk); - if (!c || !*c) { - // We don't have contact info, so don't do anything right now and this will get - // triggered again later. - log::debug( - logcat, - "Leaving {} as pending: node {}", - pk, - c ? "has missing contact info" : "is unknown"); - continue; +struct LookupRetryIndexes { + std::optional retryable_index; + std::optional node_index; +}; + +static LookupRetryIndexes lookup_retry_indexes( + std::span retryable_requests, + uint64_t request_hash, + const crypto::legacy_pubkey& key) { + LookupRetryIndexes result = {}; + + // Find the retry request + for (size_t index = 0; index < retryable_requests.size(); index++) { + if (retryable_requests[index].hash == request_hash) { + result.retryable_index = index; + break; + } + } + + // Find the matching node inside the retry request + if (result.retryable_index) { + const RequestRetry& request = retryable_requests[*result.retryable_index]; + for (size_t index = 0; index < request.nodes.size(); index++) { + if (request.nodes[index].key == key) { + result.node_index = index; + break; + } } + } + + return result; +} + +void ServiceNode::do_msg_backlog_relay() { + auto now = std::chrono::steady_clock::now(); + if (now >= swarm_member_deadline) { + swarm_member_deadline = now + 10s; + for (const auto& pk : swarm_.extract_contact_pending_members()) { + auto c = network_.contacts.find(pk); + if (!c || !*c) { + // We don't have contact info, so don't do anything right now and this will get + // triggered again later. + log::debug( + logcat, + "Leaving {} as pending: node {}", + pk, + c ? 
"has missing contact info" : "is unknown"); + continue; + } - if (c->version < NEW_SWARM_MEMBER_HANDSHAKE_VERSION) { + if (c->version < NEW_SWARM_MEMBER_HANDSHAKE_VERSION) { + log::debug( + logcat, + "Skipping handshake with new swarm member {}: v{}+ required, remote is v{}", + pk, + fmt::join(NEW_SWARM_MEMBER_HANDSHAKE_VERSION, "."), + fmt::join(c->version, ".")); + swarm_.set_member_contact_details_ready(pk, std::nullopt); + continue; + } + + log::debug(logcat, "Initiating contact with new swarm member {}", pk); + omq_server_->request( + c->pubkey_x25519.view(), + "sn.data_ready", + [this, pk](bool success, std::vector data) { + server::SNDataReadyResponse response = {}; + BTSerialiseResult read_result = {}; + if (data.empty()) { + read_result.read_error = "Empty reply"; + } else { + read_result = server::sn_data_ready_response_serialise( + response, BTSerialise::Read, data[0]); + } + + if (!read_result.success) { + log::info( + logcat, + "Failed to connect to remote SS {} to initiate new " + "data transfer ({}: {}); will retry soon", + pk, + fmt::join(data, ", "), + read_result.read_error); + return; + } + log::debug( + logcat, + "Successful contact made with swarm member {}, marking as ready", + pk); + swarm_.set_member_contact_details_ready(pk, response.newest_timestamp); + }); + } + + if (auto send_now = swarm_.extract_contacts_needing_db_dump(); !send_now.empty()) { + auto msgs = db->retrieve_all(); log::debug( logcat, - "Skipping handshake with new swarm member {}: v{}+ required, remote is v{}", - pk, - fmt::join(NEW_SWARM_MEMBER_HANDSHAKE_VERSION, "."), - fmt::join(c->version, ".")); - swarm_.set_member_contact_details_ready(pk, std::nullopt); - continue; + "Initiating swarm message dump ({} message) to new swarm member(s): {}", + msgs.size(), + fmt::join(send_now, ", ")); + relay_messages(std::move(msgs), send_now); } + } - log::debug(logcat, "Initiating contact with new swarm member {}", pk); - omq_server_->request( - c->pubkey_x25519.view(), - 
"sn.data_ready", - [this, pk](bool success, std::vector data) { - server::SNDataReadyResponse response = {}; - BTSerialiseResult read_result = {}; - if (data.empty()) { - read_result.read_error = "Empty reply"; - } else { - read_result = server::sn_data_ready_response_serialise( - response, BTSerialise::Read, data[0]); - } + // Retry failed/timed-out requests + std::unique_lock lock{retryable_requests_mutex}; + if (log::Level level = log::Level::debug; + log::get_level(logcat) <= level && retryable_requests.size()) { + + fmt::memory_buffer log_buffer; + fmt::format_to( + std::back_inserter(log_buffer), + "Attempting {} retryable requests\n", + retryable_requests.size()); + + for (size_t index = 0; index < retryable_requests.size(); index++) { + const auto& item = retryable_requests[index]; + fmt::format_to( + std::back_inserter(log_buffer), + "{} [{}] '{}' command {} to {} node(s){}", + index ? "\n" : "", + index, + item.cmd, + util::get_human_readable_bytes(item.req_payload.size()), + item.nodes.size(), + item.nodes.size() ? 
"\n NODES" : ""); + + for (size_t node_index = 0; node_index < item.nodes.size(); node_index++) { + const auto& node_item = item.nodes[node_index]; + std::string_view reason = ""; + switch (node_item.reason) { + case RetryReason::NON_CONTACTABLE: reason = "non-contactable"; break; + case RetryReason::FAILED_TO_SEND: reason = "failed to send"; break; + } - if (!read_result.success) { - log::info( + std::string deadline = "now"; + if (node_item.deadline >= now) { + auto delta = node_item.deadline - now; + deadline = + "in {}"_format(std::chrono::duration_cast(delta)); + } + + fmt::format_to( + std::back_inserter(log_buffer), + "\n {}: {} ({}) retrying {}", + index, + node_item.key, + reason, + deadline); + } + } + + log::log(logcat, level, "{}", fmt::to_string(log_buffer)); + } + + for (auto it = retryable_requests.begin(); it != retryable_requests.end();) { + + // Create a hash of the inputs so that we can match dispatched requests easily with the + // originating retry item. + if (it->hash == 0) { + it->hash = FNV1A64_SEED; + it->hash = fnv1a64_hasher(it->cmd, it->hash); + it->hash = fnv1a64_hasher(it->req_payload, it->hash); + } + + for (auto node_it = it->nodes.begin(); node_it != it->nodes.end();) { + auto on_request_done = [this, hash = it->hash, key = node_it->key]( + bool success, std::vector parts) { + + std::unique_lock lock{retryable_requests_mutex}; + + // Lookup the originating retry-request responsible for this OMQ response + LookupRetryIndexes lookup = lookup_retry_indexes(retryable_requests, hash, key); + if (!lookup.retryable_index) + return; + + RequestRetry& request = retryable_requests[*lookup.retryable_index]; + if (lookup.node_index) { + RequestRetryEntry& node = request.nodes[*lookup.node_index]; + + // We cleanup the request in all situations except timeout (timeout + // indicating that the node was non-responsive, maybe offline). 
In an error + // state we don't know what state the recipient's storage server is in and + // we default to deleting it and ending the retry attempts. + rpc::SNStorageCCResult store_result = + rpc::interpret_sn_storage_cc_response_parts(success, parts); + bool cleanup = store_result.status != rpc::SNStorageCCResultStatus::Timeout; + + if (cleanup) { + std::string_view outcome = "succeeded"; + if (store_result.status != rpc::SNStorageCCResultStatus::Good) + outcome = "failed unrecoverably"; + + log::trace( logcat, - "Failed to connect to remote SS {} to initiate new " - "data transfer ({}: {}); will retry soon", - pk, - fmt::join(data, ", "), - read_result.read_error); - return; + "Retry to {} for {} ({}) {}, cleaning up", + key, + request.cmd, + util::get_human_readable_bytes(request.req_payload.size()), + outcome); + + request.nodes.erase(request.nodes.begin() + *lookup.node_index); + } else { + // Extend the next retry deadline and re-attempt later + float& delay_coeff = node.deadline_delay_coeff; + delay_coeff = std::max(delay_coeff, 1.f) * 1.3f; + delay_coeff = std::min(delay_coeff, 2.f); + + auto init_delay = std::chrono::duration_cast( + DO_BACKLOGGED_MSG_RELAY_INTERVAL); + auto delay = std::chrono::seconds( + static_cast(init_delay.count() * delay_coeff)); + + node.deadline = std::chrono::steady_clock::now() + delay; + log::trace( + logcat, + "Retry to {} for {} ({}) timed out, next attempt in ~{}", + key, + request.cmd, + util::get_human_readable_bytes(request.req_payload.size()), + delay); } - log::debug( + } + + // Remove retryable request if there are no more nodes to retry to + if (request.nodes.empty()) + retryable_requests.erase(retryable_requests.begin() + *lookup.retryable_index); + }; + + std::optional is_member = swarm_.is_member(node_it->key); + if (is_member) { + // Retry request if ready + bool is_due = now >= node_it->deadline; + bool ready = + (is_member->status == SwarmMemberStatus::ContactDetailsReady || + is_member->status == 
SwarmMemberStatus::Ready); + crypto::x25519_pubkey pubkey_x25519 = {}; + + if (ready && is_due) { + auto ct = contacts().find(node_it->key); + if (ct && *ct) + pubkey_x25519 = ct->pubkey_x25519; + } + + if (ready && is_due && pubkey_x25519) { + omq_server()->request( + pubkey_x25519.view(), + "sn.storage_cc", + on_request_done, + it->cmd, + it->req_payload, + oxenmq::send_option::request_timeout{5s}); + } + + if (!ready) { // Separate logging from logic for code clarity + log::trace( logcat, - "Successful contact made with swarm member {}, queuing a message push", - pk); - swarm_.set_member_contact_details_ready(pk, response.newest_timestamp); - }); - } + "Retry to {} ({}) deferred, member hasn't signaled 'data ready' (was " + "{})", + node_it->key, + it->cmd, + static_cast(is_member->status)); + } else if (!pubkey_x25519) { + log::trace( + logcat, + "Retry to {} ({}) deferred, contact info missing", + node_it->key, + it->cmd); + } + } - if (auto send_now = swarm_.extract_contacts_needing_db_dump(); !send_now.empty()) { - auto msgs = db->retrieve_all(); - log::debug( - logcat, - "Initiating swarm message dump ({} message) to new swarm member(s): {}", - msgs.size(), - fmt::join(send_now, ", ")); - relay_messages(std::move(msgs), send_now); + if (is_member) { + node_it++; + } else { + log::trace( + logcat, + "Retry to {} ({}) cancelled, not a member in swarm anymore", + node_it->key, + it->cmd); + node_it = it->nodes.erase(node_it); + } + } + + if (it->nodes.empty()) + it = retryable_requests.erase(it); + else + it++; } } diff --git a/oxenss/snode/service_node.h b/oxenss/snode/service_node.h index bfa93d3e5..57db46ef4 100644 --- a/oxenss/snode/service_node.h +++ b/oxenss/snode/service_node.h @@ -76,11 +76,30 @@ constexpr std::string_view to_string(SnodeStatus status) { struct SerialiseResult { BTSerialiseResult bt_serialise; - std::map swarm_members; + std::map swarm_members; swarms_t network_swarms; swarm_id_t swarm_cur_swarm_id; }; +enum class RetryReason { + 
NON_CONTACTABLE, + FAILED_TO_SEND, +}; + +struct RequestRetryEntry { + crypto::legacy_pubkey key; + RetryReason reason; + std::chrono::steady_clock::time_point deadline; + float deadline_delay_coeff; +}; + +struct RequestRetry { + std::string_view cmd; + std::string req_payload; + uint64_t hash; + std::vector nodes; +}; + /// All service node logic that is not network-specific class ServiceNode { bool syncing_ = true; @@ -121,6 +140,12 @@ class ServiceNode { mutable std::recursive_mutex sn_mutex_; + std::mutex retryable_requests_mutex; + + std::vector retryable_requests; + + std::chrono::steady_clock::time_point swarm_member_deadline = {}; + void send_notifies(message m); // Save multiple messages to the database at once (i.e. in a single transaction) @@ -133,7 +158,7 @@ class ServiceNode { void on_snodes_update(block_update&& bu); // Called periodically to attempt to initiate transfers to new snode members - void check_new_members(); + void do_msg_backlog_relay(); // Called if our oxend looks like it is missing lots of records when we first get data from it // to load initial data (especially contact info) from the bootstrap nodes. @@ -193,10 +218,14 @@ class ServiceNode { const Swarm& swarm() { return swarm_; } Contacts& contacts() { return network_.contacts; } + const Contacts& contacts() const { return network_.contacts; } const contact& own_address() { return our_contact_; } + // Enqueue a request to be re-attempted every 'DO_BACKLOGGED_MSG_RELAY_INTERVAL' intervals. + void add_retryable_request(RequestRetry&& item); + // Adds a MQ server, i.e. QUIC. The OMQ server is added automatically during construction and // should not be added. 
void register_mq_server(server::MQBase* server); @@ -223,8 +252,9 @@ class ServiceNode { rpc::OnionRequestMetadata&& data, std::function data)> cb) const; - // Returns true if the given x pubkey is recognized as one of our current swarm members - bool is_swarm_peer(const crypto::x25519_pubkey& xpk); + // Returns the peer's state if the given x pubkey is recognized as one of our current swarm + // members + std::optional is_swarm_peer(const crypto::x25519_pubkey& xpk); const hf_revision& hf() const { return hardfork_; } diff --git a/oxenss/snode/swarm.cpp b/oxenss/snode/swarm.cpp index 137cae029..15bb6cb1b 100644 --- a/oxenss/snode/swarm.cpp +++ b/oxenss/snode/swarm.cpp @@ -182,35 +182,40 @@ bool Swarm::is_pubkey_for_us(const user_pubkey& pk) const { return maybe_swarm && cur_swarm_id_ == *maybe_swarm; } -std::map Swarm::members() const { +std::map Swarm::members() const { std::shared_lock lock{network.mut_}; return members_; } // Returns a copy of all the other members of this swarm, not including this node. 
-std::map Swarm::peers() const { +std::map Swarm::peers() const { auto peers = members(); peers.erase(our_pk); return peers; } -bool Swarm::is_member(const crypto::legacy_pubkey& pk) const { +std::optional Swarm::is_member(const crypto::legacy_pubkey& pk) const { std::shared_lock lock{network.mut_}; - return members_.count(pk); + std::optional result; + if (const auto& it = members_.find(pk); it != members_.end()) + result = it->second; + return result; } -bool Swarm::is_member(const crypto::x25519_pubkey& pk) const { +std::optional Swarm::is_member(const crypto::x25519_pubkey& pk) const { std::shared_lock lock{network.mut_}; + std::optional result; if (auto lpk = network.contacts.lookup(pk)) - return members_.count(*lpk); - return false; + result = is_member(*lpk); + return result; } -bool Swarm::is_member(const crypto::ed25519_pubkey& pk) const { +std::optional Swarm::is_member(const crypto::ed25519_pubkey& pk) const { std::shared_lock lock{network.mut_}; + std::optional result; if (auto lpk = network.contacts.lookup(pk)) - return members_.count(*lpk); - return false; + result = is_member(*lpk); + return result; } size_t Swarm::size() const { @@ -224,8 +229,8 @@ std::set Swarm::extract_contact_pending_members() { std::set result; auto now = std::chrono::steady_clock::now(); for (auto it = members_.begin(); it != members_.end(); it++) { - MemberState& state = it->second; - if (state.status != MemberStatus::ContactDetailsPending) + SwarmMemberState& state = it->second; + if (state.status != SwarmMemberStatus::ContactDetailsPending) continue; std::chrono::steady_clock::time_point& next_retry = it->second.check_contact_info_next_retry; @@ -244,10 +249,10 @@ std::set Swarm::extract_contacts_needing_db_dump() { std::set result; for (auto& it : members_) { - if (it.second.status != MemberStatus::ContactDetailsReady) + if (it.second.status != SwarmMemberStatus::ContactDetailsReady) continue; const crypto::legacy_pubkey& pk = it.first; - it.second.status = 
MemberStatus::Ready; + it.second.status = SwarmMemberStatus::Ready; if (it.second.new_swarm_member) { it.second.new_swarm_member = false; result.insert(pk); @@ -265,7 +270,7 @@ void Swarm::set_member_contact_details_ready( assert(it != members_.end()); if (it != members_.end()) { - it->second.status = MemberStatus::ContactDetailsReady; + it->second.status = SwarmMemberStatus::ContactDetailsReady; if (last_synced_ts) it->second.newest_msg_timestamp = *last_synced_ts; } diff --git a/oxenss/snode/swarm.h b/oxenss/snode/swarm.h index 5a3e3d451..cdfdca3b6 100644 --- a/oxenss/snode/swarm.h +++ b/oxenss/snode/swarm.h @@ -27,6 +27,31 @@ struct SwarmEvents { std::set our_swarm_members; }; +enum struct SwarmMemberStatus { + // Pubkeys of new members into our swarm who we haven't yet established communications with; + // once we do, we push all our swarm's messages to them. + ContactDetailsPending, + ContactDetailsReady, + Ready, +}; + +struct SwarmMemberState { + SwarmMemberStatus status; + + std::chrono::milliseconds newest_msg_timestamp; + + // Set if this member joined the swarm. They are assumed to not have any of the messages for + // the swarm yet so a full DB will be initiated + bool new_swarm_member; + + // The earliest timestamp at which the swarm will check if they have received contact + // information for this member yet and can send them data. Only utilised when status is + // 'ContactDetailsPending' before transitioning to 'ContactDetailsReady' when the contact + // detail has been confirmed. + std::chrono::steady_clock::time_point check_contact_info_next_retry; +}; + + // How often we wait, after returning a pending new member, before we return the member again from // `extract_new_members()`. constexpr auto NEW_SWARM_MEMBER_RETRY = 30s; @@ -35,39 +60,19 @@ class Swarm { // Extract relevant information from incoming swarm composition. 
SwarmEvents derive_swarm_events(uint64_t height, const swarms_t& swarms) const; + friend class ServiceNode; + + std::map + members_; // includes `our_pk`, when we are in a swarm. + + swarm_id_t cur_swarm_id_ = INVALID_SWARM_ID; + public: Swarm(Network& network, const crypto::legacy_pubkey& our_pk) : network{network}, our_pk{our_pk} {} ~Swarm(); - enum struct MemberStatus { - // Pubkeys of new members into our swarm who we haven't yet established communications with; - // once we do, we push all our swarm's messages to them. - ContactDetailsPending, - ContactDetailsReady, - Ready, - }; - - struct MemberState { - MemberStatus status; - std::chrono::milliseconds newest_msg_timestamp; - - // Set if this member joined the swarm. They are assumed to not have any of the messages for - // the swarm yet so a full DB will be initiated - bool new_swarm_member; - - // The earliest timestamp at which the swarm will check if they have received contact - // information for this member yet and can send them data. Only utilised when status is - // 'ContactDetailsPending' before transitioning to 'ContactDetailsReady' when the contact - // detail has been confirmed. - std::chrono::steady_clock::time_point check_contact_info_next_retry; - }; - - swarm_id_t cur_swarm_id_ = INVALID_SWARM_ID; - - std::map members_; // includes `our_pk`, when we are in a swarm. - Network& network; const crypto::legacy_pubkey our_pk; @@ -82,15 +87,15 @@ class Swarm { bool is_pubkey_for_us(const user_pubkey& pk) const; // Returns a copy of all the members of this swarm, including this node. - std::map members() const; + std::map members() const; // Returns a copy of all the other members of this swarm, not including this node. - std::map peers() const; + std::map peers() const; - // Returns true if the given pubkey is recognized as a member of this swarm. 
- bool is_member(const crypto::legacy_pubkey& pk) const; - bool is_member(const crypto::x25519_pubkey& pk) const; - bool is_member(const crypto::ed25519_pubkey& pk) const; + // Returns the swarm member's state if the given pubkey is recognized as a member of this swarm. + std::optional is_member(const crypto::legacy_pubkey& pk) const; + std::optional is_member(const crypto::x25519_pubkey& pk) const; + std::optional is_member(const crypto::ed25519_pubkey& pk) const; // Returns the size of this swarm (including this node). size_t size() const; From c3f97e856901d8868e6d127bbc56de1fafbb73c9 Mon Sep 17 00:00:00 2001 From: doylet Date: Thu, 12 Jun 2025 11:34:26 +1000 Subject: [PATCH 13/50] Get rid of duplicate fnv hash, already present in serialize.h --- oxenss/snode/service_node.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/oxenss/snode/service_node.cpp b/oxenss/snode/service_node.cpp index 3bb2cdcdc..a24be3a70 100644 --- a/oxenss/snode/service_node.cpp +++ b/oxenss/snode/service_node.cpp @@ -146,12 +146,6 @@ SerialiseResult ServiceNode::serialize(BTSerialise serialise, std::string_view s return result; } -static uint64_t fnv1a64_hasher(std::string_view bytes, uint64_t hash) { - for (size_t i = 0; i < bytes.size(); i++) - hash = (bytes[i] ^ hash) * 1099511628211 /*FNV Prime*/; - return hash; -} - ServiceNode::ServiceNode( const crypto::legacy_keypair& keys, const contact& contact, From be3bd396780919d225ee11ebe665b1c77313cef7 Mon Sep 17 00:00:00 2001 From: doylet Date: Thu, 12 Jun 2025 11:35:50 +1000 Subject: [PATCH 14/50] Remove unused TEST_RETRY_INTERVAL/PERIOD constants --- oxenss/rpc/request_handler.h | 7 ------- 1 file changed, 7 deletions(-) diff --git a/oxenss/rpc/request_handler.h b/oxenss/rpc/request_handler.h index 061eaf2bc..2eeff3a78 100644 --- a/oxenss/rpc/request_handler.h +++ b/oxenss/rpc/request_handler.h @@ -22,13 +22,6 @@ namespace oxenss::rpc { -// When a storage test returns a "retry" response, we retry again after this interval: -inline 
constexpr auto TEST_RETRY_INTERVAL = 50ms; - -// If a storage test is still returning "retry" after this long since the initial request then -// we give up and send an error response back to the requestor: -inline constexpr auto TEST_RETRY_PERIOD = 55s; - // Minimum and maximum TTL permitted for storing a new, public message inline constexpr auto TTL_MINIMUM = 10s; inline constexpr auto TTL_MAXIMUM = 14 * 24h; From 7b4bee513f2c5f49861db4a1004a9fde2f6fb8b0 Mon Sep 17 00:00:00 2001 From: doylet Date: Thu, 12 Jun 2025 13:32:13 +1000 Subject: [PATCH 15/50] Rename BTSerialise to Serialise and prefix SN's SerialiseResult to be SNSerialiseResult --- oxenss/common/serialize.h | 2 +- oxenss/server/omq.cpp | 9 ++++----- oxenss/server/omq.h | 2 +- oxenss/snode/service_node.cpp | 20 ++++++++++---------- oxenss/snode/service_node.h | 4 ++-- oxenss/storage/database.cpp | 6 +++--- oxenss/storage/database.hpp | 2 +- 7 files changed, 22 insertions(+), 23 deletions(-) diff --git a/oxenss/common/serialize.h b/oxenss/common/serialize.h index a4720a0dc..cd1496d86 100644 --- a/oxenss/common/serialize.h +++ b/oxenss/common/serialize.h @@ -5,7 +5,7 @@ #include namespace oxenss { -enum class BTSerialise { +enum class Serialise { Read, Write, }; diff --git a/oxenss/server/omq.cpp b/oxenss/server/omq.cpp index 37a84ff5c..e032be1ec 100644 --- a/oxenss/server/omq.cpp +++ b/oxenss/server/omq.cpp @@ -30,15 +30,13 @@ namespace oxenss::server { static auto logcat = log::Cat("server"); BTSerialiseResult sn_data_ready_response_serialise( - SNDataReadyResponse& item, - BTSerialise serialise, - std::string_view serialized_data) { + SNDataReadyResponse& item, Serialise serialise, std::string_view serialized_data) { BTSerialiseResult result = {}; constexpr std::string_view STATUS_KEY = "s"; constexpr std::string_view TIMESTAMP_KEY = "t"; - if (serialise == BTSerialise::Write) { + if (serialise == Serialise::Write) { assert(serialized_data.empty()); oxenc::bt_dict_producer dict; dict.append(STATUS_KEY, 
static_cast(item.status)); @@ -109,7 +107,8 @@ void OMQ::handle_sn_data_ready(oxenmq::Message& message) { response.newest_timestamp = service_node_->db->retrieve_newest_timestamp(); } - BTSerialiseResult write_result = sn_data_ready_response_serialise(response, BTSerialise::Write, ""); + BTSerialiseResult write_result = + sn_data_ready_response_serialise(response, Serialise::Write, ""); assert(write_result.success); message.send_reply(write_result.write_payload); } diff --git a/oxenss/server/omq.h b/oxenss/server/omq.h index 03f4e1512..a3321ac67 100644 --- a/oxenss/server/omq.h +++ b/oxenss/server/omq.h @@ -43,7 +43,7 @@ struct SNDataReadyResponse { }; BTSerialiseResult sn_data_ready_response_serialise( - server::SNDataReadyResponse& item, BTSerialise serialise, std::string_view serialized_data); + server::SNDataReadyResponse& item, Serialise serialise, std::string_view serialized_data); class OMQ : public MQBase { oxenmq::OxenMQ omq_; diff --git a/oxenss/snode/service_node.cpp b/oxenss/snode/service_node.cpp index a24be3a70..e8c648e06 100644 --- a/oxenss/snode/service_node.cpp +++ b/oxenss/snode/service_node.cpp @@ -48,9 +48,9 @@ constexpr auto OXEND_PING_INTERVAL = 30s; // swarm members to distribute missing messages and/or retry recursive swarm requests that failed constexpr auto DO_BACKLOGGED_MSG_RELAY_INTERVAL = 3s; -SerialiseResult ServiceNode::serialize(BTSerialise serialise, std::string_view serialized_data) const +SNSerialiseResult ServiceNode::serialize(Serialise serialise, std::string_view serialized_data) const { - SerialiseResult result = {}; + SNSerialiseResult result = {}; constexpr std::string_view VERSION_KEY = "@"; constexpr std::string_view NETWORK_SWARMS_KEY = "network.swarms"; @@ -58,7 +58,7 @@ SerialiseResult ServiceNode::serialize(BTSerialise serialise, std::string_view s constexpr std::string_view SWARM_MEMBERS_KEY = "swarm.members"; uint32_t version = 0; - if (serialise == BTSerialise::Write) { + if (serialise == Serialise::Write) { 
oxenc::bt_dict_producer d; d.append(VERSION_KEY, version); @@ -163,8 +163,8 @@ ServiceNode::ServiceNode( db{std::make_unique(dblocation)} { mq_servers_.push_back(&omq_server); - std::string blob_data = db->runtime_state_sn_blob(BTSerialise::Read, ""); - SerialiseResult serialise_result = serialize(BTSerialise::Read, blob_data); + std::string blob_data = db->runtime_state_sn_blob(Serialise::Read, ""); + SNSerialiseResult serialise_result = serialize(Serialise::Read, blob_data); if (serialise_result.bt_serialise.success) { last_serialize_hash = fnv1a64_hasher(blob_data, FNV1A64_SEED); swarm_.members_ = std::move(serialise_result.swarm_members); @@ -596,7 +596,7 @@ void ServiceNode::do_msg_backlog_relay() { read_result.read_error = "Empty reply"; } else { read_result = server::sn_data_ready_response_serialise( - response, BTSerialise::Read, data[0]); + response, Serialise::Read, data[0]); } if (!read_result.success) { @@ -908,7 +908,7 @@ void ServiceNode::on_bootstrap_update(block_update&& bu) { swarm_.update_swarms(bu.height, std::move(bu.swarms), bu.contacts); target_height_ = std::max(target_height_, bu.height); - snode::SerialiseResult serialise_result = serialize(BTSerialise::Write, ""); + snode::SNSerialiseResult serialise_result = serialize(Serialise::Write, ""); if (serialise_result.bt_serialise.success) { uint64_t hash = fnv1a64_hasher(serialise_result.bt_serialise.write_payload, FNV1A64_SEED); if (last_serialize_hash != hash) { @@ -923,7 +923,7 @@ void ServiceNode::on_bootstrap_update(block_update&& bu) { last_serialize_hash = hash; db->runtime_state_sn_blob( - BTSerialise::Write, serialise_result.bt_serialise.write_payload); + Serialise::Write, serialise_result.bt_serialise.write_payload); } } } @@ -969,7 +969,7 @@ void ServiceNode::on_snodes_update(block_update&& bu) { auto events = swarm_.update_swarms(bu.height, std::move(bu.swarms), bu.contacts); // Serialise state to blob and store into DB if dirtied - snode::SerialiseResult serialise_result = 
serialize(BTSerialise::Write, ""); + snode::SNSerialiseResult serialise_result = serialize(Serialise::Write, ""); if (serialise_result.bt_serialise.success) { uint64_t hash = fnv1a64_hasher(serialise_result.bt_serialise.write_payload, FNV1A64_SEED); if (last_serialize_hash != hash) { @@ -984,7 +984,7 @@ void ServiceNode::on_snodes_update(block_update&& bu) { last_serialize_hash = hash; db->runtime_state_sn_blob( - BTSerialise::Write, serialise_result.bt_serialise.write_payload); + Serialise::Write, serialise_result.bt_serialise.write_payload); } } diff --git a/oxenss/snode/service_node.h b/oxenss/snode/service_node.h index 57db46ef4..fe6f3511e 100644 --- a/oxenss/snode/service_node.h +++ b/oxenss/snode/service_node.h @@ -74,7 +74,7 @@ constexpr std::string_view to_string(SnodeStatus status) { return "Unknown"sv; } -struct SerialiseResult { +struct SNSerialiseResult { BTSerialiseResult bt_serialise; std::map swarm_members; swarms_t network_swarms; @@ -207,7 +207,7 @@ class ServiceNode { bool force_start, bool skip_bootstrap); - SerialiseResult serialize(BTSerialise serialise, std::string_view serialized_data) const; + SNSerialiseResult serialize(Serialise serialise, std::string_view serialized_data) const; uint64_t last_serialize_hash = 0; diff --git a/oxenss/storage/database.cpp b/oxenss/storage/database.cpp index 4b7ede1b9..bfbff5453 100644 --- a/oxenss/storage/database.cpp +++ b/oxenss/storage/database.cpp @@ -1217,11 +1217,11 @@ void oxenss::Database::test_suite_block_for(std::chrono::milliseconds duration) std::this_thread::sleep_for(duration); } -std::string Database::runtime_state_sn_blob(BTSerialise serialise, const std::string& write_blob) +std::string Database::runtime_state_sn_blob(Serialise serialise, const std::string& write_blob) { std::string result; - auto impl = get_impl(serialise == BTSerialise::Write); - if (serialise == BTSerialise::Read) { + auto impl = get_impl(serialise == Serialise::Write); + if (serialise == Serialise::Read) { auto stmt = 
impl->prepared_st("SELECT sn_blob FROM runtime_state LIMIT 1"); auto maybe_result = exec_and_maybe_get(stmt); if (maybe_result) diff --git a/oxenss/storage/database.hpp b/oxenss/storage/database.hpp index b4d18665a..a7437b21c 100644 --- a/oxenss/storage/database.hpp +++ b/oxenss/storage/database.hpp @@ -217,7 +217,7 @@ class Database { std::map get_expiries( const user_pubkey& pubkey, const std::vector& msg_hashes); - std::string runtime_state_sn_blob(BTSerialise serialise, const std::string& write_blob); + std::string runtime_state_sn_blob(Serialise serialise, const std::string& write_blob); }; } // namespace oxenss From 3924d1f8cf5706ca1adce22e3af41812cb5cf3d2 Mon Sep 17 00:00:00 2001 From: doylet Date: Thu, 12 Jun 2025 13:33:38 +1000 Subject: [PATCH 16/50] Remove unused MessageTestStatus --- oxenss/snode/service_node.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/oxenss/snode/service_node.h b/oxenss/snode/service_node.h index fe6f3511e..9f0167c1d 100644 --- a/oxenss/snode/service_node.h +++ b/oxenss/snode/service_node.h @@ -61,9 +61,6 @@ inline constexpr std::array NEW_SWARM_MEMBER_HANDSHAKE_VERSION = {2 class Swarm; -/// WRONG_REQ - request was ignored as not valid (e.g. 
incorrect tester) -enum class MessageTestStatus { SUCCESS, RETRY, ERROR, WRONG_REQ }; - constexpr std::string_view to_string(SnodeStatus status) { switch (status) { case SnodeStatus::UNSTAKED: return "Unstaked"sv; From ad1079460beb400648921913267eaaf07c177908 Mon Sep 17 00:00:00 2001 From: doylet Date: Thu, 12 Jun 2025 15:52:17 +1000 Subject: [PATCH 17/50] Move retryable requests into its own thread and restore pending swarm members routine --- oxenss/rpc/request_handler.cpp | 8 +- oxenss/server/omq.cpp | 3 + oxenss/snode/service_node.cpp | 551 ++++++++++++++++++--------------- oxenss/snode/service_node.h | 19 +- 4 files changed, 329 insertions(+), 252 deletions(-) diff --git a/oxenss/rpc/request_handler.cpp b/oxenss/rpc/request_handler.cpp index 6394fd08c..3f597eae2 100644 --- a/oxenss/rpc/request_handler.cpp +++ b/oxenss/rpc/request_handler.cpp @@ -461,6 +461,9 @@ static void distribute_command(snode::ServiceNode& sn, std::shared_ptrpending += peers.size(); + // When a request to a peer fails, set the initial retry to 1s in the future + constexpr auto default_deadline_delay = 1s; + for (auto& peer : peers) { auto ct = sn.contacts().find(peer.first); if (!ct || !*ct) { @@ -475,6 +478,7 @@ static void distribute_command(snode::ServiceNode& sn, std::shared_ptrretry_nodes.push_back(entry); continue; } @@ -482,7 +486,8 @@ static void distribute_command(snode::ServiceNode& sn, std::shared_ptrrequest( ct->pubkey_x25519.view(), "sn.storage_cc", - [res, peer, peer_ed = ct->pubkey_ed25519, &sn](bool success, auto parts) { + [res, peer, peer_ed = ct->pubkey_ed25519, &sn, default_deadline_delay]( + bool success, auto parts) { json peer_result; SNStorageCCResult store_result = interpret_sn_storage_cc_response_parts(success, parts); @@ -528,6 +533,7 @@ static void distribute_command(snode::ServiceNode& sn, std::shared_ptrretry_nodes.push_back(entry); } else if (res->b64) { if (auto it = peer_result.find("signature"); diff --git a/oxenss/server/omq.cpp b/oxenss/server/omq.cpp 
index e032be1ec..e52ee27c7 100644 --- a/oxenss/server/omq.cpp +++ b/oxenss/server/omq.cpp @@ -318,6 +318,9 @@ OMQ::OMQ( log::debug(logcat,"Received new snode address info from oxend for {}", pk.hex()); service_node_->contacts().update(pk, c); + + // Wake up thread incase there are retryable requests blocked on missing contact info + service_node_->retryable_requests_cv.notify_all(); } catch (const std::exception& e) { log::error(logcat, "Received invalid snode address update from oxend: {}", e.what()); } diff --git a/oxenss/snode/service_node.cpp b/oxenss/snode/service_node.cpp index e8c648e06..25918b7f9 100644 --- a/oxenss/snode/service_node.cpp +++ b/oxenss/snode/service_node.cpp @@ -44,9 +44,9 @@ using MISSING_PUBKEY_THRESHOLD = std::ratio<3, 100>; /// TODO: there should be config.h to store constants like these constexpr auto OXEND_PING_INTERVAL = 30s; -// How often to trigger 'do_backlogged_msg_relay' which checks for 'data ready' handshakes from -// swarm members to distribute missing messages and/or retry recursive swarm requests that failed -constexpr auto DO_BACKLOGGED_MSG_RELAY_INTERVAL = 3s; +// How often to trigger 'check_new_members' which checks for 'data ready' handshakes from +// swarm members and propagate a DB dump if necessary. 
+constexpr auto NEW_SWARM_MEMBER_INTERVAL = 30s; SNSerialiseResult ServiceNode::serialize(Serialise serialise, std::string_view serialized_data) const { @@ -191,7 +191,7 @@ ServiceNode::ServiceNode( }, Database::CLEANUP_PERIOD); - omq_server->add_timer([this] { do_msg_backlog_relay(); }, DO_BACKLOGGED_MSG_RELAY_INTERVAL); + omq_server->add_timer([this] { check_new_members(); }, NEW_SWARM_MEMBER_INTERVAL); // We really want to make sure nodes don't get stuck in "syncing" mode, // so if we are still "syncing" after a long time, activate SN regardless @@ -209,6 +209,8 @@ ServiceNode::ServiceNode( }, 1h); + // Setup the retryable requests thread + retryable_requests_thread = std::thread(&ServiceNode::retryable_requests_thread_entry_point, this); } void ServiceNode::on_oxend_connected() { @@ -330,6 +332,7 @@ static std::optional parse_swarm_update( void ServiceNode::add_retryable_request(RequestRetry&& item) { std::unique_lock lock{retryable_requests_mutex}; retryable_requests.emplace_back(item); + retryable_requests_cv.notify_all(); // Wake up retry thread } void ServiceNode::register_mq_server(server::MQBase* server) { @@ -463,6 +466,8 @@ void ServiceNode::bootstrap_fallback() { void ServiceNode::shutdown() { shutting_down_ = true; + retryable_requests_cv.notify_all(); + retryable_requests_thread.join(); } bool ServiceNode::snode_ready(std::string* reason) { @@ -557,260 +562,71 @@ static LookupRetryIndexes lookup_retry_indexes( return result; } -void ServiceNode::do_msg_backlog_relay() { - auto now = std::chrono::steady_clock::now(); - if (now >= swarm_member_deadline) { - swarm_member_deadline = now + 10s; - for (const auto& pk : swarm_.extract_contact_pending_members()) { - auto c = network_.contacts.find(pk); - if (!c || !*c) { - // We don't have contact info, so don't do anything right now and this will get - // triggered again later. - log::debug( - logcat, - "Leaving {} as pending: node {}", - pk, - c ? 
"has missing contact info" : "is unknown"); - continue; - } - - if (c->version < NEW_SWARM_MEMBER_HANDSHAKE_VERSION) { - log::debug( - logcat, - "Skipping handshake with new swarm member {}: v{}+ required, remote is v{}", - pk, - fmt::join(NEW_SWARM_MEMBER_HANDSHAKE_VERSION, "."), - fmt::join(c->version, ".")); - swarm_.set_member_contact_details_ready(pk, std::nullopt); - continue; - } - - log::debug(logcat, "Initiating contact with new swarm member {}", pk); - omq_server_->request( - c->pubkey_x25519.view(), - "sn.data_ready", - [this, pk](bool success, std::vector data) { - server::SNDataReadyResponse response = {}; - BTSerialiseResult read_result = {}; - if (data.empty()) { - read_result.read_error = "Empty reply"; - } else { - read_result = server::sn_data_ready_response_serialise( - response, Serialise::Read, data[0]); - } - - if (!read_result.success) { - log::info( - logcat, - "Failed to connect to remote SS {} to initiate new " - "data transfer ({}: {}); will retry soon", - pk, - fmt::join(data, ", "), - read_result.read_error); - return; - } - log::debug( - logcat, - "Successful contact made with swarm member {}, marking as ready", - pk); - swarm_.set_member_contact_details_ready(pk, response.newest_timestamp); - }); - } - - if (auto send_now = swarm_.extract_contacts_needing_db_dump(); !send_now.empty()) { - auto msgs = db->retrieve_all(); +void ServiceNode::check_new_members() { + for (const auto& pk : swarm_.extract_contact_pending_members()) { + auto c = network_.contacts.find(pk); + if (!c || !*c) { + // We don't have contact info, so don't do anything right now and this will get + // triggered again later. 
log::debug( logcat, - "Initiating swarm message dump ({} message) to new swarm member(s): {}", - msgs.size(), - fmt::join(send_now, ", ")); - relay_messages(std::move(msgs), send_now); - } - } - - // Retry failed/timed-out requests - std::unique_lock lock{retryable_requests_mutex}; - if (log::Level level = log::Level::debug; - log::get_level(logcat) <= level && retryable_requests.size()) { - - fmt::memory_buffer log_buffer; - fmt::format_to( - std::back_inserter(log_buffer), - "Attempting {} retryable requests\n", - retryable_requests.size()); - - for (size_t index = 0; index < retryable_requests.size(); index++) { - const auto& item = retryable_requests[index]; - fmt::format_to( - std::back_inserter(log_buffer), - "{} [{}] '{}' command {} to {} node(s){}", - index ? "\n" : "", - index, - item.cmd, - util::get_human_readable_bytes(item.req_payload.size()), - item.nodes.size(), - item.nodes.size() ? "\n NODES" : ""); - - for (size_t node_index = 0; node_index < item.nodes.size(); node_index++) { - const auto& node_item = item.nodes[node_index]; - std::string_view reason = ""; - switch (node_item.reason) { - case RetryReason::NON_CONTACTABLE: reason = "non-contactable"; break; - case RetryReason::FAILED_TO_SEND: reason = "failed to send"; break; - } - - std::string deadline = "now"; - if (node_item.deadline >= now) { - auto delta = node_item.deadline - now; - deadline = - "in {}"_format(std::chrono::duration_cast(delta)); - } - - fmt::format_to( - std::back_inserter(log_buffer), - "\n {}: {} ({}) retrying {}", - index, - node_item.key, - reason, - deadline); - } + "Leaving {} as pending: node {}", + pk, + c ? "has missing contact info" : "is unknown"); + continue; } - log::log(logcat, level, "{}", fmt::to_string(log_buffer)); - } - - for (auto it = retryable_requests.begin(); it != retryable_requests.end();) { - - // Create a hash of the inputs so that we can match dispatched requests easily with the - // originating retry item. 
- if (it->hash == 0) { - it->hash = FNV1A64_SEED; - it->hash = fnv1a64_hasher(it->cmd, it->hash); - it->hash = fnv1a64_hasher(it->req_payload, it->hash); + if (c->version < NEW_SWARM_MEMBER_HANDSHAKE_VERSION) { + log::debug( + logcat, + "Skipping handshake with new swarm member {}: v{}+ required, remote is v{}", + pk, + fmt::join(NEW_SWARM_MEMBER_HANDSHAKE_VERSION, "."), + fmt::join(c->version, ".")); + swarm_.set_member_contact_details_ready(pk, std::nullopt); + continue; } - for (auto node_it = it->nodes.begin(); node_it != it->nodes.end();) { - auto on_request_done = [this, hash = it->hash, key = node_it->key]( - bool success, std::vector parts) { - - std::unique_lock lock{retryable_requests_mutex}; - - // Lookup the originating retry-request responsible for this OMQ response - LookupRetryIndexes lookup = lookup_retry_indexes(retryable_requests, hash, key); - if (!lookup.retryable_index) - return; - - RequestRetry& request = retryable_requests[*lookup.retryable_index]; - if (lookup.node_index) { - RequestRetryEntry& node = request.nodes[*lookup.node_index]; - - // We cleanup the request in all situations except timeout (timeout - // indicating that the node was non-responsive, maybe offline). In an error - // state we don't know what state the recipient's storage server is in and - // we default to deleting it and ending the retry attempts. 
- rpc::SNStorageCCResult store_result = - rpc::interpret_sn_storage_cc_response_parts(success, parts); - bool cleanup = store_result.status != rpc::SNStorageCCResultStatus::Timeout; - - if (cleanup) { - std::string_view outcome = "succeeded"; - if (store_result.status != rpc::SNStorageCCResultStatus::Good) - outcome = "failed unrecoverably"; - - log::trace( - logcat, - "Retry to {} for {} ({}) {}, cleaning up", - key, - request.cmd, - util::get_human_readable_bytes(request.req_payload.size()), - outcome); - - request.nodes.erase(request.nodes.begin() + *lookup.node_index); + log::debug(logcat, "Initiating contact with new swarm member {}", pk); + omq_server_->request( + c->pubkey_x25519.view(), + "sn.data_ready", + [this, pk](bool success, std::vector data) { + server::SNDataReadyResponse response = {}; + BTSerialiseResult read_result = {}; + if (data.empty()) { + read_result.read_error = "Empty reply"; } else { - // Extend the next retry deadline and re-attempt later - float& delay_coeff = node.deadline_delay_coeff; - delay_coeff = std::max(delay_coeff, 1.f) * 1.3f; - delay_coeff = std::min(delay_coeff, 2.f); - - auto init_delay = std::chrono::duration_cast( - DO_BACKLOGGED_MSG_RELAY_INTERVAL); - auto delay = std::chrono::seconds( - static_cast(init_delay.count() * delay_coeff)); - - node.deadline = std::chrono::steady_clock::now() + delay; - log::trace( - logcat, - "Retry to {} for {} ({}) timed out, next attempt in ~{}", - key, - request.cmd, - util::get_human_readable_bytes(request.req_payload.size()), - delay); + read_result = server::sn_data_ready_response_serialise( + response, Serialise::Read, data[0]); } - } - // Remove retryable request if there are no more nodes to retry to - if (request.nodes.empty()) - retryable_requests.erase(retryable_requests.begin() + *lookup.retryable_index); - }; - - std::optional is_member = swarm_.is_member(node_it->key); - if (is_member) { - // Retry request if ready - bool is_due = now >= node_it->deadline; - bool ready = - 
(is_member->status == SwarmMemberStatus::ContactDetailsReady || - is_member->status == SwarmMemberStatus::Ready); - crypto::x25519_pubkey pubkey_x25519 = {}; - - if (ready && is_due) { - auto ct = contacts().find(node_it->key); - if (ct && *ct) - pubkey_x25519 = ct->pubkey_x25519; - } - - if (ready && is_due && pubkey_x25519) { - omq_server()->request( - pubkey_x25519.view(), - "sn.storage_cc", - on_request_done, - it->cmd, - it->req_payload, - oxenmq::send_option::request_timeout{5s}); - } - - if (!ready) { // Separate logging from logic for code clarity - log::trace( - logcat, - "Retry to {} ({}) deferred, member hasn't signaled 'data ready' (was " - "{})", - node_it->key, - it->cmd, - static_cast(is_member->status)); - } else if (!pubkey_x25519) { - log::trace( + if (!read_result.success) { + log::info( + logcat, + "Failed to connect to remote SS {} to initiate new " + "data transfer ({}: {}); will retry soon", + pk, + fmt::join(data, ", "), + read_result.read_error); + return; + } + log::debug( logcat, - "Retry to {} ({}) deferred, contact info missing", - node_it->key, - it->cmd); - } - } - - if (is_member) { - node_it++; - } else { - log::trace( - logcat, - "Retry to {} ({}) cancelled, not a member in swarm anymore", - node_it->key, - it->cmd); - node_it = it->nodes.erase(node_it); - } - } + "Successful contact made with swarm member {}, marking as ready", + pk); + swarm_.set_member_contact_details_ready(pk, response.newest_timestamp); + }); + } - if (it->nodes.empty()) - it = retryable_requests.erase(it); - else - it++; + if (auto send_now = swarm_.extract_contacts_needing_db_dump(); !send_now.empty()) { + auto msgs = db->retrieve_all(); + log::debug( + logcat, + "Initiating swarm message dump ({} message) to new swarm member(s): {}", + msgs.size(), + fmt::join(send_now, ", ")); + relay_messages(std::move(msgs), send_now); } } @@ -1591,4 +1407,245 @@ void ServiceNode::process_push_batch(std::string_view blob, std::string_view sen log::trace(logcat, "Saving 
all: end"); } +void ServiceNode::retryable_requests_thread_entry_point() { + // The min and max amount of time this node will backoff between failed retry requests + constexpr auto MIN_RETRY_DELAY = 1s; + constexpr auto MAX_RETRY_DELAY = 60s; + constexpr auto RETRY_BACKOFF_COEFF = 1.75f; + + while (!shutting_down_) { + // At longest, we timeout on the blocking sleep every 5s, or, as soon as someone wakes up + // the thread by notifying the condition var + // - when a new retryable request is added + // - we're shutting down + // - or we know there's an earlier deadline in the list of requests to be retried + // - a node's contact detail was updated + // - a retryable request failed and a new deadline was posted + auto earliest_deadline = std::chrono::steady_clock::now() + 5s; + + std::unique_lock lock{retryable_requests_mutex}; + retryable_requests_cv.wait_until(lock, earliest_deadline); + + if (shutting_down_) + continue; + + // Log the current retries + auto now = std::chrono::steady_clock::now(); + if (log::Level level = log::Level::debug; + log::get_level(logcat) <= level && retryable_requests.size()) { + + size_t due_requests = 0; + size_t total_requests = 0; + fmt::memory_buffer trace_buffer; + for (size_t index = 0; index < retryable_requests.size(); index++) { + const auto& item = retryable_requests[index]; + if (log::get_level(logcat) <= log::Level::trace) { + fmt::format_to( + std::back_inserter(trace_buffer), + "{} [{}] '{}' command {} to {} node(s)", + index ? 
"\n" : "", + index, + item.cmd, + util::get_human_readable_bytes(item.req_payload.size()), + item.nodes.size()); + } + + for (size_t node_index = 0; node_index < item.nodes.size(); node_index++) { + const auto& node_item = item.nodes[node_index]; + bool is_due = now>= node_item.deadline; + due_requests += is_due; + + if (log::get_level(logcat) <= log::Level::trace) { + if (node_index == 0) + fmt::format_to(std::back_inserter(trace_buffer), "\n NODES"); + + std::string_view reason = ""; + switch (node_item.reason) { + case RetryReason::NON_CONTACTABLE: reason = "non-contactable"; break; + case RetryReason::FAILED_TO_SEND: reason = "failed to send"; break; + } + + std::string deadline = "now"; + if (!is_due) { + auto delta = node_item.deadline - now; + deadline = "in {}"_format( + std::chrono::duration_cast(delta)); + } + + fmt::format_to( + std::back_inserter(trace_buffer), + "\n {}: {} ({}) retrying {}", + index, + node_item.key, + reason, + deadline); + } + } + + total_requests += item.nodes.size(); + } + + log::log( + logcat, + level, + "Attempting {}/{} retryable requests", + due_requests, + total_requests); + + if (log::get_level(logcat) <= log::Level::trace) + log::trace(logcat, "Retryables:\n{}", fmt::to_string(trace_buffer)); + } + + for (auto it = retryable_requests.begin(); it != retryable_requests.end();) { + // Create a hash of the inputs so that we can match dispatched requests easily with the + // originating retry item. 
+ if (it->hash == 0) { + it->hash = FNV1A64_SEED; + it->hash = fnv1a64_hasher(it->cmd, it->hash); + it->hash = fnv1a64_hasher(it->req_payload, it->hash); + } + + for (auto node_it = it->nodes.begin(); node_it != it->nodes.end();) { + auto on_request_done = [MIN_RETRY_DELAY, + MAX_RETRY_DELAY, + RETRY_BACKOFF_COEFF, + this, + hash = it->hash, + key = node_it->key]( + bool success, std::vector parts) { + std::unique_lock lock{retryable_requests_mutex}; + + // Lookup the originating retry-request responsible for this OMQ response + LookupRetryIndexes lookup = lookup_retry_indexes(retryable_requests, hash, key); + if (!lookup.retryable_index) + return; + + RequestRetry& request = retryable_requests[*lookup.retryable_index]; + if (lookup.node_index) { + RequestRetryEntry& node = request.nodes[*lookup.node_index]; + node.retry_underway = false; + + // We cleanup the request in all situations except timeout (timeout + // indicating that the node was non-responsive, maybe offline). In an error + // state we don't know what state the recipient's storage server is in and + // we default to deleting it and ending the retry attempts. 
+ rpc::SNStorageCCResult store_result = + rpc::interpret_sn_storage_cc_response_parts(success, parts); + bool cleanup = store_result.status != rpc::SNStorageCCResultStatus::Timeout; + + if (cleanup) { + std::string_view outcome = "succeeded"; + if (store_result.status != rpc::SNStorageCCResultStatus::Good) + outcome = "failed unrecoverably"; + + log::debug( + logcat, + "Retry to {} for {} ({}) {}, cleaning up", + key, + request.cmd, + util::get_human_readable_bytes(request.req_payload.size()), + outcome); + + request.nodes.erase(request.nodes.begin() + *lookup.node_index); + } else { + // Extend the next retry deadline and re-attempt later + node.next_retry_delay = std::max( + node.next_retry_delay, + std::chrono::milliseconds(MIN_RETRY_DELAY)); + + size_t delay_ms = std::chrono::duration_cast( + node.next_retry_delay) + .count(); + delay_ms *= RETRY_BACKOFF_COEFF; + node.next_retry_delay = std::min( + std::chrono::milliseconds(delay_ms), + std::chrono::milliseconds(MAX_RETRY_DELAY)); + node.deadline = std::chrono::steady_clock::now() + node.next_retry_delay; + + // Wake up retryable request thread, it will take into consideration the + // new deadline for the blocking sleep + retryable_requests_cv.notify_all(); + + log::debug( + logcat, + "Retry to {} for {} ({}) timed out, next attempt in ~{}", + key, + request.cmd, + util::get_human_readable_bytes(request.req_payload.size()), + node.next_retry_delay); + } + } + + // Remove retryable request if there are no more nodes to retry to + if (request.nodes.empty()) + retryable_requests.erase( + retryable_requests.begin() + *lookup.retryable_index); + }; + + std::optional is_member = swarm_.is_member(node_it->key); + if (is_member && !node_it->retry_underway) { + // Retry request if ready + bool is_due = now >= node_it->deadline; + bool ready = + (is_member->status == SwarmMemberStatus::ContactDetailsReady || + is_member->status == SwarmMemberStatus::Ready); + crypto::x25519_pubkey pubkey_x25519 = {}; + + if (ready) { + 
auto ct = contacts().find(node_it->key); + if (ct && *ct) + pubkey_x25519 = ct->pubkey_x25519; + } + + if (pubkey_x25519) { + if (is_due) { + node_it->retry_underway = true; + omq_server()->request( + pubkey_x25519.view(), + "sn.storage_cc", + on_request_done, + it->cmd, + it->req_payload, + oxenmq::send_option::request_timeout{5s}); + } else { + earliest_deadline = std::min(earliest_deadline, node_it->deadline); + } + } + + if (!ready) { + log::debug( + logcat, + "Retry to {} ({}) deferred, member hasn't signaled 'data ready' " + "(was {})", + node_it->key, + it->cmd, + static_cast(is_member->status)); + } else if (!pubkey_x25519) { + log::debug( + logcat, + "Retry to {} ({}) deferred, contact info missing", + node_it->key, + it->cmd); + } + } + + if (is_member) { + node_it++; + } else { + log::debug( + logcat, + "Retry to {} ({}) cancelled, not a member in swarm anymore", + node_it->key, + it->cmd); + node_it = it->nodes.erase(node_it); + } + } + + if (it->nodes.empty()) + it = retryable_requests.erase(it); + else + it++; + } + } +} } // namespace oxenss::snode diff --git a/oxenss/snode/service_node.h b/oxenss/snode/service_node.h index 9f0167c1d..ceef7714d 100644 --- a/oxenss/snode/service_node.h +++ b/oxenss/snode/service_node.h @@ -86,8 +86,9 @@ enum class RetryReason { struct RequestRetryEntry { crypto::legacy_pubkey key; RetryReason reason; + bool retry_underway; std::chrono::steady_clock::time_point deadline; - float deadline_delay_coeff; + std::chrono::milliseconds next_retry_delay; }; struct RequestRetry { @@ -137,11 +138,17 @@ class ServiceNode { mutable std::recursive_mutex sn_mutex_; + // Lock to be taken when interacting with the 'retryable_requests' queue std::mutex retryable_requests_mutex; + // List of requests that will be re-attempted periodically through the + // 'retryable_requests_thread' std::vector retryable_requests; - std::chrono::steady_clock::time_point swarm_member_deadline = {}; + std::thread retryable_requests_thread; + + // The time 
point at which the next swarm member check should be executed + std::chrono::steady_clock::time_point swarm_member_check_deadline = {}; void send_notifies(message m); @@ -155,7 +162,7 @@ class ServiceNode { void on_snodes_update(block_update&& bu); // Called periodically to attempt to initiate transfers to new snode members - void do_msg_backlog_relay(); + void check_new_members(); // Called if our oxend looks like it is missing lots of records when we first get data from it // to load initial data (especially contact info) from the bootstrap nodes. @@ -220,7 +227,7 @@ class ServiceNode { const contact& own_address() { return our_contact_; } - // Enqueue a request to be re-attempted every 'DO_BACKLOGGED_MSG_RELAY_INTERVAL' intervals. + // Enqueue a request to be re-attempted void add_retryable_request(RequestRetry&& item); // Adds a MQ server, i.e. QUIC. The OMQ server is added automatically during construction and @@ -312,6 +319,10 @@ class ServiceNode { void update_swarms(std::promise* on_completion = nullptr); server::OMQ& omq_server() { return omq_server_; } + + std::condition_variable retryable_requests_cv; + + void retryable_requests_thread_entry_point(); }; } // namespace oxenss::snode From c1ee07843f13e0a5971aef68736c85c2d70f1715 Mon Sep 17 00:00:00 2001 From: doylet Date: Fri, 13 Jun 2025 13:37:59 +1000 Subject: [PATCH 18/50] Revert SN data ready response, timestamp not needed --- oxenss/server/omq.cpp | 70 ++++------------------------------- oxenss/server/omq.h | 17 --------- oxenss/snode/service_node.cpp | 22 +++++------ oxenss/snode/swarm.cpp | 10 +---- oxenss/snode/swarm.h | 4 +- oxenss/storage/database.cpp | 15 -------- oxenss/storage/database.hpp | 3 -- 7 files changed, 20 insertions(+), 121 deletions(-) diff --git a/oxenss/server/omq.cpp b/oxenss/server/omq.cpp index e52ee27c7..7420cf33d 100644 --- a/oxenss/server/omq.cpp +++ b/oxenss/server/omq.cpp @@ -29,48 +29,6 @@ namespace oxenss::server { static auto logcat = log::Cat("server"); 
-BTSerialiseResult sn_data_ready_response_serialise( - SNDataReadyResponse& item, Serialise serialise, std::string_view serialized_data) { - - BTSerialiseResult result = {}; - - constexpr std::string_view STATUS_KEY = "s"; - constexpr std::string_view TIMESTAMP_KEY = "t"; - if (serialise == Serialise::Write) { - assert(serialized_data.empty()); - oxenc::bt_dict_producer dict; - dict.append(STATUS_KEY, static_cast(item.status)); - dict.append(TIMESTAMP_KEY, item.newest_timestamp.count()); - result.write_payload = dict.view(); - result.success = true; - } else { - oxenc::bt_dict_consumer d{serialized_data}; - SNDataReadyResponse response = {}; - try { - uint32_t status_u32 = d.require(STATUS_KEY); - uint32_t last = static_cast(SNDataReadyStatus::Count); - if (status_u32 >= last) - result.read_error = "SN data ready status was OOB (received {})"_format(last); - else - response.status = static_cast(status_u32); - } catch (const std::exception& e) { - result.read_error = "SN data ready status was not a 4 byte unsigned integer"; - } - - try { - uint64_t newest_timestamp = d.require(TIMESTAMP_KEY); - response.newest_timestamp = std::chrono::milliseconds(newest_timestamp); - } catch (const std::exception& e) { - result.read_error = "SN data ready timestamp was not an 8 byte unsigned integer"; - } - - result.success = result.read_error.empty(); - if (result.success) - item = std::move(response); - } - return result; -} - std::string OMQ::peer_lookup(std::string_view pubkey_bin) const { log::trace(logcat, "[OMQ] Peer Lookup"); @@ -89,28 +47,16 @@ std::string OMQ::peer_lookup(std::string_view pubkey_bin) const { void OMQ::handle_sn_data_ready(oxenmq::Message& message) { log::debug(logcat, "[OMQ] handle sn.data_ready from: {}", message.conn.to_string()); - SNDataReadyResponse response = {}; - if (response.status == SNDataReadyStatus::Nil) { - auto& xpk_str = message.conn.pubkey(); - if (xpk_str.size() != sizeof(crypto::x25519_pubkey)) { - response.status = 
SNDataReadyStatus::RemoteNotRecognizedAsSN; - } else { - crypto::x25519_pubkey xpk; - std::memcpy(xpk.data(), xpk_str.data(), sizeof(crypto::x25519_pubkey)); - if (!service_node_->is_swarm_peer(xpk)) - response.status = SNDataReadyStatus::SwarmMismatch; - } - } + auto& xpk_str = message.conn.pubkey(); + if (xpk_str.size() != sizeof(crypto::x25519_pubkey)) + return message.send_reply("Remote not recognized as SN"); - if (response.status == SNDataReadyStatus::Nil) { - response.status = SNDataReadyStatus::OK; - response.newest_timestamp = service_node_->db->retrieve_newest_timestamp(); - } + crypto::x25519_pubkey xpk; + std::memcpy(xpk.data(), xpk_str.data(), sizeof(crypto::x25519_pubkey)); + if (!service_node_->is_swarm_peer(xpk)) + return message.send_reply("Swarm mismatch"); - BTSerialiseResult write_result = - sn_data_ready_response_serialise(response, Serialise::Write, ""); - assert(write_result.success); - message.send_reply(write_result.write_payload); + message.send_reply("OK"); } void OMQ::handle_sn_data(oxenmq::Message& message) { diff --git a/oxenss/server/omq.h b/oxenss/server/omq.h index a3321ac67..7e3e947dc 100644 --- a/oxenss/server/omq.h +++ b/oxenss/server/omq.h @@ -28,23 +28,6 @@ namespace snode { } // namespace oxenss namespace oxenss::server { - -enum class SNDataReadyStatus { - Nil, - RemoteNotRecognizedAsSN, - SwarmMismatch, - OK, - Count, -}; - -struct SNDataReadyResponse { - SNDataReadyStatus status; - std::chrono::milliseconds newest_timestamp; -}; - -BTSerialiseResult sn_data_ready_response_serialise( - server::SNDataReadyResponse& item, Serialise serialise, std::string_view serialized_data); - class OMQ : public MQBase { oxenmq::OxenMQ omq_; oxenmq::ConnectionID oxend_conn_; diff --git a/oxenss/snode/service_node.cpp b/oxenss/snode/service_node.cpp index 25918b7f9..3fc1b4c11 100644 --- a/oxenss/snode/service_node.cpp +++ b/oxenss/snode/service_node.cpp @@ -583,7 +583,7 @@ void ServiceNode::check_new_members() { pk, 
fmt::join(NEW_SWARM_MEMBER_HANDSHAKE_VERSION, "."), fmt::join(c->version, ".")); - swarm_.set_member_contact_details_ready(pk, std::nullopt); + swarm_.set_member_contact_details_ready(pk); continue; } @@ -592,30 +592,26 @@ void ServiceNode::check_new_members() { c->pubkey_x25519.view(), "sn.data_ready", [this, pk](bool success, std::vector data) { - server::SNDataReadyResponse response = {}; - BTSerialiseResult read_result = {}; if (data.empty()) { - read_result.read_error = "Empty reply"; - } else { - read_result = server::sn_data_ready_response_serialise( - response, Serialise::Read, data[0]); + success = false; + data.push_back("Empty reply"s); + } else if (data[0] != "OK"sv) { + success = false; } - - if (!read_result.success) { + if (!success) { log::info( logcat, "Failed to connect to remote SS {} to initiate new " - "data transfer ({}: {}); will retry soon", + "data transfer ({}); will retry soon", pk, - fmt::join(data, ", "), - read_result.read_error); + fmt::join(data, ", ")); return; } log::debug( logcat, "Successful contact made with swarm member {}, marking as ready", pk); - swarm_.set_member_contact_details_ready(pk, response.newest_timestamp); + swarm_.set_member_contact_details_ready(pk); }); } diff --git a/oxenss/snode/swarm.cpp b/oxenss/snode/swarm.cpp index 15bb6cb1b..db536e067 100644 --- a/oxenss/snode/swarm.cpp +++ b/oxenss/snode/swarm.cpp @@ -262,17 +262,11 @@ std::set Swarm::extract_contacts_needing_db_dump() { return result; } -void Swarm::set_member_contact_details_ready( - const crypto::legacy_pubkey& pk, std::optional last_synced_ts) { +void Swarm::set_member_contact_details_ready(const crypto::legacy_pubkey& pk) { std::lock_guard lock{network.mut_}; - auto it = members_.find(pk); assert(it != members_.end()); - - if (it != members_.end()) { + if (it != members_.end()) it->second.status = SwarmMemberStatus::ContactDetailsReady; - if (last_synced_ts) - it->second.newest_msg_timestamp = *last_synced_ts; - } } } // namespace oxenss::snode 
diff --git a/oxenss/snode/swarm.h b/oxenss/snode/swarm.h index cdfdca3b6..1fa0cafd6 100644 --- a/oxenss/snode/swarm.h +++ b/oxenss/snode/swarm.h @@ -112,9 +112,7 @@ class Swarm { // Marks a pending member as ready, so that it is returned by the next call to // `extract_contact_details_ready_members()`, and is no longer returned by // `extract_contract_details_pending_member()`. - void set_member_contact_details_ready( - const crypto::legacy_pubkey& pk, - std::optional last_synced_ts); + void set_member_contact_details_ready(const crypto::legacy_pubkey& pk); swarm_id_t our_swarm_id() const { std::shared_lock lock{network.mut_}; diff --git a/oxenss/storage/database.cpp b/oxenss/storage/database.cpp index bfbff5453..f665f9cb1 100644 --- a/oxenss/storage/database.cpp +++ b/oxenss/storage/database.cpp @@ -672,21 +672,6 @@ std::optional Database::retrieve_by_hash(const std::string& msg_hash) { return get_message(*impl, st); } -std::chrono::milliseconds Database::retrieve_newest_timestamp() { - auto impl = get_impl(false); - auto st = impl->prepared_st( - "SELECT COALESCE((SELECT timestamp FROM owned_messages ORDER BY timestamp DESC LIMIT " - "1), 0);"); - std::chrono::milliseconds result = {}; - while (st->executeStep()) { - int64_t time = get(st); - assert(time >= 0); - if (time >= 0) - result = std::chrono::milliseconds(static_cast(time)); - } - return result; -} - StoreResult Database::store(const message& msg, std::chrono::system_clock::time_point* expiry) { auto impl = get_impl(true); diff --git a/oxenss/storage/database.hpp b/oxenss/storage/database.hpp index a7437b21c..7c532bc81 100644 --- a/oxenss/storage/database.hpp +++ b/oxenss/storage/database.hpp @@ -132,9 +132,6 @@ class Database { // pubkey or namespace! std::optional retrieve_by_hash(const std::string& msg_hash); - // Get the latest owned message's timestamp. 
Returns 0 if there are no messages in the DB - std::chrono::milliseconds retrieve_newest_timestamp(); - // Removes expired messages from the database; the `Database` instance owner should call // this periodically. void clean_expired(); From 936c81039b2b639c57fc2fdbc48c22ab1acb7bba Mon Sep 17 00:00:00 2001 From: doylet Date: Mon, 16 Jun 2025 11:19:58 +1000 Subject: [PATCH 19/50] Serialise retryable requests to the DB --- oxenss/common/serialize.h | 2 +- oxenss/rpc/request_handler.cpp | 1 + oxenss/snode/service_node.cpp | 359 +++++++++++++++++++++++++++------ oxenss/snode/service_node.h | 32 +-- oxenss/storage/database.cpp | 18 +- oxenss/storage/database.hpp | 7 +- 6 files changed, 335 insertions(+), 84 deletions(-) diff --git a/oxenss/common/serialize.h b/oxenss/common/serialize.h index cd1496d86..17925204f 100644 --- a/oxenss/common/serialize.h +++ b/oxenss/common/serialize.h @@ -10,7 +10,7 @@ enum class Serialise { Write, }; -struct BTSerialiseResult { +struct SerialiseBTResult { bool success; std::string write_payload; std::string read_error; diff --git a/oxenss/rpc/request_handler.cpp b/oxenss/rpc/request_handler.cpp index 3f597eae2..acf9642db 100644 --- a/oxenss/rpc/request_handler.cpp +++ b/oxenss/rpc/request_handler.cpp @@ -432,6 +432,7 @@ static void reply_or_fail(snode::ServiceNode& sn, const std::shared_ptrretry_nodes); retry.cmd = res->cmd; retry.req_payload = std::move(res->req_payload); + retry.create_time = std::chrono::steady_clock::now(); sn.add_retryable_request(std::move(retry)); } } diff --git a/oxenss/snode/service_node.cpp b/oxenss/snode/service_node.cpp index 3fc1b4c11..58f0f4373 100644 --- a/oxenss/snode/service_node.cpp +++ b/oxenss/snode/service_node.cpp @@ -48,9 +48,190 @@ constexpr auto OXEND_PING_INTERVAL = 30s; // swarm members and propagate a DB dump if necessary. 
constexpr auto NEW_SWARM_MEMBER_INTERVAL = 30s; -SNSerialiseResult ServiceNode::serialize(Serialise serialise, std::string_view serialized_data) const +static SerialiseRetryableRequestsResult serialize_retryable_requests( + Serialise serialise, std::string_view read_data, std::span write_data) { + SerialiseRetryableRequestsResult result = {}; + uint32_t version = 0; + + constexpr std::string_view VERSION_KEY = "@"; + constexpr std::string_view RETRYABLE_REQUESTS_KEY = "r"; + assert(VERSION_KEY < RETRYABLE_REQUESTS_KEY); + + // Retryable request keys + constexpr std::string_view COMMAND_KEY = "c"; + constexpr std::string_view REQ_PAYLOAD_KEY = "r"; + constexpr std::string_view CREATE_TIME_KEY = "t"; + constexpr std::string_view NODES_KEY = "u"; + assert(COMMAND_KEY < CREATE_TIME_KEY); + assert(REQ_PAYLOAD_KEY < CREATE_TIME_KEY); + assert(CREATE_TIME_KEY < NODES_KEY); + + // Retrayble request entry keys + constexpr std::string_view KEY_KEY = "i"; + constexpr std::string_view DEADLINE_KEY = "l"; + constexpr std::string_view NEXT_RETRY_DELAY_KEY = "n"; + constexpr std::string_view REASON_KEY = "r"; + assert(KEY_KEY < DEADLINE_KEY); + assert(DEADLINE_KEY < NEXT_RETRY_DELAY_KEY); + assert(NEXT_RETRY_DELAY_KEY < REASON_KEY); + + if (serialise == Serialise::Write) { + oxenc::bt_dict_producer d; + d.append(VERSION_KEY, version); + + oxenc::bt_list_producer retry_list = d.append_list(RETRYABLE_REQUESTS_KEY); + for (const auto& it : write_data) { + oxenc::bt_dict_producer retry_dict = retry_list.append_dict(); + retry_dict.append(COMMAND_KEY, it.cmd); + retry_dict.append(REQ_PAYLOAD_KEY, it.req_payload); + uint64_t create_time_u64 = std::chrono::duration_cast( + it.create_time.time_since_epoch()) + .count(); + retry_dict.append(CREATE_TIME_KEY, create_time_u64); + oxenc::bt_list_producer node_list = retry_dict.append_list(NODES_KEY); + for (const auto& node_it : it.nodes) { + oxenc::bt_dict_producer node_dict = node_list.append_dict(); + uint32_t reason_u32 = 
static_cast(node_it.reason); + uint64_t deadline_u64 = std::chrono::duration_cast( + node_it.deadline.time_since_epoch()) + .count(); + uint64_t next_retry_delay_u64 = node_it.next_retry_delay.count(); + node_dict.append(KEY_KEY, node_it.key); + node_dict.append(DEADLINE_KEY, deadline_u64); + node_dict.append(NEXT_RETRY_DELAY_KEY, next_retry_delay_u64); + node_dict.append(REASON_KEY, reason_u32); + } + } + + result.bt.success = true; + result.bt.write_payload = d.view(); + } else { + if (read_data.size()) { + oxenc::bt_dict_consumer d{read_data}; + try { + version = d.require(VERSION_KEY); + } catch (const std::exception& e) { + result.bt.read_error = + "Failed to parse retryable request version: {}"_format(e.what()); + } + + if (version != 0) + result.bt.read_error = + "Unrecognised retryable request version: {}, skipping"_format(version); + + if (result.bt.read_error.empty()) { + // Initially a dummy list that we will std::move the real list into + oxenc::bt_list_consumer retry_list("l"); + try { + auto [key, list] = d.next_list_consumer(); + assert(key == RETRYABLE_REQUESTS_KEY); + retry_list = std::move(list); + } catch (const std::exception& e) { + result.bt.read_error = + "Failed to read retryable request list: {}"_format(e.what()); + } + + while (result.bt.read_error.empty() && !retry_list.is_finished()) { + auto request_dict = retry_list.consume_dict_consumer(); + + RequestRetry request = {}; + try { + request.cmd = request_dict.require(COMMAND_KEY); + } catch (const std::exception& e) { + result.bt.read_error = + "Failed to read retryable request command: {}"_format(e.what()); + continue; + } + + try { + request.req_payload = request_dict.require(REQ_PAYLOAD_KEY); + } catch (const std::exception& e) { + result.bt.read_error = + "Failed to read retryable request, request payload: {}"_format( + e.what()); + continue; + } + + try { + uint64_t create_time_u64 = request_dict.require(CREATE_TIME_KEY); + request.create_time = 
std::chrono::steady_clock::time_point( + std::chrono::milliseconds(create_time_u64)); + } catch (const std::exception& e) { + result.bt.read_error = + "Failed to read retryable request, create time: {}"_format(e.what()); + continue; + } + + oxenc::bt_list_consumer node_list("l"); // Dummy list + try { + auto [key, list] = request_dict.next_list_consumer(); + assert(key == NODES_KEY); + node_list = std::move(list); + } catch (const std::exception& e) { + result.bt.read_error = + "Failed to read retryable request, node list: {}"_format(e.what()); + continue; + } + + while (result.bt.read_error.empty() && !node_list.is_finished()) { + auto node_dict = node_list.consume_dict_consumer(); + RequestRetryEntry node = {}; + try { + std::string_view key_bytes = + node_dict.require(KEY_KEY); + node.key = crypto::legacy_pubkey::from_bytes(key_bytes); + } catch (const std::exception& e) { + result.bt.read_error = + "Failed to parse retryable request node key: {}"_format( + e.what()); + continue; + } + + try { + uint64_t deadline_u64 = node_dict.require(DEADLINE_KEY); + node.deadline = std::chrono::steady_clock::time_point( + std::chrono::milliseconds(deadline_u64)); + } catch (const std::exception& e) { + result.bt.read_error = + "Failed to parse retryable request node deadline: {}"_format( + e.what()); + continue; + } + + try { + uint64_t next_retry_delay_u64 = + node_dict.require(NEXT_RETRY_DELAY_KEY); + node.next_retry_delay = std::chrono::milliseconds(next_retry_delay_u64); + } catch (const std::exception& e) { + result.bt.read_error = + "Failed to parse retryable request next retry delay: {}"_format( + e.what()); + continue; + } + + try { + uint32_t reason_u32 = node_dict.require(REASON_KEY); + node.reason = static_cast(reason_u32); + } catch (const std::exception& e) { + result.bt.read_error = + "Failed to parse retryable request reason {}"_format(e.what()); + continue; + } + + request.nodes.emplace_back(std::move(node)); + } + 
result.retryable_requests.emplace_back(std::move(request)); + } + } + } + result.bt.success = result.bt.read_error.empty(); + } + return result; +} + +SerialiseSwarmsResult ServiceNode::serialize_swarms(Serialise serialise, std::string_view read_data) const { - SNSerialiseResult result = {}; + SerialiseSwarmsResult result = {}; constexpr std::string_view VERSION_KEY = "@"; constexpr std::string_view NETWORK_SWARMS_KEY = "network.swarms"; @@ -77,24 +258,24 @@ SNSerialiseResult ServiceNode::serialize(Serialise serialise, std::string_view s d.append(SWARM_CUR_SWARM_ID, swarm_.cur_swarm_id_); - { // Append list of _our_ swarm members + { // Append list of _our_ swarm members oxenc::bt_list_producer swarm_member_list = d.append_list(SWARM_MEMBERS_KEY); for (auto it : swarm_.members_) swarm_member_list.append(it.first); // pk } - result.bt_serialise.success = true; - result.bt_serialise.write_payload = d.view(); + result.bt.success = true; + result.bt.write_payload = d.view(); } else { - if (serialized_data.size()) { - oxenc::bt_dict_consumer d{serialized_data}; + if (read_data.size()) { + oxenc::bt_dict_consumer d{read_data}; try { version = d.require(VERSION_KEY); } catch (const std::exception& e) { - result.bt_serialise.read_error = "Failed to parse version: {}"_format(e.what()); + result.bt.read_error = "Failed to parse version: {}"_format(e.what()); } - if (result.bt_serialise.read_error.empty()) { + if (result.bt.read_error.empty()) { try { // Network swarms auto [key, network_swarm_list] = d.next_list_consumer(); assert(key == NETWORK_SWARMS_KEY); @@ -111,21 +292,21 @@ SNSerialiseResult ServiceNode::serialize(Serialise serialise, std::string_view s } } catch (const std::exception& e) { - result.bt_serialise.read_error = + result.bt.read_error = "Failed to parse network swarms: {}"_format(e.what()); } } - if (result.bt_serialise.read_error.empty()) { + if (result.bt.read_error.empty()) { try { result.swarm_cur_swarm_id = d.require(SWARM_CUR_SWARM_ID); } catch 
(const std::exception& e) { - result.bt_serialise.read_error = + result.bt.read_error = "Failed to swarm's current swarm ID: {}"_format(e.what()); } } - if (result.bt_serialise.read_error.empty()) { + if (result.bt.read_error.empty()) { try { // Swarm members auto [key, list] = d.next_list_consumer(); assert(key == SWARM_MEMBERS_KEY); @@ -135,12 +316,12 @@ SNSerialiseResult ServiceNode::serialize(Serialise serialise, std::string_view s result.swarm_members[crypto::legacy_pubkey::from_bytes(bytes)]; } } catch (const std::exception& e) { - result.bt_serialise.read_error = + result.bt.read_error = "Failed to parse swarm members: {}"_format(e.what()); } } } - result.bt_serialise.success = result.bt_serialise.read_error.empty(); + result.bt.success = result.bt.read_error.empty(); } return result; @@ -163,26 +344,44 @@ ServiceNode::ServiceNode( db{std::make_unique(dblocation)} { mq_servers_.push_back(&omq_server); - std::string blob_data = db->runtime_state_sn_blob(Serialise::Read, ""); - SNSerialiseResult serialise_result = serialize(Serialise::Read, blob_data); - if (serialise_result.bt_serialise.success) { - last_serialize_hash = fnv1a64_hasher(blob_data, FNV1A64_SEED); - swarm_.members_ = std::move(serialise_result.swarm_members); - network_.swarms_ = std::move(serialise_result.network_swarms); - swarm_.cur_swarm_id_ = serialise_result.swarm_cur_swarm_id; + std::string swarms_blob = db->runtime_state_blob(BlobType::Swarms, Serialise::Read, ""); + SerialiseSwarmsResult swarm_result = serialize_swarms(Serialise::Read, swarms_blob); + if (swarm_result.bt.success) { + last_swarms_serialize_hash = fnv1a64_hasher(swarms_blob, FNV1A64_SEED); + swarm_.members_ = std::move(swarm_result.swarm_members); + network_.swarms_ = std::move(swarm_result.network_swarms); + swarm_.cur_swarm_id_ = swarm_result.swarm_cur_swarm_id; } else { - blob_data.clear(); + log::error(logcat, "Deserialising of swarms failed: {}", swarm_result.bt.read_error); + swarms_blob.clear(); + } + + std::string 
retryable_blob = + db->runtime_state_blob(BlobType::RetryableRequests, Serialise::Read, ""); + SerialiseRetryableRequestsResult retryable_result = + serialize_retryable_requests(Serialise::Read, retryable_blob, {}); + if (retryable_result.bt.success) { + last_retryable_serialize_hash = fnv1a64_hasher(retryable_blob, FNV1A64_SEED); + retryable_requests = std::move(retryable_result.retryable_requests); + } else { + log::error( + logcat, + "Deserialising of retryable requests failed: {}", + retryable_result.bt.read_error); + retryable_blob.clear(); } log::info( logcat, - "Loaded {} ({}) swarms from disk (#{:x}; in swarm {:x} w/ {} members). Requesting " - "initial swarm state", + "Loaded {} ({}) swarms (#{:x}; in swarm {:x} w/ {} members) and {} ({}) retryable " + "requests from disk. Requesting initial swarm state", network_.swarms_.size(), - util::get_human_readable_bytes(blob_data.size()), - last_serialize_hash, + util::get_human_readable_bytes(swarms_blob.size()), + last_swarms_serialize_hash, swarm_.cur_swarm_id_, - swarm_.members_.size()); + swarm_.members_.size(), + retryable_requests.size(), + util::get_human_readable_bytes(retryable_blob.size())); omq_server->add_timer( [this] { @@ -332,7 +531,7 @@ static std::optional parse_swarm_update( void ServiceNode::add_retryable_request(RequestRetry&& item) { std::unique_lock lock{retryable_requests_mutex}; retryable_requests.emplace_back(item); - retryable_requests_cv.notify_all(); // Wake up retry thread + retryable_requests_cv.notify_all(); // Wake up retry thread } void ServiceNode::register_mq_server(server::MQBase* server) { @@ -716,30 +915,40 @@ void ServiceNode::save_bulk(const std::vector& msgs) { log::trace(logcat, "saved messages count: {}", msgs.size()); } -void ServiceNode::on_bootstrap_update(block_update&& bu) { - swarm_.update_swarms(bu.height, std::move(bu.swarms), bu.contacts); - target_height_ = std::max(target_height_, bu.height); - - snode::SNSerialiseResult serialise_result = 
serialize(Serialise::Write, ""); - if (serialise_result.bt_serialise.success) { - uint64_t hash = fnv1a64_hasher(serialise_result.bt_serialise.write_payload, FNV1A64_SEED); - if (last_serialize_hash != hash) { - log::info( +static void store_swarms_blob_if_changed(uint64_t block_height, const SerialiseSwarmsResult& serialise_result, Database& db, uint64_t& last_hash) { + if (serialise_result.bt.success) { + uint64_t hash = fnv1a64_hasher(serialise_result.bt.write_payload, FNV1A64_SEED); + if (last_hash != hash) { + log::debug( logcat, "Swarm state dirtied at blk {}; #{:x} => #{:x}, saving {} to DB", - block_height_, - last_serialize_hash, + block_height, + last_hash, hash, - util::get_human_readable_bytes( - serialise_result.bt_serialise.write_payload.size())); - - last_serialize_hash = hash; - db->runtime_state_sn_blob( - Serialise::Write, serialise_result.bt_serialise.write_payload); + util::get_human_readable_bytes(serialise_result.bt.write_payload.size())); + last_hash = hash; + db.runtime_state_blob( + BlobType::Swarms, Serialise::Write, serialise_result.bt.write_payload); + } + } else { + if (static bool once = true; once) { + once = false; + log::error( + logcat, + "Failed to serialize swarms to blob: {}", + serialise_result.bt.write_payload); } } } +void ServiceNode::on_bootstrap_update(block_update&& bu) { + swarm_.update_swarms(bu.height, std::move(bu.swarms), bu.contacts); + target_height_ = std::max(target_height_, bu.height); + + snode::SerialiseSwarmsResult write = serialize_swarms(Serialise::Write, ""); + store_swarms_blob_if_changed(block_height_, write, *db, last_swarms_serialize_hash); +} + void ServiceNode::on_snodes_update(block_update&& bu) { hf_revision net_ver{bu.hardfork, bu.snode_revision}; if (hardfork_ != net_ver) { @@ -781,24 +990,8 @@ void ServiceNode::on_snodes_update(block_update&& bu) { auto events = swarm_.update_swarms(bu.height, std::move(bu.swarms), bu.contacts); // Serialise state to blob and store into DB if dirtied - 
snode::SNSerialiseResult serialise_result = serialize(Serialise::Write, ""); - if (serialise_result.bt_serialise.success) { - uint64_t hash = fnv1a64_hasher(serialise_result.bt_serialise.write_payload, FNV1A64_SEED); - if (last_serialize_hash != hash) { - log::info( - logcat, - "Swarm state dirtied at blk {}; #{:x} => #{:x}, saving {} to DB", - block_height_, - last_serialize_hash, - hash, - util::get_human_readable_bytes( - serialise_result.bt_serialise.write_payload.size())); - - last_serialize_hash = hash; - db->runtime_state_sn_blob( - Serialise::Write, serialise_result.bt_serialise.write_payload); - } - } + snode::SerialiseSwarmsResult write = serialize_swarms(Serialise::Write, ""); + store_swarms_blob_if_changed(block_height_, write, *db, last_swarms_serialize_hash); if (const SnodeStatus status = events.our_swarm_id != INVALID_SWARM_ID ? SnodeStatus::ACTIVE : bu.decommed ? SnodeStatus::DECOMMISSIONED @@ -1435,6 +1628,11 @@ void ServiceNode::retryable_requests_thread_entry_point() { fmt::memory_buffer trace_buffer; for (size_t index = 0; index < retryable_requests.size(); index++) { const auto& item = retryable_requests[index]; + auto item_age = + std::chrono::duration_cast(now - item.create_time); + if (item_age >= rpc::TTL_MAXIMUM_PRIVATE) + continue; + if (log::get_level(logcat) <= log::Level::trace) { fmt::format_to( std::back_inserter(trace_buffer), @@ -1448,7 +1646,7 @@ void ServiceNode::retryable_requests_thread_entry_point() { for (size_t node_index = 0; node_index < item.nodes.size(); node_index++) { const auto& node_item = item.nodes[node_index]; - bool is_due = now>= node_item.deadline; + bool is_due = now >= node_item.deadline; due_requests += is_due; if (log::get_level(logcat) <= log::Level::trace) { @@ -1501,6 +1699,12 @@ void ServiceNode::retryable_requests_thread_entry_point() { it->hash = fnv1a64_hasher(it->req_payload, it->hash); } + auto it_age = std::chrono::duration_cast(now - it->create_time); + if (it_age >= rpc::TTL_MAXIMUM_PRIVATE) { 
+ log::debug(logcat, "Retry request ({}) expired after {}", it->cmd, it_age); + it->nodes.clear(); + } + for (auto node_it = it->nodes.begin(); node_it != it->nodes.end();) { auto on_request_done = [MIN_RETRY_DELAY, MAX_RETRY_DELAY, @@ -1642,6 +1846,31 @@ void ServiceNode::retryable_requests_thread_entry_point() { else it++; } + + SerialiseRetryableRequestsResult write = + serialize_retryable_requests(Serialise::Write, "", retryable_requests); + if (write.bt.success) { + uint64_t hash = fnv1a64_hasher(write.bt.write_payload, FNV1A64_SEED); + if (last_retryable_serialize_hash != hash) { + log::debug( + logcat, + "Retryable requests dirtied #{:x} => #{:x}, saving {} to DB", + last_retryable_serialize_hash, + hash, + util::get_human_readable_bytes(write.bt.write_payload.size())); + last_retryable_serialize_hash = hash; + db->runtime_state_blob( + BlobType::RetryableRequests, Serialise::Write, write.bt.write_payload); + } + } else { + if (static bool once = true; once) { + once = false; + log::error( + logcat, + "Failed to serialize retryable requests to blob: {}", + write.bt.write_payload); + } + } } } } // namespace oxenss::snode diff --git a/oxenss/snode/service_node.h b/oxenss/snode/service_node.h index ceef7714d..d99199823 100644 --- a/oxenss/snode/service_node.h +++ b/oxenss/snode/service_node.h @@ -71,13 +71,6 @@ constexpr std::string_view to_string(SnodeStatus status) { return "Unknown"sv; } -struct SNSerialiseResult { - BTSerialiseResult bt_serialise; - std::map swarm_members; - swarms_t network_swarms; - swarm_id_t swarm_cur_swarm_id; -}; - enum class RetryReason { NON_CONTACTABLE, FAILED_TO_SEND, @@ -92,12 +85,25 @@ struct RequestRetryEntry { }; struct RequestRetry { - std::string_view cmd; + std::string cmd; std::string req_payload; uint64_t hash; + std::chrono::steady_clock::time_point create_time; std::vector nodes; }; +struct SerialiseRetryableRequestsResult { + SerialiseBTResult bt; + std::vector retryable_requests; +}; + +struct SerialiseSwarmsResult { 
+ SerialiseBTResult bt; + std::map swarm_members; + swarms_t network_swarms; + swarm_id_t swarm_cur_swarm_id; +}; + /// All service node logic that is not network-specific class ServiceNode { bool syncing_ = true; @@ -139,7 +145,7 @@ class ServiceNode { mutable std::recursive_mutex sn_mutex_; // Lock to be taken when interacting with the 'retryable_requests' queue - std::mutex retryable_requests_mutex; + mutable std::mutex retryable_requests_mutex; // List of requests that will be re-attempted periodically through the // 'retryable_requests_thread' @@ -150,6 +156,10 @@ class ServiceNode { // The time point at which the next swarm member check should be executed std::chrono::steady_clock::time_point swarm_member_check_deadline = {}; + uint64_t last_swarms_serialize_hash = 0; + + uint64_t last_retryable_serialize_hash = 0; + void send_notifies(message m); // Save multiple messages to the database at once (i.e. in a single transaction) @@ -211,9 +221,7 @@ class ServiceNode { bool force_start, bool skip_bootstrap); - SNSerialiseResult serialize(Serialise serialise, std::string_view serialized_data) const; - - uint64_t last_serialize_hash = 0; + SerialiseSwarmsResult serialize_swarms(Serialise serialise, std::string_view read_data) const; std::unique_ptr db; diff --git a/oxenss/storage/database.cpp b/oxenss/storage/database.cpp index f665f9cb1..70379595f 100644 --- a/oxenss/storage/database.cpp +++ b/oxenss/storage/database.cpp @@ -338,10 +338,12 @@ CREATE TRIGGER IF NOT EXISTS revoked_autoclean log::info(logcat, "Upgrading database schema: adding runtime_state"); db.exec(R"( CREATE TABLE runtime_state ( - sn_blob BLOB + swarms_blob BLOB, + retryable_requests_blob BLOB ); +INSERT INTO runtime_state VALUES (null, null); +PRAGMA user_version = 1; )"); - db.exec("PRAGMA user_version = 1;"); } views_triggers_indices(); @@ -1202,18 +1204,24 @@ void oxenss::Database::test_suite_block_for(std::chrono::milliseconds duration) std::this_thread::sleep_for(duration); } -std::string 
Database::runtime_state_sn_blob(Serialise serialise, const std::string& write_blob) +std::string Database::runtime_state_blob(BlobType type, Serialise serialise, const std::string& write_blob) { + std::string_view key = {}; + switch (type) { + case BlobType::Swarms: key = "swarms_blob"; break; + case BlobType::RetryableRequests: key = "retryable_requests_blob"; break; + } + std::string result; auto impl = get_impl(serialise == Serialise::Write); if (serialise == Serialise::Read) { - auto stmt = impl->prepared_st("SELECT sn_blob FROM runtime_state LIMIT 1"); + auto stmt = impl->prepared_st("SELECT {} FROM runtime_state LIMIT 1"_format(key)); auto maybe_result = exec_and_maybe_get(stmt); if (maybe_result) result = std::move(*maybe_result); } else { if (write_blob.size()) { - auto stmt = impl->prepared_st("REPLACE INTO runtime_state (sn_blob) VALUES (?)"); + auto stmt = impl->prepared_st("UPDATE runtime_state SET {} = ?"_format(key)); exec_query(stmt, write_blob); } } diff --git a/oxenss/storage/database.hpp b/oxenss/storage/database.hpp index 7c532bc81..12d6ffcb7 100644 --- a/oxenss/storage/database.hpp +++ b/oxenss/storage/database.hpp @@ -35,6 +35,11 @@ enum class StoreResult { inline std::atomic tmp_init_db_version = 0; +enum class BlobType { + Swarms, + RetryableRequests, +}; + // Storage database class. class Database { std::stack> impl_pool_; @@ -214,7 +219,7 @@ class Database { std::map get_expiries( const user_pubkey& pubkey, const std::vector& msg_hashes); - std::string runtime_state_sn_blob(Serialise serialise, const std::string& write_blob); + std::string runtime_state_blob(BlobType type, Serialise serialise, const std::string& write_blob); }; } // namespace oxenss From 18c7c7a33943d5173b6a525ae9fba58c17d5412f Mon Sep 17 00:00:00 2001 From: doylet Date: Tue, 17 Jun 2025 11:16:23 +1000 Subject: [PATCH 20/50] Match comment w/ code regarding when to add retryable request on failure Comment states that only timed-out requests are retried. 
This is correct, as an error response with an error code and text is a situation that might mean that the recipient node is not in a valid state or will never accept the request, in which case the safe default is to not retry to that node. It is possible in future, since all possible error states are known, to handle them specifically for the command. But for now, a sane default is to only allow retries to nodes that were offline or we failed to communicate with. --- oxenss/rpc/request_handler.cpp | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/oxenss/rpc/request_handler.cpp b/oxenss/rpc/request_handler.cpp index acf9642db..9c1ce3295 100644 --- a/oxenss/rpc/request_handler.cpp +++ b/oxenss/rpc/request_handler.cpp @@ -463,7 +463,7 @@ static void distribute_command(snode::ServiceNode& sn, std::shared_ptrpending += peers.size(); // When a request to a peer fails, set the initial retry to 1s in the future - constexpr auto default_deadline_delay = 1s; + constexpr auto default_retry_delay = 1s; for (auto& peer : peers) { auto ct = sn.contacts().find(peer.first); @@ -479,7 +479,7 @@ static void distribute_command(snode::ServiceNode& sn, std::shared_ptrretry_nodes.push_back(entry); continue; } @@ -487,7 +487,7 @@ static void distribute_command(snode::ServiceNode& sn, std::shared_ptrrequest( ct->pubkey_x25519.view(), "sn.storage_cc", - [res, peer, peer_ed = ct->pubkey_ed25519, &sn, default_deadline_delay]( + [res, peer, peer_ed = ct->pubkey_ed25519, &sn, default_retry_delay]( bool success, auto parts) { json peer_result; SNStorageCCResult store_result = @@ -531,11 +531,13 @@ static void distribute_command(snode::ServiceNode& sn, std::shared_ptrretry_nodes.push_back(entry); + if (timeout) { + snode::RequestRetryEntry entry = {}; + entry.key = peer.first; + entry.reason = snode::RetryReason::FAILED_TO_SEND; + entry.deadline = std::chrono::steady_clock::now() + default_retry_delay; + res->retry_nodes.push_back(entry); + } } else if (res->b64) { if
(auto it = peer_result.find("signature"); it != peer_result.end() && it->is_string()) From 0c0c5e5afeb4041faf2a05b2e0dc62ee62204e2c Mon Sep 17 00:00:00 2001 From: doylet Date: Tue, 17 Jun 2025 11:19:04 +1000 Subject: [PATCH 21/50] Revert NEW_SWARM_MEMBER_INTERVAL to 10s This was originally 10s and it was mistakenly changed to 30s. We've reverted all the changes to the swarm member checks so this should go back to its original values as retries are handled in a separate subsystem instead of intertwined with the member checks. --- oxenss/snode/service_node.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/oxenss/snode/service_node.cpp b/oxenss/snode/service_node.cpp index 58f0f4373..9a01e58a8 100644 --- a/oxenss/snode/service_node.cpp +++ b/oxenss/snode/service_node.cpp @@ -46,7 +46,7 @@ constexpr auto OXEND_PING_INTERVAL = 30s; // How often to trigger 'check_new_members' which checks for 'data ready' handshakes from // swarm members and propagate a DB dump if necessary. -constexpr auto NEW_SWARM_MEMBER_INTERVAL = 30s; +constexpr auto NEW_SWARM_MEMBER_INTERVAL = 10s; static SerialiseRetryableRequestsResult serialize_retryable_requests( Serialise serialise, std::string_view read_data, std::span write_data) { From 9da04604b7bb0215a7e14e74f7aaad7e7163fa99 Mon Sep 17 00:00:00 2001 From: doylet Date: Tue, 17 Jun 2025 11:33:50 +1000 Subject: [PATCH 22/50] Update the swarms serialisation to match retryable requests patterns Move serialise result struct for retryable requests into impl file as it's only used locally. 
--- oxenss/snode/service_node.cpp | 61 ++++++++++++++++++++++++----------- oxenss/snode/service_node.h | 5 --- 2 files changed, 43 insertions(+), 23 deletions(-) diff --git a/oxenss/snode/service_node.cpp b/oxenss/snode/service_node.cpp index 9a01e58a8..c7aece53d 100644 --- a/oxenss/snode/service_node.cpp +++ b/oxenss/snode/service_node.cpp @@ -48,6 +48,11 @@ constexpr auto OXEND_PING_INTERVAL = 30s; // swarm members and propagate a DB dump if necessary. constexpr auto NEW_SWARM_MEMBER_INTERVAL = 10s; +struct SerialiseRetryableRequestsResult { + SerialiseBTResult bt; + std::vector retryable_requests; +}; + static SerialiseRetryableRequestsResult serialize_retryable_requests( Serialise serialise, std::string_view read_data, std::span write_data) { SerialiseRetryableRequestsResult result = {}; @@ -62,7 +67,7 @@ static SerialiseRetryableRequestsResult serialize_retryable_requests( constexpr std::string_view REQ_PAYLOAD_KEY = "r"; constexpr std::string_view CREATE_TIME_KEY = "t"; constexpr std::string_view NODES_KEY = "u"; - assert(COMMAND_KEY < CREATE_TIME_KEY); + assert(COMMAND_KEY < REQ_PAYLOAD_KEY); assert(REQ_PAYLOAD_KEY < CREATE_TIME_KEY); assert(CREATE_TIME_KEY < NODES_KEY); @@ -276,24 +281,37 @@ SerialiseSwarmsResult ServiceNode::serialize_swarms(Serialise serialise, std::st } if (result.bt.read_error.empty()) { - try { // Network swarms - auto [key, network_swarm_list] = d.next_list_consumer(); + // Initially a dummy list that we will std::move the real list into + oxenc::bt_list_consumer swarm_list("l"); + try { + auto [key, list] = d.next_list_consumer(); assert(key == NETWORK_SWARMS_KEY); + swarm_list = std::move(list); + } catch (const std::exception& e) { + result.bt.read_error = "Failed to parse network swarms: {}"_format(e.what()); + } - while (!network_swarm_list.is_finished()) { - auto swarm = network_swarm_list.consume_list_consumer(); - uint64_t swarm_id = swarm.consume(); + while (result.bt.read_error.empty() && !swarm_list.is_finished()) { + auto 
swarm = swarm_list.consume_list_consumer(); + uint64_t swarm_id = 0; + try { + swarm_id = swarm.consume(); + } catch (const std::exception& e) { + result.bt.read_error = + "Failed to parse swarm id from swarm list: {}"_format(e.what()); + continue; + } - std::set& keys = result.network_swarms[swarm_id]; - while (!swarm.is_finished()) { + std::set& keys = result.network_swarms[swarm_id]; + while (result.bt.read_error.empty() && !swarm.is_finished()) { + try { auto bytes = swarm.consume(); keys.insert(keys.end(), crypto::legacy_pubkey::from_bytes(bytes)); + } catch (const std::exception& e) { + result.bt.read_error = + "Failed to parse swarm pubkey from swarm: {}"_format(e.what()); } } - - } catch (const std::exception& e) { - result.bt.read_error = - "Failed to parse network swarms: {}"_format(e.what()); } } @@ -307,18 +325,25 @@ SerialiseSwarmsResult ServiceNode::serialize_swarms(Serialise serialise, std::st } if (result.bt.read_error.empty()) { - try { // Swarm members + oxenc::bt_list_consumer swarm_members("l"); + try { auto [key, list] = d.next_list_consumer(); assert(key == SWARM_MEMBERS_KEY); - - while (!list.is_finished()) { - auto bytes = list.consume(); - result.swarm_members[crypto::legacy_pubkey::from_bytes(bytes)]; - } + swarm_members = std::move(list); } catch (const std::exception& e) { result.bt.read_error = "Failed to parse swarm members: {}"_format(e.what()); } + + while (result.bt.read_error.empty() && !swarm_members.is_finished()) { + try { + auto bytes = swarm_members.consume(); + result.swarm_members[crypto::legacy_pubkey::from_bytes(bytes)]; + } catch (const std::exception& e) { + result.bt.read_error = + "Failed to parse swarm member from list: {}"_format(e.what()); + } + } } } result.bt.success = result.bt.read_error.empty(); diff --git a/oxenss/snode/service_node.h b/oxenss/snode/service_node.h index d99199823..b9e6d40bf 100644 --- a/oxenss/snode/service_node.h +++ b/oxenss/snode/service_node.h @@ -92,11 +92,6 @@ struct RequestRetry { 
std::vector nodes; }; -struct SerialiseRetryableRequestsResult { - SerialiseBTResult bt; - std::vector retryable_requests; -}; - struct SerialiseSwarmsResult { SerialiseBTResult bt; std::map swarm_members; From 48c3de004036c9dcc1ed9cbdee2dee81ace74228 Mon Sep 17 00:00:00 2001 From: doylet Date: Tue, 17 Jun 2025 11:50:37 +1000 Subject: [PATCH 23/50] Remove unused swarm_member_check_deadline, add comments --- oxenss/snode/service_node.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/oxenss/snode/service_node.h b/oxenss/snode/service_node.h index b9e6d40bf..c4a1d3400 100644 --- a/oxenss/snode/service_node.h +++ b/oxenss/snode/service_node.h @@ -148,11 +148,12 @@ class ServiceNode { std::thread retryable_requests_thread; - // The time point at which the next swarm member check should be executed - std::chrono::steady_clock::time_point swarm_member_check_deadline = {}; - + // The hash of the last swarms blob that was serialised, used for dirty checks before storing to + // the DB. uint64_t last_swarms_serialize_hash = 0; + // The hash of the last retryable requsts blob that was serialised, used for dirty checks before + // storing to the DB. uint64_t last_retryable_serialize_hash = 0; void send_notifies(message m); From a36b208573035ecdd5cebc2085a3b02edae85ac1 Mon Sep 17 00:00:00 2001 From: doylet Date: Tue, 17 Jun 2025 11:53:23 +1000 Subject: [PATCH 24/50] Simplify swarm member status states by removing ContactDetailsReady The `new_swarm_member` flag disambiguates between nodes that need a DB dump vs nodes that don't sufficiently that we don't need the intermediate contact details ready state. Also remove unused member var. 
--- oxenss/snode/service_node.cpp | 4 +--- oxenss/snode/swarm.cpp | 15 +++++++-------- oxenss/snode/swarm.h | 7 ++----- 3 files changed, 10 insertions(+), 16 deletions(-) diff --git a/oxenss/snode/service_node.cpp b/oxenss/snode/service_node.cpp index c7aece53d..e75da0ebe 100644 --- a/oxenss/snode/service_node.cpp +++ b/oxenss/snode/service_node.cpp @@ -1811,9 +1811,7 @@ void ServiceNode::retryable_requests_thread_entry_point() { if (is_member && !node_it->retry_underway) { // Retry request if ready bool is_due = now >= node_it->deadline; - bool ready = - (is_member->status == SwarmMemberStatus::ContactDetailsReady || - is_member->status == SwarmMemberStatus::Ready); + bool ready = is_member->status == SwarmMemberStatus::Ready; crypto::x25519_pubkey pubkey_x25519 = {}; if (ready) { diff --git a/oxenss/snode/swarm.cpp b/oxenss/snode/swarm.cpp index db536e067..c26343edb 100644 --- a/oxenss/snode/swarm.cpp +++ b/oxenss/snode/swarm.cpp @@ -249,13 +249,12 @@ std::set Swarm::extract_contacts_needing_db_dump() { std::set result; for (auto& it : members_) { - if (it.second.status != SwarmMemberStatus::ContactDetailsReady) - continue; - const crypto::legacy_pubkey& pk = it.first; - it.second.status = SwarmMemberStatus::Ready; - if (it.second.new_swarm_member) { - it.second.new_swarm_member = false; - result.insert(pk); + if (it.second.status == SwarmMemberStatus::Ready) { + const crypto::legacy_pubkey& pk = it.first; + if (it.second.new_swarm_member) { + it.second.new_swarm_member = false; + result.insert(pk); + } } } @@ -267,6 +266,6 @@ void Swarm::set_member_contact_details_ready(const crypto::legacy_pubkey& pk) { auto it = members_.find(pk); assert(it != members_.end()); if (it != members_.end()) - it->second.status = SwarmMemberStatus::ContactDetailsReady; + it->second.status = SwarmMemberStatus::Ready; } } // namespace oxenss::snode diff --git a/oxenss/snode/swarm.h b/oxenss/snode/swarm.h index 1fa0cafd6..ccc91e5f2 100644 --- a/oxenss/snode/swarm.h +++ 
b/oxenss/snode/swarm.h @@ -31,17 +31,15 @@ enum struct SwarmMemberStatus { // Pubkeys of new members into our swarm who we haven't yet established communications with; // once we do, we push all our swarm's messages to them. ContactDetailsPending, - ContactDetailsReady, Ready, }; struct SwarmMemberState { SwarmMemberStatus status; - std::chrono::milliseconds newest_msg_timestamp; - // Set if this member joined the swarm. They are assumed to not have any of the messages for - // the swarm yet so a full DB will be initiated + // the swarm yet so a full DB dump will be initiated for messages we own that belong to the + // swarm. bool new_swarm_member; // The earliest timestamp at which the swarm will check if they have received contact @@ -51,7 +49,6 @@ struct SwarmMemberState { std::chrono::steady_clock::time_point check_contact_info_next_retry; }; - // How often we wait, after returning a pending new member, before we return the member again from // `extract_new_members()`. constexpr auto NEW_SWARM_MEMBER_RETRY = 30s; From 5a67656ffdd29ebfde0e0e94fefe3cb66bc34aef Mon Sep 17 00:00:00 2001 From: doylet Date: Tue, 17 Jun 2025 12:16:16 +1000 Subject: [PATCH 25/50] Fix dblocation typo -> db_location --- oxenss/snode/service_node.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/oxenss/snode/service_node.cpp b/oxenss/snode/service_node.cpp index e75da0ebe..ee659ac87 100644 --- a/oxenss/snode/service_node.cpp +++ b/oxenss/snode/service_node.cpp @@ -356,7 +356,7 @@ ServiceNode::ServiceNode( const crypto::legacy_keypair& keys, const contact& contact, server::OMQ& omq_server, - const std::filesystem::path& dblocation, + const std::filesystem::path& db_location, bool force_start, bool skip_bootstrap) : force_start_{force_start}, @@ -366,7 +366,7 @@ ServiceNode::ServiceNode( network_{*omq_server}, omq_server_{omq_server}, all_stats_{*omq_server}, - db{std::make_unique(dblocation)} { + db{std::make_unique(db_location)} { 
mq_servers_.push_back(&omq_server); std::string swarms_blob = db->runtime_state_blob(BlobType::Swarms, Serialise::Read, ""); From aa5fe9683e1fc48d1e0c03c1e6515f62addf48d9 Mon Sep 17 00:00:00 2001 From: doylet Date: Tue, 17 Jun 2025 17:27:05 +1000 Subject: [PATCH 26/50] Remove RETRY_BACKOFF_COEFF from retry request on done lambda Not sure why, it's used in the lambda itself but it does not need to be captured unlike the other 2 constexpr variables that are being captured and used. Alas CI is complaining about it and treating the warning as an error so removing it in this commit. --- oxenss/snode/service_node.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/oxenss/snode/service_node.cpp b/oxenss/snode/service_node.cpp index ee659ac87..7845e0dc0 100644 --- a/oxenss/snode/service_node.cpp +++ b/oxenss/snode/service_node.cpp @@ -1733,7 +1733,6 @@ void ServiceNode::retryable_requests_thread_entry_point() { for (auto node_it = it->nodes.begin(); node_it != it->nodes.end();) { auto on_request_done = [MIN_RETRY_DELAY, MAX_RETRY_DELAY, - RETRY_BACKOFF_COEFF, this, hash = it->hash, key = node_it->key]( From f52aecd5a0b81fca3c5de6ba98958a461de12763 Mon Sep 17 00:00:00 2001 From: doylet Date: Tue, 17 Jun 2025 17:44:41 +1000 Subject: [PATCH 27/50] Linting --- oxenss/common/serialize.h | 1 - oxenss/rpc/request_handler.cpp | 3 ++- oxenss/snode/service_node.cpp | 31 ++++++++++++++++++------------- oxenss/snode/swarm.cpp | 6 +++--- oxenss/storage/database.cpp | 4 ++-- oxenss/storage/database.hpp | 3 ++- 6 files changed, 27 insertions(+), 21 deletions(-) diff --git a/oxenss/common/serialize.h b/oxenss/common/serialize.h index 17925204f..864aeb93e 100644 --- a/oxenss/common/serialize.h +++ b/oxenss/common/serialize.h @@ -25,4 +25,3 @@ inline uint64_t fnv1a64_hasher(std::string_view bytes, uint64_t hash) { } }; // namespace oxenss - diff --git a/oxenss/rpc/request_handler.cpp b/oxenss/rpc/request_handler.cpp index 9c1ce3295..691fe107d 100644 --- a/oxenss/rpc/request_handler.cpp +++ 
b/oxenss/rpc/request_handler.cpp @@ -516,7 +516,8 @@ static void distribute_command(snode::ServiceNode& sn, std::shared_ptr problems; if (!hf_at_least(STORAGE_SERVER_HARDFORK)) - problems.push_back( - fmt::format( - "not yet on hardfork {}.{}", - STORAGE_SERVER_HARDFORK.first, - STORAGE_SERVER_HARDFORK.second)); + problems.push_back(fmt::format( + "not yet on hardfork {}.{}", + STORAGE_SERVER_HARDFORK.first, + STORAGE_SERVER_HARDFORK.second)); if (syncing_) problems.push_back("not done syncing"); @@ -940,7 +940,11 @@ void ServiceNode::save_bulk(const std::vector& msgs) { log::trace(logcat, "saved messages count: {}", msgs.size()); } -static void store_swarms_blob_if_changed(uint64_t block_height, const SerialiseSwarmsResult& serialise_result, Database& db, uint64_t& last_hash) { +static void store_swarms_blob_if_changed( + uint64_t block_height, + const SerialiseSwarmsResult& serialise_result, + Database& db, + uint64_t& last_hash) { if (serialise_result.bt.success) { uint64_t hash = fnv1a64_hasher(serialise_result.bt.write_payload, FNV1A64_SEED); if (last_hash != hash) { @@ -1784,7 +1788,8 @@ void ServiceNode::retryable_requests_thread_entry_point() { node.next_retry_delay = std::min( std::chrono::milliseconds(delay_ms), std::chrono::milliseconds(MAX_RETRY_DELAY)); - node.deadline = std::chrono::steady_clock::now() + node.next_retry_delay; + node.deadline = + std::chrono::steady_clock::now() + node.next_retry_delay; // Wake up retryable request thread, it will take into consideration the // new deadline for the blocking sleep diff --git a/oxenss/snode/swarm.cpp b/oxenss/snode/swarm.cpp index c26343edb..a60d0ea80 100644 --- a/oxenss/snode/swarm.cpp +++ b/oxenss/snode/swarm.cpp @@ -138,7 +138,7 @@ SwarmEvents Swarm::update_swarms( log::info(logswarm, "New network swarm: {}", swarm); // Remove members that are no longer in the swarm from our runtime state - for (auto it = members_.begin(); it != members_.end(); ) { + for (auto it = members_.begin(); it != 
members_.end();) { if (events.our_swarm_members.find(it->first) == events.our_swarm_members.end()) it = members_.erase(it); else @@ -162,13 +162,13 @@ SwarmEvents Swarm::update_swarms( // point onwards they will correctly identify nodes that are leaving and joining their // swarm and only do a message dump when necessary. if (oxenss::tmp_init_db_version == 0) { - pair.new_swarm_member = false; // Prevent the swarm DB dump on newly migrated nodes + pair.new_swarm_member = false; // Prevent the swarm DB dump on newly migrated nodes } else { pair.new_swarm_member = true; } } } - oxenss::tmp_init_db_version = 1; // Disable after the first swarm update + oxenss::tmp_init_db_version = 1; // Disable after the first swarm update cur_swarm_id_ = events.our_swarm_id; diff --git a/oxenss/storage/database.cpp b/oxenss/storage/database.cpp index 70379595f..cbe98e6f5 100644 --- a/oxenss/storage/database.cpp +++ b/oxenss/storage/database.cpp @@ -1204,8 +1204,8 @@ void oxenss::Database::test_suite_block_for(std::chrono::milliseconds duration) std::this_thread::sleep_for(duration); } -std::string Database::runtime_state_blob(BlobType type, Serialise serialise, const std::string& write_blob) -{ +std::string Database::runtime_state_blob( + BlobType type, Serialise serialise, const std::string& write_blob) { std::string_view key = {}; switch (type) { case BlobType::Swarms: key = "swarms_blob"; break; diff --git a/oxenss/storage/database.hpp b/oxenss/storage/database.hpp index 12d6ffcb7..baf3c2c39 100644 --- a/oxenss/storage/database.hpp +++ b/oxenss/storage/database.hpp @@ -219,7 +219,8 @@ class Database { std::map get_expiries( const user_pubkey& pubkey, const std::vector& msg_hashes); - std::string runtime_state_blob(BlobType type, Serialise serialise, const std::string& write_blob); + std::string runtime_state_blob( + BlobType type, Serialise serialise, const std::string& write_blob); }; } // namespace oxenss From 9153442bc96bdebbfd7bc4ee24046fc19efcbb94 Mon Sep 17 00:00:00 2001 
From: doylet Date: Thu, 19 Jun 2025 17:07:19 +1000 Subject: [PATCH 28/50] Request a DB dump on data-ready handshake if DB is empty --- oxenss/common/serialize.h | 2 +- oxenss/server/omq.cpp | 41 +++++- oxenss/snode/service_node.cpp | 233 +++++++++++++++++++++++++--------- oxenss/snode/service_node.h | 18 +++ oxenss/snode/swarm.cpp | 88 +++++++++---- oxenss/snode/swarm.h | 44 +++++-- oxenss/storage/database.cpp | 13 +- oxenss/storage/database.hpp | 7 +- unit_test/storage.cpp | 26 ++-- 9 files changed, 356 insertions(+), 116 deletions(-) diff --git a/oxenss/common/serialize.h b/oxenss/common/serialize.h index 864aeb93e..be3155b0e 100644 --- a/oxenss/common/serialize.h +++ b/oxenss/common/serialize.h @@ -13,7 +13,7 @@ enum class Serialise { struct SerialiseBTResult { bool success; std::string write_payload; - std::string read_error; + std::string error; }; constexpr uint64_t FNV1A64_SEED = 14695981039346656037ULL; diff --git a/oxenss/server/omq.cpp b/oxenss/server/omq.cpp index 7420cf33d..96544814d 100644 --- a/oxenss/server/omq.cpp +++ b/oxenss/server/omq.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -45,7 +46,11 @@ std::string OMQ::peer_lookup(std::string_view pubkey_bin) const { } void OMQ::handle_sn_data_ready(oxenmq::Message& message) { - log::debug(logcat, "[OMQ] handle sn.data_ready from: {}", message.conn.to_string()); + log::debug( + logcat, + "[OMQ] handle sn.data_ready from: {} (parts {})", + message.conn.to_string(), + message.data.size()); auto& xpk_str = message.conn.pubkey(); if (xpk_str.size() != sizeof(crypto::x25519_pubkey)) @@ -56,6 +61,40 @@ void OMQ::handle_sn_data_ready(oxenmq::Message& message) { if (!service_node_->is_swarm_peer(xpk)) return message.send_reply("Swarm mismatch"); + std::optional ct = service_node_->contacts().find(xpk); + if (!ct) + return message.send_reply("Contact info missing"); + + if (ct->version >= snode::SN_DATA_READY_WITH_REQUEST_VERSION) { + if (message.data.empty()) + return 
message.send_reply("Request payload missing"); + + snode::SerialiseDataReadyRequestResult deserialised = + snode::serialise_data_ready_request(Serialise::Read, message.data[0], {}); + if (!deserialised.bt.success) + return message.send_reply("Request payload malformed {}"_format(deserialised.bt.error)); + + const snode::DataReadyRequest& request = deserialised.request; + if (request.needs_db_dump) + service_node_->set_member_needs_db_dump(crypto::legacy_pubkey{ct->pubkey_ed25519}); + + if (auto level = log::Level::debug; log::get_level(logcat) <= level) { + std::string label; + if (deserialised.bt.success) + label = "rejected, bad request payload. {})"_format(deserialised.bt.error); + else + label = "rejected due to bad request args"; + + log::log( + logcat, + level, + "sn.data ready processed (edpk: {}, db dump: {}): {}", + ct->pubkey_ed25519, + request.needs_db_dump, + label); + } + } + message.send_reply("OK"); } diff --git a/oxenss/snode/service_node.cpp b/oxenss/snode/service_node.cpp index baa774ea2..d7d7e124d 100644 --- a/oxenss/snode/service_node.cpp +++ b/oxenss/snode/service_node.cpp @@ -53,6 +53,49 @@ struct SerialiseRetryableRequestsResult { std::vector retryable_requests; }; +SerialiseDataReadyRequestResult serialise_data_ready_request( + Serialise serialise, std::string_view read_data, const DataReadyRequest& write_data) { + SerialiseDataReadyRequestResult result = {}; + uint32_t version = 0; + constexpr std::string_view VERSION_KEY = "@"; + constexpr std::string_view STATUS_KEY = "s"; + constexpr std::string_view NEED_DB_DUMP_KEY = "t"; + static_assert(VERSION_KEY < STATUS_KEY); + static_assert(STATUS_KEY < NEED_DB_DUMP_KEY); + + if (serialise == Serialise::Write) { + oxenc::bt_dict_producer d; + d.append(VERSION_KEY, version); + d.append(NEED_DB_DUMP_KEY, write_data.needs_db_dump); + result.bt.write_payload = d.view(); + result.bt.success = result.bt.error.empty(); + } else { + if (read_data.size()) { + oxenc::bt_dict_consumer d{read_data}; + try { 
+ version = d.require(VERSION_KEY); + } catch (const std::exception& e) { + result.bt.error = + "Failed to parse sn data ready request version: {}"_format(e.what()); + } + + if (result.bt.error.empty()) { + try { + result.request.needs_db_dump = d.require(NEED_DB_DUMP_KEY); + } catch (const std::exception& e) { + result.bt.error = + "Failed to parse sn data ready db dump flag: {}"_format(e.what()); + } + } + } else { + result.bt.error = "Failed to parse data ready payload: no bytes given"; + } + + result.bt.success = result.bt.error.empty(); + } + return result; +} + static SerialiseRetryableRequestsResult serialize_retryable_requests( Serialise serialise, std::string_view read_data, std::span write_data) { SerialiseRetryableRequestsResult result = {}; @@ -116,15 +159,14 @@ static SerialiseRetryableRequestsResult serialize_retryable_requests( try { version = d.require(VERSION_KEY); } catch (const std::exception& e) { - result.bt.read_error = - "Failed to parse retryable request version: {}"_format(e.what()); + result.bt.error = "Failed to parse retryable request version: {}"_format(e.what()); } if (version != 0) - result.bt.read_error = + result.bt.error = "Unrecognised retryable request version: {}, skipping"_format(version); - if (result.bt.read_error.empty()) { + if (result.bt.error.empty()) { // Initially a dummy list that we will std::move the real list into oxenc::bt_list_consumer retry_list("l"); try { @@ -132,18 +174,17 @@ static SerialiseRetryableRequestsResult serialize_retryable_requests( assert(key == RETRYABLE_REQUESTS_KEY); retry_list = std::move(list); } catch (const std::exception& e) { - result.bt.read_error = - "Failed to read retryable request list: {}"_format(e.what()); + result.bt.error = "Failed to read retryable request list: {}"_format(e.what()); } - while (result.bt.read_error.empty() && !retry_list.is_finished()) { + while (result.bt.error.empty() && !retry_list.is_finished()) { auto request_dict = retry_list.consume_dict_consumer(); 
RequestRetry request = {}; try { request.cmd = request_dict.require(COMMAND_KEY); } catch (const std::exception& e) { - result.bt.read_error = + result.bt.error = "Failed to read retryable request command: {}"_format(e.what()); continue; } @@ -151,7 +192,7 @@ static SerialiseRetryableRequestsResult serialize_retryable_requests( try { request.req_payload = request_dict.require(REQ_PAYLOAD_KEY); } catch (const std::exception& e) { - result.bt.read_error = + result.bt.error = "Failed to read retryable request, request payload: {}"_format( e.what()); continue; @@ -162,7 +203,7 @@ static SerialiseRetryableRequestsResult serialize_retryable_requests( request.create_time = std::chrono::steady_clock::time_point( std::chrono::milliseconds(create_time_u64)); } catch (const std::exception& e) { - result.bt.read_error = + result.bt.error = "Failed to read retryable request, create time: {}"_format( e.what()); continue; @@ -174,12 +215,12 @@ static SerialiseRetryableRequestsResult serialize_retryable_requests( assert(key == NODES_KEY); node_list = std::move(list); } catch (const std::exception& e) { - result.bt.read_error = + result.bt.error = "Failed to read retryable request, node list: {}"_format(e.what()); continue; } - while (result.bt.read_error.empty() && !node_list.is_finished()) { + while (result.bt.error.empty() && !node_list.is_finished()) { auto node_dict = node_list.consume_dict_consumer(); RequestRetryEntry node = {}; try { @@ -187,7 +228,7 @@ static SerialiseRetryableRequestsResult serialize_retryable_requests( node_dict.require(KEY_KEY); node.key = crypto::legacy_pubkey::from_bytes(key_bytes); } catch (const std::exception& e) { - result.bt.read_error = + result.bt.error = "Failed to parse retryable request node key: {}"_format( e.what()); continue; @@ -198,7 +239,7 @@ static SerialiseRetryableRequestsResult serialize_retryable_requests( node.deadline = std::chrono::steady_clock::time_point( std::chrono::milliseconds(deadline_u64)); } catch (const 
std::exception& e) { - result.bt.read_error = + result.bt.error = "Failed to parse retryable request node deadline: {}"_format( e.what()); continue; @@ -209,7 +250,7 @@ static SerialiseRetryableRequestsResult serialize_retryable_requests( node_dict.require(NEXT_RETRY_DELAY_KEY); node.next_retry_delay = std::chrono::milliseconds(next_retry_delay_u64); } catch (const std::exception& e) { - result.bt.read_error = + result.bt.error = "Failed to parse retryable request next retry delay: {}"_format( e.what()); continue; @@ -219,7 +260,7 @@ static SerialiseRetryableRequestsResult serialize_retryable_requests( uint32_t reason_u32 = node_dict.require(REASON_KEY); node.reason = static_cast(reason_u32); } catch (const std::exception& e) { - result.bt.read_error = + result.bt.error = "Failed to parse retryable request reason {}"_format(e.what()); continue; } @@ -230,7 +271,7 @@ static SerialiseRetryableRequestsResult serialize_retryable_requests( } } } - result.bt.success = result.bt.read_error.empty(); + result.bt.success = result.bt.error.empty(); } return result; } @@ -278,10 +319,10 @@ SerialiseSwarmsResult ServiceNode::serialize_swarms( try { version = d.require(VERSION_KEY); } catch (const std::exception& e) { - result.bt.read_error = "Failed to parse version: {}"_format(e.what()); + result.bt.error = "Failed to parse version: {}"_format(e.what()); } - if (result.bt.read_error.empty()) { + if (result.bt.error.empty()) { // Initially a dummy list that we will std::move the real list into oxenc::bt_list_consumer swarm_list("l"); try { @@ -289,64 +330,64 @@ SerialiseSwarmsResult ServiceNode::serialize_swarms( assert(key == NETWORK_SWARMS_KEY); swarm_list = std::move(list); } catch (const std::exception& e) { - result.bt.read_error = "Failed to parse network swarms: {}"_format(e.what()); + result.bt.error = "Failed to parse network swarms: {}"_format(e.what()); } - while (result.bt.read_error.empty() && !swarm_list.is_finished()) { + while (result.bt.error.empty() && 
!swarm_list.is_finished()) { auto swarm = swarm_list.consume_list_consumer(); uint64_t swarm_id = 0; try { swarm_id = swarm.consume(); } catch (const std::exception& e) { - result.bt.read_error = + result.bt.error = "Failed to parse swarm id from swarm list: {}"_format(e.what()); continue; } std::set& keys = result.network_swarms[swarm_id]; - while (result.bt.read_error.empty() && !swarm.is_finished()) { + while (result.bt.error.empty() && !swarm.is_finished()) { try { auto bytes = swarm.consume(); keys.insert(keys.end(), crypto::legacy_pubkey::from_bytes(bytes)); } catch (const std::exception& e) { - result.bt.read_error = + result.bt.error = "Failed to parse swarm pubkey from swarm: {}"_format(e.what()); } } } } - if (result.bt.read_error.empty()) { + if (result.bt.error.empty()) { try { result.swarm_cur_swarm_id = d.require(SWARM_CUR_SWARM_ID); } catch (const std::exception& e) { - result.bt.read_error = + result.bt.error = "Failed to swarm's current swarm ID: {}"_format(e.what()); } } - if (result.bt.read_error.empty()) { + if (result.bt.error.empty()) { oxenc::bt_list_consumer swarm_members("l"); try { auto [key, list] = d.next_list_consumer(); assert(key == SWARM_MEMBERS_KEY); swarm_members = std::move(list); } catch (const std::exception& e) { - result.bt.read_error = "Failed to parse swarm members: {}"_format(e.what()); + result.bt.error = "Failed to parse swarm members: {}"_format(e.what()); } - while (result.bt.read_error.empty() && !swarm_members.is_finished()) { + while (result.bt.error.empty() && !swarm_members.is_finished()) { try { auto bytes = swarm_members.consume(); - result.swarm_members[crypto::legacy_pubkey::from_bytes(bytes)]; + result.swarm_members[crypto::legacy_pubkey::from_bytes(bytes)] = {}; } catch (const std::exception& e) { - result.bt.read_error = + result.bt.error = "Failed to parse swarm member from list: {}"_format(e.what()); } } } } - result.bt.success = result.bt.read_error.empty(); + result.bt.success = result.bt.error.empty(); 
} return result; @@ -377,7 +418,7 @@ ServiceNode::ServiceNode( network_.swarms_ = std::move(swarm_result.network_swarms); swarm_.cur_swarm_id_ = swarm_result.swarm_cur_swarm_id; } else { - log::error(logcat, "Deserialising of swarms failed: {}", swarm_result.bt.read_error); + log::error(logcat, "Deserialising of swarms failed: {}", swarm_result.bt.error); swarms_blob.clear(); } @@ -392,7 +433,7 @@ ServiceNode::ServiceNode( log::error( logcat, "Deserialising of retryable requests failed: {}", - retryable_result.bt.read_error); + retryable_result.bt.error); retryable_blob.clear(); } @@ -408,6 +449,19 @@ ServiceNode::ServiceNode( retryable_requests.size(), util::get_human_readable_bytes(retryable_blob.size())); + // Check if the DB was empty and remember if so for later when talking to swarm members on + // handshake that we need to request a DB dump from them to populate our DB. In the edge case + // where there _are_ 0 messages, this will request a DB dump of 0 messages and essentially + // no-op. + if (db->get_message_count(Database::GetMessageCount::Owned) == 0) { + swarm_.db_was_initially_empty = true; + + // The 'cur_swarm_id' might be INVALID_SWARM_ID. This will be the case if the DB was deletd + // (and so the blobs storing our swarms were also deleted). The swarm is then + // bootstrapped to a proper swarm when we process the first handshake from a swarm member. 
+ swarm_.db_was_initially_empty_with_swarm_id = swarm_.cur_swarm_id_; + } + omq_server->add_timer( [this] { std::lock_guard l{sn_mutex_}; @@ -807,43 +861,92 @@ void ServiceNode::check_new_members() { pk, fmt::join(NEW_SWARM_MEMBER_HANDSHAKE_VERSION, "."), fmt::join(c->version, ".")); - swarm_.set_member_contact_details_ready(pk); + + std::lock_guard network_lock{network().mut_}; + if (SwarmMemberState* member = swarm_.is_member_locked(pk); member) + member->status = SwarmMemberStatus::Ready; continue; } - log::debug(logcat, "Initiating contact with new swarm member {}", pk); - omq_server_->request( - c->pubkey_x25519.view(), - "sn.data_ready", - [this, pk](bool success, std::vector data) { - if (data.empty()) { - success = false; - data.push_back("Empty reply"s); - } else if (data[0] != "OK"sv) { - success = false; - } - if (!success) { - log::info( - logcat, - "Failed to connect to remote SS {} to initiate new " - "data transfer ({}); will retry soon", - pk, - fmt::join(data, ", ")); - return; + auto on_sn_data_ready_response = [this, pk](bool success, std::vector data) { + if (data.empty()) { + success = false; + data.push_back("Empty reply"s); + } else if (data[0] != "OK"sv) { + success = false; + } + + if (success) { + log::debug( + logcat, + "Successful contact made with swarm member {}, marking as ready", + pk); + } else { + log::info( + logcat, + "Failed to connect to remote SS {} to initiate new " + "data transfer ({}); will retry soon", + pk, + fmt::join(data, ", ")); + } + + // The 'pk' member might not be in the swarm anymore if the request elapsed over a + // period of time where the swarm composition changed. + std::lock_guard network_lock{network().mut_}; + if (SwarmMemberState* member = swarm_.is_member_locked(pk); member) { + // Update the requested DB dump state machine if necessary. + SwarmRequestedDBDump& status = member->our_ss_requested_db_dump; + if (status == SwarmRequestedDBDump::RequestUnderway) { + status = success ? 
SwarmRequestedDBDump::Done + : SwarmRequestedDBDump::NeedsToRequest; + } + + if (success) + member->status = SwarmMemberStatus::Ready; + } + }; + + if (c->version >= SN_DATA_READY_WITH_REQUEST_VERSION) { + // Build 'data ready' request + snode::DataReadyRequest request = {}; + { + std::lock_guard network_lock{network().mut_}; + if (SwarmMemberState* member = swarm_.is_member_locked(pk); member) { + SwarmRequestedDBDump& status = member->our_ss_requested_db_dump; + if (status == SwarmRequestedDBDump::NeedsToRequest) { + status = SwarmRequestedDBDump::RequestUnderway; + request.needs_db_dump = true; } - log::debug( - logcat, - "Successful contact made with swarm member {}, marking as ready", - pk); - swarm_.set_member_contact_details_ready(pk); - }); + } + } + + // Serialise our response and send it off + snode::SerialiseDataReadyRequestResult serialised = + snode::serialise_data_ready_request(Serialise::Write, "", request); + assert(serialised.bt.success); + + log::debug( + logcat, + "Initiating contact with new swarm member {}{}", + pk, + request.needs_db_dump ? 
" (requesting DB dump)" : ""); + omq_server_->request( + c->pubkey_x25519.view(), + "sn.data_ready", + on_sn_data_ready_response, + std::move(serialised.bt.write_payload)); + } else { + log::debug(logcat, "Initiating contact with new swarm member {}", pk); + omq_server_->request( + c->pubkey_x25519.view(), "sn.data_ready", on_sn_data_ready_response); + } } if (auto send_now = swarm_.extract_contacts_needing_db_dump(); !send_now.empty()) { auto msgs = db->retrieve_all(); log::debug( logcat, - "Initiating swarm message dump ({} message) to new swarm member(s): {}", + "Initiating swarm message dump ({} message) to swarm member(s): {}", msgs.size(), fmt::join(send_now, ", ")); relay_messages(std::move(msgs), send_now); @@ -1101,6 +1204,12 @@ void ServiceNode::update_swarms(std::promise* on_finish) { params.dump()); } +void ServiceNode::set_member_needs_db_dump(const crypto::legacy_pubkey& pk) { + std::lock_guard lock{network().mut_}; // Use the same lock as Swarm member functions + if (SwarmMemberState* state = swarm_.is_member_locked(pk); state) + state->their_ss_needs_db_dump = true; +} + void ServiceNode::process_snodes_update(std::string_view data) { auto maybe_bu = parse_swarm_update(data, our_keys_.pub); @@ -1591,7 +1700,7 @@ std::string ServiceNode::get_status_line() const { STORAGE_SERVER_VERSION_STRING, oxenss::is_mainnet ? "" : " (TESTNET)", syncing_ ? "; SYNCING" : "", - db->get_message_count(), + db->get_message_count(Database::GetMessageCount::All), util::get_human_readable_bytes(db->get_used_bytes()), db->get_owner_count(), stats.client_store_requests, diff --git a/oxenss/snode/service_node.h b/oxenss/snode/service_node.h index c4a1d3400..a88e7d281 100644 --- a/oxenss/snode/service_node.h +++ b/oxenss/snode/service_node.h @@ -58,6 +58,7 @@ inline constexpr hf_revision STORAGE_SERVER_HARDFORK = {19, 6}; // The storage server version at which initial handshaking is supported before attempting a swarm // message transfer. 
inline constexpr std::array NEW_SWARM_MEMBER_HANDSHAKE_VERSION = {2, 10, 0}; +inline constexpr std::array SN_DATA_READY_WITH_REQUEST_VERSION = {2, 10, 0}; // TODO: Bump the version class Swarm; @@ -322,6 +323,12 @@ class ServiceNode { // Called when oxend notifies us of a new block to update swarm info void update_swarms(std::promise* on_completion = nullptr); + // Mark the swarm member identified by 'pk' as needing a dump of the DB. When the 'check new + // members' routine for swarms is periodically executed, swarm members marked with this flag + // will then get the entire DB synchronised to them. No-op if the key does not match anyone in + // the swarm. + void set_member_needs_db_dump(const crypto::legacy_pubkey& pk); + server::OMQ& omq_server() { return omq_server_; } std::condition_variable retryable_requests_cv; @@ -329,6 +336,17 @@ class ServiceNode { void retryable_requests_thread_entry_point(); }; +struct DataReadyRequest { + bool needs_db_dump; +}; + +struct SerialiseDataReadyRequestResult { + SerialiseBTResult bt; + DataReadyRequest request; +}; + +SerialiseDataReadyRequestResult serialise_data_ready_request( + Serialise serialise, std::string_view read_data, const DataReadyRequest& write_data); } // namespace oxenss::snode template <> diff --git a/oxenss/snode/swarm.cpp b/oxenss/snode/swarm.cpp index a60d0ea80..2886ff4fc 100644 --- a/oxenss/snode/swarm.cpp +++ b/oxenss/snode/swarm.cpp @@ -129,6 +129,8 @@ SwarmEvents Swarm::update_swarms( std::lock_guard lock{network.mut_}; auto events = derive_swarm_events(height, swarms); + if (db_was_initially_empty_with_swarm_id == INVALID_SWARM_ID) + db_was_initially_empty_with_swarm_id = events.our_swarm_id; if (events.our_swarm_id != INVALID_SWARM_ID) { for (const auto& pk : events.new_swarm_members) @@ -145,29 +147,62 @@ SwarmEvents Swarm::update_swarms( it++; } - // Add members from the swarm that are missing from our runtime state + // TODO: Remove the versions checks below after everyone migrates their SQL DB 
to v1. The + // version checks gate the new behaviour where this SS will request a dump of the swarm + // member's DB to synchronise new messages. + // + // When a SS upgrades to this version, their DB is initially set to v0 and all the prior + // active service nodes that upgrade will have the chain synchronised and their SS's sitting + // in the correct swarm. We do _not_ want those storage servers to, on upgrade, request a DB + // dump of all the messages from each swarm peer as they are (presumably) relatively synced. + // + // The SS's on v0 don't persist the swarm state to the DB, so on startup they always + // re-bootstrap the state of their swarms. This populates the new-swarm-members array and + // hence triggers the extraneous swarm dump. + // + // The version gate protects against that happening to all the individual nodes on upgrade. + // Once all v0 SS's upgrade, the DB will be marked v1. From that point, swarms are persisted + // onto disk and so any SS's that appear in the new-swarm-members array is _actually_ a new + // SS and we _should_ request a DB a dump from them to synchronise messages they might have + // for us. + // + // New incoming nodes in general are going to end up having 0 messages for us if they are + // joining the network for the first time. + // + // If we are joining a swarm, then, all the members of the swarm are in the + // new-swarm-members array and we will request a DB dump from them. + // + // In a swarm dissolving case, then, these new nodes will have a chunk of messages in the + // adjacent message space that belong to this swarm they are merging into. That is handled + // here. + + // Add members from the swarm that are missing from our runtime state and request a DB dump + // from them to ensure we have all the messages they have that we don't. 
for (auto it : events.new_swarm_members) { auto& pair = members_[it]; + if (oxenss::tmp_init_db_version == 1) { + if (pair.our_ss_requested_db_dump == SwarmRequestedDBDump::Nil) + pair.our_ss_requested_db_dump = SwarmRequestedDBDump::NeedsToRequest; + } + } - // TODO: Remove this after everyone migrates their DB version to v1. v1 is when we - // started making the nodes store the swarm list and their swarm members to the DB to - // persist on restart. - // - // Before this, on startup they would consider all the nodes in the swarm they loaded - // from get_service_nodes as joining the swarm and perform a full message DB dump. - // Deploying this onto a live network would cause all the nodes to do a DB dump to each - // other the moment they upgraded. - // - // However after they upgrade and start persisting the swarm state to disk, from that - // point onwards they will correctly identify nodes that are leaving and joining their - // swarm and only do a message dump when necessary. - if (oxenss::tmp_init_db_version == 0) { - pair.new_swarm_member = false; // Prevent the swarm DB dump on newly migrated nodes - } else { - pair.new_swarm_member = true; + // If the DB was empty on startup then we mark all swarm members as peers that we need to + // request a DB dump from. Note we only do this if the swarm matches the initial swarm we + // were in when the DB was queried. We might have changed swarms since startup, in which + // case, the above branch will already initiate a DB dump request for us. + // + // This also covers the case where someone drops the messages table and restarts the SS, we + // need to resync all the messages from everyone in the swarm. 
+ if (oxenss::tmp_init_db_version == 1) { + if (db_was_initially_empty_with_swarm_id == events.our_swarm_id) { + for (auto& it : members_) { + if (it.second.our_ss_requested_db_dump == SwarmRequestedDBDump::Nil) + it.second.our_ss_requested_db_dump = SwarmRequestedDBDump::NeedsToRequest; + } } } } + oxenss::tmp_init_db_version = 1; // Disable after the first swarm update cur_swarm_id_ = events.our_swarm_id; @@ -218,6 +253,13 @@ std::optional Swarm::is_member(const crypto::ed25519_pubkey& p return result; } +SwarmMemberState* Swarm::is_member_locked(const crypto::legacy_pubkey& pk) { + SwarmMemberState* result = nullptr; + if (auto it = members_.find(pk); it != members_.end()) + result = &it->second; + return result; +} + size_t Swarm::size() const { std::shared_lock lock{network.mut_}; return members_.size(); @@ -251,8 +293,8 @@ std::set Swarm::extract_contacts_needing_db_dump() { for (auto& it : members_) { if (it.second.status == SwarmMemberStatus::Ready) { const crypto::legacy_pubkey& pk = it.first; - if (it.second.new_swarm_member) { - it.second.new_swarm_member = false; + if (it.second.their_ss_needs_db_dump) { + it.second.their_ss_needs_db_dump = false; result.insert(pk); } } @@ -260,12 +302,4 @@ std::set Swarm::extract_contacts_needing_db_dump() { return result; } - -void Swarm::set_member_contact_details_ready(const crypto::legacy_pubkey& pk) { - std::lock_guard lock{network.mut_}; - auto it = members_.find(pk); - assert(it != members_.end()); - if (it != members_.end()) - it->second.status = SwarmMemberStatus::Ready; -} } // namespace oxenss::snode diff --git a/oxenss/snode/swarm.h b/oxenss/snode/swarm.h index ccc91e5f2..f06a9d345 100644 --- a/oxenss/snode/swarm.h +++ b/oxenss/snode/swarm.h @@ -34,13 +34,26 @@ enum struct SwarmMemberStatus { Ready, }; +enum struct SwarmRequestedDBDump { + Nil, + NeedsToRequest, + RequestUnderway, + Done, +}; + struct SwarmMemberState { SwarmMemberStatus status; - // Set if this member joined the swarm. 
They are assumed to not have any of the messages for - // the swarm yet so a full DB dump will be initiated for messages we own that belong to the - // swarm. - bool new_swarm_member; + // Flags for if our storage server needs to initiate a request to receive a DB dump from this + // member. 'Nil' if no action is to be taken, otherwise this flag transition from + // 'NeedsToRequest' to 'RequestUnderway' to 'Done' via the outgoing data ready handshake. + SwarmRequestedDBDump our_ss_requested_db_dump; + + // Set if this swarm member has requested a DB dump from us in the data ready handshake. If set + // they are assumed to not have any of the messages for the swarm yet so a full DB dump will be + // initiated for messages we own that belong to the swarm when the 'check new members' routine + // occurs. + bool their_ss_needs_db_dump; // The earliest timestamp at which the swarm will check if they have received contact // information for this member yet and can send them data. Only utilised when status is @@ -64,6 +77,19 @@ class Swarm { swarm_id_t cur_swarm_id_ = INVALID_SWARM_ID; + // Track if the DB was empty on startup. It is important to remember this on startup because + // if you were active, you may start receiving messages before the server contacts peers to + // request a swarm DB dump to synchronise messages which would seed the database and checking + // this later would fail. + bool db_was_initially_empty = false; + + // Track which swarm we were set to when we determined that the DB was empty. This helps track + // which set of peers we should attempt to request a DB dump from since swarms may change during + // that asynchronous process. If the swarm does change, the act of joining a new swarm triggers + // a DB dump which invalidates the need to request a DB dump from our initial but now, + // irrelevant swarm peers, identified by this swarm ID. 
+ swarm_id_t db_was_initially_empty_with_swarm_id = INVALID_SWARM_ID; + public: Swarm(Network& network, const crypto::legacy_pubkey& our_pk) : network{network}, our_pk{our_pk} {} @@ -94,6 +120,11 @@ class Swarm { std::optional is_member(const crypto::x25519_pubkey& pk) const; std::optional is_member(const crypto::ed25519_pubkey& pk) const; + // Returns the underlying swarm member's state. Returns a null pointer if 'pk' is not a member + // in your swarm. Caller must hold a lock on the network mutex to call this and the pointer is + // only valid whilst that lock remains held. + SwarmMemberState* is_member_locked(const crypto::legacy_pubkey& pk); + // Returns the size of this swarm (including this node). size_t size() const; @@ -106,11 +137,6 @@ class Swarm { // details for, mark them as ready and need a dump of the DB. std::set extract_contacts_needing_db_dump(); - // Marks a pending member as ready, so that it is returned by the next call to - // `extract_contact_details_ready_members()`, and is no longer returned by - // `extract_contract_details_pending_member()`. 
- void set_member_contact_details_ready(const crypto::legacy_pubkey& pk); - swarm_id_t our_swarm_id() const { std::shared_lock lock{network.mut_}; return cur_swarm_id_; diff --git a/oxenss/storage/database.cpp b/oxenss/storage/database.cpp index cbe98e6f5..a820cf040 100644 --- a/oxenss/storage/database.cpp +++ b/oxenss/storage/database.cpp @@ -605,8 +605,17 @@ void Database::clean_expired() { to_epoch_ms(std::chrono::system_clock::now())); } -int64_t Database::get_message_count() { - return get_impl(false)->prepared_get("SELECT COUNT(*) FROM messages"); +int64_t Database::get_message_count(GetMessageCount get) { + int64_t result = 0; + switch (get) { + case GetMessageCount::All: + result = get_impl(false)->prepared_get("SELECT COUNT(*) FROM messages"); + break; + case GetMessageCount::Owned: + result = get_impl(false)->prepared_get("SELECT COUNT(*) FROM owned_messages"); + break; + } + return result; } int64_t Database::get_owner_count() { diff --git a/oxenss/storage/database.hpp b/oxenss/storage/database.hpp index baf3c2c39..17c1feb2b 100644 --- a/oxenss/storage/database.hpp +++ b/oxenss/storage/database.hpp @@ -108,8 +108,13 @@ class Database { // Retrieves all messages. std::vector retrieve_all(); + enum class GetMessageCount { + All, + Owned, // Only messages that belong to this node's swarm + }; + // Return the total number of messages stored - int64_t get_message_count(); + int64_t get_message_count(GetMessageCount get); // Returns the per-owner counts of stored messages, for storage statistics purposes. 
std::vector get_message_counts(); diff --git a/unit_test/storage.cpp b/unit_test/storage.cpp index 96460481b..cce142655 100644 --- a/unit_test/storage.cpp +++ b/unit_test/storage.cpp @@ -42,7 +42,7 @@ TEST_CASE("storage - data persistence", "[storage]") { CHECK(storage.store({pubkey, hash, ns, now, now + ttl, bytes}) == StoreResult::New); CHECK(storage.get_owner_count() == 1); - CHECK(storage.get_message_count() == 1); + CHECK(storage.get_message_count(Database::GetMessageCount::All) == 1); // the database is closed when storage goes out of scope } @@ -51,7 +51,7 @@ TEST_CASE("storage - data persistence", "[storage]") { Database storage{"."}; CHECK(storage.get_owner_count() == 1); - CHECK(storage.get_message_count() == 1); + CHECK(storage.get_message_count(Database::GetMessageCount::All) == 1); auto [items, more] = storage.retrieve(pubkey, namespace_id::Default, ""); @@ -79,7 +79,7 @@ TEST_CASE("storage - data persistence, namespace", "[storage][namespace]") { CHECK(storage.store({pubkey, hash, ns, now, now + ttl, bytes}) == StoreResult::New); CHECK(storage.get_owner_count() == 1); - CHECK(storage.get_message_count() == 1); + CHECK(storage.get_message_count(Database::GetMessageCount::All) == 1); // the database is closed when storage goes out of scope } @@ -88,7 +88,7 @@ TEST_CASE("storage - data persistence, namespace", "[storage][namespace]") { Database storage{"."}; CHECK(storage.get_owner_count() == 1); - CHECK(storage.get_message_count() == 1); + CHECK(storage.get_message_count(Database::GetMessageCount::All) == 1); auto [items, more] = storage.retrieve(pubkey, ns, ""); @@ -131,7 +131,7 @@ TEST_CASE("storage - re-storing existing hash", "[storage]") { CHECK(ins == StoreResult::Exists); CHECK(storage.get_owner_count() == 1); - CHECK(storage.get_message_count() == 1); + CHECK(storage.get_message_count(Database::GetMessageCount::All) == 1); } TEST_CASE("storage - only return entries for specified pubkey", "[storage]") { @@ -152,7 +152,7 @@ TEST_CASE("storage - 
only return entries for specified pubkey", "[storage]") { StoreResult::New); CHECK(storage.get_owner_count() == 2); - CHECK(storage.get_message_count() == 2); + CHECK(storage.get_message_count(Database::GetMessageCount::All) == 2); const auto lastHash = ""; { @@ -184,7 +184,7 @@ TEST_CASE("storage - return entries older than lasthash", "[storage]") { } CHECK(storage.get_owner_count() == 1); - CHECK(storage.get_message_count() == 100); + CHECK(storage.get_message_count(Database::GetMessageCount::All) == 100); { const auto lastHash = "hash0"; @@ -228,7 +228,7 @@ TEST_CASE("storage - remove expired entries", "[storage]") { StoreResult::New); CHECK(storage.get_owner_count() == 3); - CHECK(storage.get_message_count() == 6); + CHECK(storage.get_message_count(Database::GetMessageCount::All) == 6); { const auto lastHash = ""; @@ -245,7 +245,7 @@ TEST_CASE("storage - remove expired entries", "[storage]") { } CHECK(storage.get_owner_count() == 2); - CHECK(storage.get_message_count() == 2); + CHECK(storage.get_message_count(Database::GetMessageCount::All) == 2); } TEST_CASE("storage - bulk data storage", "[storage]") { @@ -284,7 +284,7 @@ TEST_CASE("storage - bulk data storage", "[storage]") { } CHECK(storage.get_owner_count() == 1); - CHECK(storage.get_message_count() == num_items); + CHECK(storage.get_message_count(Database::GetMessageCount::All) == num_items); } TEST_CASE("storage - bulk storage with overlap", "[storage]") { @@ -307,7 +307,7 @@ TEST_CASE("storage - bulk storage with overlap", "[storage]") { StoreResult::New); CHECK(storage.get_owner_count() == 1); - CHECK(storage.get_message_count() == 2); + CHECK(storage.get_message_count(Database::GetMessageCount::All) == 2); // bulk store { @@ -326,7 +326,7 @@ TEST_CASE("storage - bulk storage with overlap", "[storage]") { } CHECK(storage.get_owner_count() == 1); - CHECK(storage.get_message_count() == num_items); + CHECK(storage.get_message_count(Database::GetMessageCount::All) == num_items); // retrieve { @@ -359,7 
+359,7 @@ TEST_CASE("storage - retrieve limit", "[storage]") { } CHECK(storage.get_owner_count() == 2); - CHECK(storage.get_message_count() == num_entries + 5); + CHECK(storage.get_message_count(Database::GetMessageCount::All) == num_entries + 5); CHECK(storage.retrieve(pubkey, namespace_id::Default, "").first.size() == num_entries); CHECK(storage.retrieve(pubkey, namespace_id::Default, "", 10).first.size() == 10); From b9f5d2d1e80655d59fac7d4218594334fe0377dc Mon Sep 17 00:00:00 2001 From: doylet Date: Mon, 23 Jun 2025 15:40:23 +1000 Subject: [PATCH 29/50] Add 10GiB message payload to serialise test --- unit_test/serialization.cpp | 38 +++++++++++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/unit_test/serialization.cpp b/unit_test/serialization.cpp index ce250bc56..5ac5f6d1d 100644 --- a/unit_test/serialization.cpp +++ b/unit_test/serialization.cpp @@ -1,9 +1,11 @@ #include #include #include +#include #include +#include #include #include @@ -64,9 +66,41 @@ TEST_CASE("v1 serialization - batch serialization", "[serialization]") { auto first = serialized.front(); const size_t num_messages = (SERIALIZATION_BATCH_SIZE / (serialized.front().size() - 2)) + 1; msgs = {num_messages, msgs.front()}; - serialized = serialize_messages(msgs.begin(), msgs.end(), 1); + serialized = serialize_messages(msgs.begin(), msgs.end(), SERIALIZATION_VERSION_BT); CHECK(serialized.size() == 1); msgs.push_back(msgs.front()); - serialized = serialize_messages(msgs.begin(), msgs.end(), 1); + serialized = serialize_messages(msgs.begin(), msgs.end(), SERIALIZATION_VERSION_BT); CHECK(serialized.size() == 2); } + +TEST_CASE("v1 serialization - message payload 10GiB", "[serialization]") { + oxenss::user_pubkey pub_key; + REQUIRE(pub_key.load("054368520005786b249bcd461d28f75e560ea794014eeb17fcf6003f37d876783e"s)); + + const std::chrono::system_clock::time_point timestamp{1'622'576'077s}; + oxenss::message base_msg{ + pub_key, + "hash", + 
oxenss::namespace_id::Default, + timestamp, + timestamp + 24h, + std::string(1 * 1024 * 1024 /*1MiB*/, 'x')}; + std::vector msg_list(10'000, base_msg); // 10 GiB total + + auto begin = std::chrono::high_resolution_clock::now(); + auto serialized = serialize_messages(msg_list.begin(), msg_list.end(), SERIALIZATION_VERSION_BT); + auto elapsed = std::chrono::high_resolution_clock::now() - begin; + + size_t total_bytes = msg_list.size() * base_msg.data.size(); + std::string total_bytes_str = oxenss::util::get_human_readable_bytes(total_bytes); + double total_gbs = static_cast(total_bytes) / (1024 * 1024 * 1024); + double gbs_per_s = + total_gbs / std::chrono::duration_cast(elapsed).count(); + + fmt::println( + "Messages: {}; Size: {}; Elapsed: {}; Rate: {:.2f} GiB/s", + msg_list.size(), + oxenss::util::get_human_readable_bytes(total_bytes), + std::chrono::duration_cast(elapsed), + gbs_per_s); +} From 9ac6c91623f9fce41ca09f089d6984891b30f5cc Mon Sep 17 00:00:00 2001 From: doylet Date: Fri, 27 Jun 2025 16:19:22 +1000 Subject: [PATCH 30/50] Linting --- oxenss/snode/service_node.cpp | 4 ++-- oxenss/snode/service_node.h | 2 +- oxenss/storage/database.hpp | 2 +- unit_test/serialization.cpp | 9 +++++---- 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/oxenss/snode/service_node.cpp b/oxenss/snode/service_node.cpp index d7d7e124d..6590a2e85 100644 --- a/oxenss/snode/service_node.cpp +++ b/oxenss/snode/service_node.cpp @@ -362,7 +362,7 @@ SerialiseSwarmsResult ServiceNode::serialize_swarms( result.swarm_cur_swarm_id = d.require(SWARM_CUR_SWARM_ID); } catch (const std::exception& e) { result.bt.error = - "Failed to swarm's current swarm ID: {}"_format(e.what()); + "Failed to parse swarm's current swarm ID: {}"_format(e.what()); } } @@ -1205,7 +1205,7 @@ void ServiceNode::update_swarms(std::promise* on_finish) { } void ServiceNode::set_member_needs_db_dump(const crypto::legacy_pubkey& pk) { - std::lock_guard lock{network().mut_}; // Use the same lock as Swarm member 
functions + std::lock_guard lock{network().mut_}; // Use the same lock as Swarm member functions if (SwarmMemberState* state = swarm_.is_member_locked(pk); state) state->their_ss_needs_db_dump = true; } diff --git a/oxenss/snode/service_node.h b/oxenss/snode/service_node.h index a88e7d281..d7fbff355 100644 --- a/oxenss/snode/service_node.h +++ b/oxenss/snode/service_node.h @@ -58,7 +58,7 @@ inline constexpr hf_revision STORAGE_SERVER_HARDFORK = {19, 6}; // The storage server version at which initial handshaking is supported before attempting a swarm // message transfer. inline constexpr std::array NEW_SWARM_MEMBER_HANDSHAKE_VERSION = {2, 10, 0}; -inline constexpr std::array SN_DATA_READY_WITH_REQUEST_VERSION = {2, 10, 0}; // TODO: Bump the version +inline constexpr std::array SN_DATA_READY_WITH_REQUEST_VERSION = {2, 11, 0}; class Swarm; diff --git a/oxenss/storage/database.hpp b/oxenss/storage/database.hpp index 17c1feb2b..8a3e12763 100644 --- a/oxenss/storage/database.hpp +++ b/oxenss/storage/database.hpp @@ -110,7 +110,7 @@ class Database { enum class GetMessageCount { All, - Owned, // Only messages that belong to this node's swarm + Owned, // Only messages that belong to this node's swarm }; // Return the total number of messages stored diff --git a/unit_test/serialization.cpp b/unit_test/serialization.cpp index 5ac5f6d1d..bc51254eb 100644 --- a/unit_test/serialization.cpp +++ b/unit_test/serialization.cpp @@ -85,15 +85,16 @@ TEST_CASE("v1 serialization - message payload 10GiB", "[serialization]") { timestamp, timestamp + 24h, std::string(1 * 1024 * 1024 /*1MiB*/, 'x')}; - std::vector msg_list(10'000, base_msg); // 10 GiB total + std::vector msg_list(10'000, base_msg); // 10 GiB total auto begin = std::chrono::high_resolution_clock::now(); - auto serialized = serialize_messages(msg_list.begin(), msg_list.end(), SERIALIZATION_VERSION_BT); + auto serialized = + serialize_messages(msg_list.begin(), msg_list.end(), SERIALIZATION_VERSION_BT); auto elapsed = 
std::chrono::high_resolution_clock::now() - begin; - size_t total_bytes = msg_list.size() * base_msg.data.size(); + size_t total_bytes = msg_list.size() * base_msg.data.size(); std::string total_bytes_str = oxenss::util::get_human_readable_bytes(total_bytes); - double total_gbs = static_cast(total_bytes) / (1024 * 1024 * 1024); + double total_gbs = static_cast(total_bytes) / (1024 * 1024 * 1024); double gbs_per_s = total_gbs / std::chrono::duration_cast(elapsed).count(); From 26b6b22a2d8a58195e7b69e12d800c52e86dc57d Mon Sep 17 00:00:00 2001 From: doylet Date: Fri, 27 Jun 2025 16:26:26 +1000 Subject: [PATCH 31/50] Reduce serialisation test to 5GiB to not OOM on CI --- unit_test/serialization.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unit_test/serialization.cpp b/unit_test/serialization.cpp index bc51254eb..e69e14118 100644 --- a/unit_test/serialization.cpp +++ b/unit_test/serialization.cpp @@ -73,7 +73,7 @@ TEST_CASE("v1 serialization - batch serialization", "[serialization]") { CHECK(serialized.size() == 2); } -TEST_CASE("v1 serialization - message payload 10GiB", "[serialization]") { +TEST_CASE("v1 serialization - message payload 5GiB", "[serialization]") { oxenss::user_pubkey pub_key; REQUIRE(pub_key.load("054368520005786b249bcd461d28f75e560ea794014eeb17fcf6003f37d876783e"s)); @@ -85,7 +85,7 @@ TEST_CASE("v1 serialization - message payload 10GiB", "[serialization]") { timestamp, timestamp + 24h, std::string(1 * 1024 * 1024 /*1MiB*/, 'x')}; - std::vector msg_list(10'000, base_msg); // 10 GiB total + std::vector msg_list(500, base_msg); // 5 GiB total auto begin = std::chrono::high_resolution_clock::now(); auto serialized = From 6fd839165ea95349912ae1f08e4670720b4378fe Mon Sep 17 00:00:00 2001 From: doylet Date: Fri, 27 Jun 2025 16:32:50 +1000 Subject: [PATCH 32/50] Reduce serialisation test to 100MiB to not OOM on CI --- unit_test/serialization.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/unit_test/serialization.cpp b/unit_test/serialization.cpp index e69e14118..7e96f6a42 100644 --- a/unit_test/serialization.cpp +++ b/unit_test/serialization.cpp @@ -73,7 +73,7 @@ TEST_CASE("v1 serialization - batch serialization", "[serialization]") { CHECK(serialized.size() == 2); } -TEST_CASE("v1 serialization - message payload 5GiB", "[serialization]") { +TEST_CASE("v1 serialization - message payload 100MiB", "[serialization]") { oxenss::user_pubkey pub_key; REQUIRE(pub_key.load("054368520005786b249bcd461d28f75e560ea794014eeb17fcf6003f37d876783e"s)); @@ -85,7 +85,7 @@ TEST_CASE("v1 serialization - message payload 5GiB", "[serialization]") { timestamp, timestamp + 24h, std::string(1 * 1024 * 1024 /*1MiB*/, 'x')}; - std::vector msg_list(500, base_msg); // 5 GiB total + std::vector msg_list(100, base_msg); // 100 MiB total auto begin = std::chrono::high_resolution_clock::now(); auto serialized = From 5a8f1dea07efb56bcc4b22160a82b30e0e9b1a52 Mon Sep 17 00:00:00 2001 From: doylet Date: Mon, 14 Jul 2025 11:52:05 +1000 Subject: [PATCH 33/50] Stop potential infinite bootstrap of DB if swarm ID doesn't change --- oxenss/snode/service_node.cpp | 2 -- oxenss/snode/swarm.cpp | 11 ++++++----- oxenss/snode/swarm.h | 13 +++++++------ 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/oxenss/snode/service_node.cpp b/oxenss/snode/service_node.cpp index 6590a2e85..edde4fc8c 100644 --- a/oxenss/snode/service_node.cpp +++ b/oxenss/snode/service_node.cpp @@ -454,8 +454,6 @@ ServiceNode::ServiceNode( // where there _are_ 0 messages, this will request a DB dump of 0 messages and essentially // no-op. if (db->get_message_count(Database::GetMessageCount::Owned) == 0) { - swarm_.db_was_initially_empty = true; - // The 'cur_swarm_id' might be INVALID_SWARM_ID. This will be the case if the DB was deletd // (and so the blobs storing our swarms were also deleted). The swarm is then // bootstrapped to a proper swarm when we process the first handshake from a swarm member. 
diff --git a/oxenss/snode/swarm.cpp b/oxenss/snode/swarm.cpp index 2886ff4fc..d028b0cd6 100644 --- a/oxenss/snode/swarm.cpp +++ b/oxenss/snode/swarm.cpp @@ -193,11 +193,12 @@ SwarmEvents Swarm::update_swarms( // // This also covers the case where someone drops the messages table and restarts the SS, we // need to resync all the messages from everyone in the swarm. - if (oxenss::tmp_init_db_version == 1) { - if (db_was_initially_empty_with_swarm_id == events.our_swarm_id) { - for (auto& it : members_) { - if (it.second.our_ss_requested_db_dump == SwarmRequestedDBDump::Nil) - it.second.our_ss_requested_db_dump = SwarmRequestedDBDump::NeedsToRequest; + if (db_was_initially_empty_with_swarm_id == events.our_swarm_id && + !db_was_initially_empty_handled) { + db_was_initially_empty_handled = true; + for (auto& it : members_) { + if (it.second.our_ss_requested_db_dump == SwarmRequestedDBDump::Nil) { + it.second.our_ss_requested_db_dump = SwarmRequestedDBDump::NeedsToRequest; } } } diff --git a/oxenss/snode/swarm.h b/oxenss/snode/swarm.h index f06a9d345..a6eade761 100644 --- a/oxenss/snode/swarm.h +++ b/oxenss/snode/swarm.h @@ -77,19 +77,20 @@ class Swarm { swarm_id_t cur_swarm_id_ = INVALID_SWARM_ID; - // Track if the DB was empty on startup. It is important to remember this on startup because - // if you were active, you may start receiving messages before the server contacts peers to - // request a swarm DB dump to synchronise messages which would seed the database and checking - // this later would fail. - bool db_was_initially_empty = false; - // Track which swarm we were set to when we determined that the DB was empty. This helps track // which set of peers we should attempt to request a DB dump from since swarms may change during // that asynchronous process. If the swarm does change, the act of joining a new swarm triggers // a DB dump which invalidates the need to request a DB dump from our initial but now, // irrelevant swarm peers, identified by this swarm ID. 
+ // + // It is important to remember this on startup because if you were active, you may start + // receiving messages before the server contacts peers to request a swarm DB dump to synchronise + // messages which would seed the database and checking this later would fail. swarm_id_t db_was_initially_empty_with_swarm_id = INVALID_SWARM_ID; + // Flag that stops the DB initially empty w/ swarm ID from executing more than once. + bool db_was_initially_empty_handled = false; + public: Swarm(Network& network, const crypto::legacy_pubkey& our_pk) : network{network}, our_pk{our_pk} {} From 29623323d0e30b02700121b04968a2285d96a15f Mon Sep 17 00:00:00 2001 From: Thomas Winget Date: Tue, 17 Feb 2026 16:16:09 -0500 Subject: [PATCH 34/50] Change retryable requests storage format Instead of keeping retryable requests in memory and storing them in the database as a serialized monolith, now they're stored in a queryable way (and thus don't need to be kept in memory at all times either, which is good in the event that there are a lot). Changed the retry timing to be simpler as well; if something times out on a 5 second timeout, we don't need to retry super frequently. 
--- oxenss/rpc/request_handler.cpp | 27 +- oxenss/server/omq.cpp | 5 +- oxenss/snode/service_node.cpp | 555 +++------------------------------ oxenss/snode/service_node.h | 22 +- oxenss/snode/swarm.cpp | 6 +- oxenss/snode/swarm.h | 9 +- oxenss/storage/database.cpp | 120 ++++++- oxenss/storage/database.hpp | 19 ++ 8 files changed, 201 insertions(+), 562 deletions(-) diff --git a/oxenss/rpc/request_handler.cpp b/oxenss/rpc/request_handler.cpp index 691fe107d..9595cfdca 100644 --- a/oxenss/rpc/request_handler.cpp +++ b/oxenss/rpc/request_handler.cpp @@ -406,6 +406,7 @@ struct swarm_response { std::vector retry_nodes; std::string cmd; std::string req_payload; + int64_t db_req_id{0}; }; // Replies to a swarm request via its callback; sends an http::OK unless all of the @@ -426,15 +427,6 @@ static void reply_or_fail(snode::ServiceNode& sn, const std::shared_ptrcb(Response{res_code, std::move(res->result)}); - - if (res->retry_nodes.size()) { - snode::RequestRetry retry = {}; - retry.nodes = std::move(res->retry_nodes); - retry.cmd = res->cmd; - retry.req_payload = std::move(res->req_payload); - retry.create_time = std::chrono::steady_clock::now(); - sn.add_retryable_request(std::move(retry)); - } } SNStorageCCResult interpret_sn_storage_cc_response_parts( @@ -462,9 +454,6 @@ static void distribute_command(snode::ServiceNode& sn, std::shared_ptrpending += peers.size(); - // When a request to a peer fails, set the initial retry to 1s in the future - constexpr auto default_retry_delay = 1s; - for (auto& peer : peers) { auto ct = sn.contacts().find(peer.first); if (!ct || !*ct) { @@ -476,18 +465,14 @@ static void distribute_command(snode::ServiceNode& sn, std::shared_ptrpending--; - snode::RequestRetryEntry entry = {}; - entry.key = peer.first; - entry.reason = snode::RetryReason::NON_CONTACTABLE; - entry.deadline = std::chrono::steady_clock::now() + default_retry_delay; - res->retry_nodes.push_back(entry); + res->db_req_id = sn.db->add_retry_request(peer.first, res->cmd, 
res->req_payload, res->db_req_id); continue; } sn.omq_server()->request( ct->pubkey_x25519.view(), "sn.storage_cc", - [res, peer, peer_ed = ct->pubkey_ed25519, &sn, default_retry_delay]( + [res, peer, peer_ed = ct->pubkey_ed25519, &sn]( bool success, auto parts) { json peer_result; SNStorageCCResult store_result = @@ -533,11 +518,7 @@ static void distribute_command(snode::ServiceNode& sn, std::shared_ptrretry_nodes.push_back(entry); + res->db_req_id = sn.db->add_retry_request(peer.first, res->cmd, res->req_payload, res->db_req_id); } } else if (res->b64) { if (auto it = peer_result.find("signature"); diff --git a/oxenss/server/omq.cpp b/oxenss/server/omq.cpp index 96544814d..cec62a9b7 100644 --- a/oxenss/server/omq.cpp +++ b/oxenss/server/omq.cpp @@ -78,16 +78,15 @@ void OMQ::handle_sn_data_ready(oxenmq::Message& message) { if (request.needs_db_dump) service_node_->set_member_needs_db_dump(crypto::legacy_pubkey{ct->pubkey_ed25519}); - if (auto level = log::Level::debug; log::get_level(logcat) <= level) { + if (log::get_level(logcat) <= log::Level::debug) { std::string label; if (deserialised.bt.success) label = "rejected, bad request payload. 
{})"_format(deserialised.bt.error); else label = "rejected due to bad request args"; - log::log( + log::debug( logcat, - level, "sn.data ready processed (edpk: {}, db dump: {}): {}", ct->pubkey_ed25519, request.needs_db_dump, diff --git a/oxenss/snode/service_node.cpp b/oxenss/snode/service_node.cpp index edde4fc8c..497e6ac94 100644 --- a/oxenss/snode/service_node.cpp +++ b/oxenss/snode/service_node.cpp @@ -96,186 +96,6 @@ SerialiseDataReadyRequestResult serialise_data_ready_request( return result; } -static SerialiseRetryableRequestsResult serialize_retryable_requests( - Serialise serialise, std::string_view read_data, std::span write_data) { - SerialiseRetryableRequestsResult result = {}; - uint32_t version = 0; - - constexpr std::string_view VERSION_KEY = "@"; - constexpr std::string_view RETRYABLE_REQUESTS_KEY = "r"; - assert(VERSION_KEY < RETRYABLE_REQUESTS_KEY); - - // Retryable request keys - constexpr std::string_view COMMAND_KEY = "c"; - constexpr std::string_view REQ_PAYLOAD_KEY = "r"; - constexpr std::string_view CREATE_TIME_KEY = "t"; - constexpr std::string_view NODES_KEY = "u"; - assert(COMMAND_KEY < REQ_PAYLOAD_KEY); - assert(REQ_PAYLOAD_KEY < CREATE_TIME_KEY); - assert(CREATE_TIME_KEY < NODES_KEY); - - // Retrayble request entry keys - constexpr std::string_view KEY_KEY = "i"; - constexpr std::string_view DEADLINE_KEY = "l"; - constexpr std::string_view NEXT_RETRY_DELAY_KEY = "n"; - constexpr std::string_view REASON_KEY = "r"; - assert(KEY_KEY < DEADLINE_KEY); - assert(DEADLINE_KEY < NEXT_RETRY_DELAY_KEY); - assert(NEXT_RETRY_DELAY_KEY < REASON_KEY); - - if (serialise == Serialise::Write) { - oxenc::bt_dict_producer d; - d.append(VERSION_KEY, version); - - oxenc::bt_list_producer retry_list = d.append_list(RETRYABLE_REQUESTS_KEY); - for (const auto& it : write_data) { - oxenc::bt_dict_producer retry_dict = retry_list.append_dict(); - retry_dict.append(COMMAND_KEY, it.cmd); - retry_dict.append(REQ_PAYLOAD_KEY, it.req_payload); - uint64_t 
create_time_u64 = std::chrono::duration_cast( - it.create_time.time_since_epoch()) - .count(); - retry_dict.append(CREATE_TIME_KEY, create_time_u64); - oxenc::bt_list_producer node_list = retry_dict.append_list(NODES_KEY); - for (const auto& node_it : it.nodes) { - oxenc::bt_dict_producer node_dict = node_list.append_dict(); - uint32_t reason_u32 = static_cast(node_it.reason); - uint64_t deadline_u64 = std::chrono::duration_cast( - node_it.deadline.time_since_epoch()) - .count(); - uint64_t next_retry_delay_u64 = node_it.next_retry_delay.count(); - node_dict.append(KEY_KEY, node_it.key); - node_dict.append(DEADLINE_KEY, deadline_u64); - node_dict.append(NEXT_RETRY_DELAY_KEY, next_retry_delay_u64); - node_dict.append(REASON_KEY, reason_u32); - } - } - - result.bt.success = true; - result.bt.write_payload = d.view(); - } else { - if (read_data.size()) { - oxenc::bt_dict_consumer d{read_data}; - try { - version = d.require(VERSION_KEY); - } catch (const std::exception& e) { - result.bt.error = "Failed to parse retryable request version: {}"_format(e.what()); - } - - if (version != 0) - result.bt.error = - "Unrecognised retryable request version: {}, skipping"_format(version); - - if (result.bt.error.empty()) { - // Initially a dummy list that we will std::move the real list into - oxenc::bt_list_consumer retry_list("l"); - try { - auto [key, list] = d.next_list_consumer(); - assert(key == RETRYABLE_REQUESTS_KEY); - retry_list = std::move(list); - } catch (const std::exception& e) { - result.bt.error = "Failed to read retryable request list: {}"_format(e.what()); - } - - while (result.bt.error.empty() && !retry_list.is_finished()) { - auto request_dict = retry_list.consume_dict_consumer(); - - RequestRetry request = {}; - try { - request.cmd = request_dict.require(COMMAND_KEY); - } catch (const std::exception& e) { - result.bt.error = - "Failed to read retryable request command: {}"_format(e.what()); - continue; - } - - try { - request.req_payload = 
request_dict.require(REQ_PAYLOAD_KEY); - } catch (const std::exception& e) { - result.bt.error = - "Failed to read retryable request, request payload: {}"_format( - e.what()); - continue; - } - - try { - uint64_t create_time_u64 = request_dict.require(CREATE_TIME_KEY); - request.create_time = std::chrono::steady_clock::time_point( - std::chrono::milliseconds(create_time_u64)); - } catch (const std::exception& e) { - result.bt.error = - "Failed to read retryable request, create time: {}"_format( - e.what()); - continue; - } - - oxenc::bt_list_consumer node_list("l"); // Dummy list - try { - auto [key, list] = request_dict.next_list_consumer(); - assert(key == NODES_KEY); - node_list = std::move(list); - } catch (const std::exception& e) { - result.bt.error = - "Failed to read retryable request, node list: {}"_format(e.what()); - continue; - } - - while (result.bt.error.empty() && !node_list.is_finished()) { - auto node_dict = node_list.consume_dict_consumer(); - RequestRetryEntry node = {}; - try { - std::string_view key_bytes = - node_dict.require(KEY_KEY); - node.key = crypto::legacy_pubkey::from_bytes(key_bytes); - } catch (const std::exception& e) { - result.bt.error = - "Failed to parse retryable request node key: {}"_format( - e.what()); - continue; - } - - try { - uint64_t deadline_u64 = node_dict.require(DEADLINE_KEY); - node.deadline = std::chrono::steady_clock::time_point( - std::chrono::milliseconds(deadline_u64)); - } catch (const std::exception& e) { - result.bt.error = - "Failed to parse retryable request node deadline: {}"_format( - e.what()); - continue; - } - - try { - uint64_t next_retry_delay_u64 = - node_dict.require(NEXT_RETRY_DELAY_KEY); - node.next_retry_delay = std::chrono::milliseconds(next_retry_delay_u64); - } catch (const std::exception& e) { - result.bt.error = - "Failed to parse retryable request next retry delay: {}"_format( - e.what()); - continue; - } - - try { - uint32_t reason_u32 = node_dict.require(REASON_KEY); - node.reason = 
static_cast(reason_u32); - } catch (const std::exception& e) { - result.bt.error = - "Failed to parse retryable request reason {}"_format(e.what()); - continue; - } - - request.nodes.emplace_back(std::move(node)); - } - result.retryable_requests.emplace_back(std::move(request)); - } - } - } - result.bt.success = result.bt.error.empty(); - } - return result; -} - SerialiseSwarmsResult ServiceNode::serialize_swarms( Serialise serialise, std::string_view read_data) const { SerialiseSwarmsResult result = {}; @@ -402,53 +222,15 @@ ServiceNode::ServiceNode( bool skip_bootstrap) : force_start_{force_start}, skip_bootstrap_{skip_bootstrap}, + db{std::make_unique(db_location)}, our_keys_{keys}, our_contact_{contact}, network_{*omq_server}, + swarm_{network_, our_keys_.pub, *db}, omq_server_{omq_server}, - all_stats_{*omq_server}, - db{std::make_unique(db_location)} { + all_stats_{*omq_server} { mq_servers_.push_back(&omq_server); - std::string swarms_blob = db->runtime_state_blob(BlobType::Swarms, Serialise::Read, ""); - SerialiseSwarmsResult swarm_result = serialize_swarms(Serialise::Read, swarms_blob); - if (swarm_result.bt.success) { - last_swarms_serialize_hash = fnv1a64_hasher(swarms_blob, FNV1A64_SEED); - swarm_.members_ = std::move(swarm_result.swarm_members); - network_.swarms_ = std::move(swarm_result.network_swarms); - swarm_.cur_swarm_id_ = swarm_result.swarm_cur_swarm_id; - } else { - log::error(logcat, "Deserialising of swarms failed: {}", swarm_result.bt.error); - swarms_blob.clear(); - } - - std::string retryable_blob = - db->runtime_state_blob(BlobType::RetryableRequests, Serialise::Read, ""); - SerialiseRetryableRequestsResult retryable_result = - serialize_retryable_requests(Serialise::Read, retryable_blob, {}); - if (retryable_result.bt.success) { - last_retryable_serialize_hash = fnv1a64_hasher(retryable_blob, FNV1A64_SEED); - retryable_requests = std::move(retryable_result.retryable_requests); - } else { - log::error( - logcat, - "Deserialising of 
retryable requests failed: {}", - retryable_result.bt.error); - retryable_blob.clear(); - } - - log::info( - logcat, - "Loaded {} ({}) swarms (#{:x}; in swarm {:x} w/ {} members) and {} ({}) retryable " - "requests from disk. Requesting initial swarm state", - network_.swarms_.size(), - util::get_human_readable_bytes(swarms_blob.size()), - last_swarms_serialize_hash, - swarm_.cur_swarm_id_, - swarm_.members_.size(), - retryable_requests.size(), - util::get_human_readable_bytes(retryable_blob.size())); - // Check if the DB was empty and remember if so for later when talking to swarm members on // handshake that we need to request a DB dump from them to populate our DB. In the edge case // where there _are_ 0 messages, this will request a DB dump of 0 messages and essentially @@ -606,12 +388,6 @@ static std::optional parse_swarm_update( return maybe_bu; } -void ServiceNode::add_retryable_request(RequestRetry&& item) { - std::unique_lock lock{retryable_requests_mutex}; - retryable_requests.emplace_back(item); - retryable_requests_cv.notify_all(); // Wake up retry thread -} - void ServiceNode::register_mq_server(server::MQBase* server) { mq_servers_.push_back(server); } @@ -810,34 +586,6 @@ struct LookupRetryIndexes { std::optional node_index; }; -static LookupRetryIndexes lookup_retry_indexes( - std::span retryable_requests, - uint64_t request_hash, - const crypto::legacy_pubkey& key) { - LookupRetryIndexes result = {}; - - // Find the retry request - for (size_t index = 0; index < retryable_requests.size(); index++) { - if (retryable_requests[index].hash == request_hash) { - result.retryable_index = index; - break; - } - } - - // Find the matching node inside the retry request - if (result.retryable_index) { - const RequestRetry& request = retryable_requests[*result.retryable_index]; - for (size_t index = 0; index < request.nodes.size(); index++) { - if (request.nodes[index].key == key) { - result.node_index = index; - break; - } - } - } - - return result; -} - void 
ServiceNode::check_new_members() { for (const auto& pk : swarm_.extract_contact_pending_members()) { auto c = network_.contacts.find(pk); @@ -1732,13 +1480,46 @@ void ServiceNode::process_push_batch(std::string_view blob, std::string_view sen log::trace(logcat, "Saving all: end"); } -void ServiceNode::retryable_requests_thread_entry_point() { - // The min and max amount of time this node will backoff between failed retry requests - constexpr auto MIN_RETRY_DELAY = 1s; - constexpr auto MAX_RETRY_DELAY = 60s; - constexpr auto RETRY_BACKOFF_COEFF = 1.75f; +void ServiceNode::check_retry_requests() { + db->foreach_ready_retry_request([this](const crypto::legacy_pubkey& key, const std::string& cmd, const std::string& payload, int64_t req_id) { + //FIXME: non-swarm-member retries should be purged automatically + //std::optional is_member = swarm_.is_member(key); + + crypto::x25519_pubkey pubkey_x25519 = {}; + + auto ct = contacts().find(key); + if (ct && *ct) + pubkey_x25519 = ct->pubkey_x25519; + + if (pubkey_x25519) { + auto on_request_done = [this, req_id](bool success, std::vector parts) { + // We cleanup the request in all situations except timeout (timeout + // indicating that the node was non-responsive, maybe offline). In an error + // state we don't know what state the recipient's storage server is in and + // we default to deleting it and ending the retry attempts. + rpc::SNStorageCCResult store_result = + rpc::interpret_sn_storage_cc_response_parts(success, parts); + if (store_result.status != rpc::SNStorageCCResultStatus::Timeout) { + db->remove_node_retry_request(req_id); + } + }; + omq_server()->request( + pubkey_x25519.view(), + "sn.storage_cc", + on_request_done, + cmd, + payload, + oxenmq::send_option::request_timeout{5s}); + } + }); +} +void ServiceNode::retryable_requests_thread_entry_point() { while (!shutting_down_) { + // FIXME: is this extra wakeup necessary/useful? 
If a retry is pending, the initial + // request must have timed out (5 seconds), so presumably just checking on retries + // every 5 seconds (maybe slightly more frequently?) should be fine. + // // At longest, we timeout on the blocking sleep every 5s, or, as soon as someone wakes up // the thread by notifying the condition var // - when a new retryable request is added @@ -1752,259 +1533,9 @@ void ServiceNode::retryable_requests_thread_entry_point() { retryable_requests_cv.wait_until(lock, earliest_deadline); if (shutting_down_) - continue; - - // Log the current retries - auto now = std::chrono::steady_clock::now(); - if (log::Level level = log::Level::debug; - log::get_level(logcat) <= level && retryable_requests.size()) { - - size_t due_requests = 0; - size_t total_requests = 0; - fmt::memory_buffer trace_buffer; - for (size_t index = 0; index < retryable_requests.size(); index++) { - const auto& item = retryable_requests[index]; - auto item_age = - std::chrono::duration_cast(now - item.create_time); - if (item_age >= rpc::TTL_MAXIMUM_PRIVATE) - continue; - - if (log::get_level(logcat) <= log::Level::trace) { - fmt::format_to( - std::back_inserter(trace_buffer), - "{} [{}] '{}' command {} to {} node(s)", - index ? 
"\n" : "", - index, - item.cmd, - util::get_human_readable_bytes(item.req_payload.size()), - item.nodes.size()); - } - - for (size_t node_index = 0; node_index < item.nodes.size(); node_index++) { - const auto& node_item = item.nodes[node_index]; - bool is_due = now >= node_item.deadline; - due_requests += is_due; - - if (log::get_level(logcat) <= log::Level::trace) { - if (node_index == 0) - fmt::format_to(std::back_inserter(trace_buffer), "\n NODES"); - - std::string_view reason = ""; - switch (node_item.reason) { - case RetryReason::NON_CONTACTABLE: reason = "non-contactable"; break; - case RetryReason::FAILED_TO_SEND: reason = "failed to send"; break; - } - - std::string deadline = "now"; - if (!is_due) { - auto delta = node_item.deadline - now; - deadline = "in {}"_format( - std::chrono::duration_cast(delta)); - } - - fmt::format_to( - std::back_inserter(trace_buffer), - "\n {}: {} ({}) retrying {}", - index, - node_item.key, - reason, - deadline); - } - } - - total_requests += item.nodes.size(); - } - - log::log( - logcat, - level, - "Attempting {}/{} retryable requests", - due_requests, - total_requests); - - if (log::get_level(logcat) <= log::Level::trace) - log::trace(logcat, "Retryables:\n{}", fmt::to_string(trace_buffer)); - } - - for (auto it = retryable_requests.begin(); it != retryable_requests.end();) { - // Create a hash of the inputs so that we can match dispatched requests easily with the - // originating retry item. 
- if (it->hash == 0) { - it->hash = FNV1A64_SEED; - it->hash = fnv1a64_hasher(it->cmd, it->hash); - it->hash = fnv1a64_hasher(it->req_payload, it->hash); - } - - auto it_age = std::chrono::duration_cast(now - it->create_time); - if (it_age >= rpc::TTL_MAXIMUM_PRIVATE) { - log::debug(logcat, "Retry request ({}) expired after {}", it->cmd, it_age); - it->nodes.clear(); - } - - for (auto node_it = it->nodes.begin(); node_it != it->nodes.end();) { - auto on_request_done = [MIN_RETRY_DELAY, - MAX_RETRY_DELAY, - this, - hash = it->hash, - key = node_it->key]( - bool success, std::vector parts) { - std::unique_lock lock{retryable_requests_mutex}; - - // Lookup the originating retry-request responsible for this OMQ response - LookupRetryIndexes lookup = lookup_retry_indexes(retryable_requests, hash, key); - if (!lookup.retryable_index) - return; - - RequestRetry& request = retryable_requests[*lookup.retryable_index]; - if (lookup.node_index) { - RequestRetryEntry& node = request.nodes[*lookup.node_index]; - node.retry_underway = false; - - // We cleanup the request in all situations except timeout (timeout - // indicating that the node was non-responsive, maybe offline). In an error - // state we don't know what state the recipient's storage server is in and - // we default to deleting it and ending the retry attempts. 
- rpc::SNStorageCCResult store_result = - rpc::interpret_sn_storage_cc_response_parts(success, parts); - bool cleanup = store_result.status != rpc::SNStorageCCResultStatus::Timeout; - - if (cleanup) { - std::string_view outcome = "succeeded"; - if (store_result.status != rpc::SNStorageCCResultStatus::Good) - outcome = "failed unrecoverably"; - - log::debug( - logcat, - "Retry to {} for {} ({}) {}, cleaning up", - key, - request.cmd, - util::get_human_readable_bytes(request.req_payload.size()), - outcome); - - request.nodes.erase(request.nodes.begin() + *lookup.node_index); - } else { - // Extend the next retry deadline and re-attempt later - node.next_retry_delay = std::max( - node.next_retry_delay, - std::chrono::milliseconds(MIN_RETRY_DELAY)); - - size_t delay_ms = std::chrono::duration_cast( - node.next_retry_delay) - .count(); - delay_ms *= RETRY_BACKOFF_COEFF; - node.next_retry_delay = std::min( - std::chrono::milliseconds(delay_ms), - std::chrono::milliseconds(MAX_RETRY_DELAY)); - node.deadline = - std::chrono::steady_clock::now() + node.next_retry_delay; - - // Wake up retryable request thread, it will take into consideration the - // new deadline for the blocking sleep - retryable_requests_cv.notify_all(); - - log::debug( - logcat, - "Retry to {} for {} ({}) timed out, next attempt in ~{}", - key, - request.cmd, - util::get_human_readable_bytes(request.req_payload.size()), - node.next_retry_delay); - } - } - - // Remove retryable request if there are no more nodes to retry to - if (request.nodes.empty()) - retryable_requests.erase( - retryable_requests.begin() + *lookup.retryable_index); - }; - - std::optional is_member = swarm_.is_member(node_it->key); - if (is_member && !node_it->retry_underway) { - // Retry request if ready - bool is_due = now >= node_it->deadline; - bool ready = is_member->status == SwarmMemberStatus::Ready; - crypto::x25519_pubkey pubkey_x25519 = {}; - - if (ready) { - auto ct = contacts().find(node_it->key); - if (ct && *ct) - 
pubkey_x25519 = ct->pubkey_x25519; - } - - if (pubkey_x25519) { - if (is_due) { - node_it->retry_underway = true; - omq_server()->request( - pubkey_x25519.view(), - "sn.storage_cc", - on_request_done, - it->cmd, - it->req_payload, - oxenmq::send_option::request_timeout{5s}); - } else { - earliest_deadline = std::min(earliest_deadline, node_it->deadline); - } - } - - if (!ready) { - log::debug( - logcat, - "Retry to {} ({}) deferred, member hasn't signaled 'data ready' " - "(was {})", - node_it->key, - it->cmd, - static_cast(is_member->status)); - } else if (!pubkey_x25519) { - log::debug( - logcat, - "Retry to {} ({}) deferred, contact info missing", - node_it->key, - it->cmd); - } - } - - if (is_member) { - node_it++; - } else { - log::debug( - logcat, - "Retry to {} ({}) cancelled, not a member in swarm anymore", - node_it->key, - it->cmd); - node_it = it->nodes.erase(node_it); - } - } - - if (it->nodes.empty()) - it = retryable_requests.erase(it); - else - it++; - } + break; - SerialiseRetryableRequestsResult write = - serialize_retryable_requests(Serialise::Write, "", retryable_requests); - if (write.bt.success) { - uint64_t hash = fnv1a64_hasher(write.bt.write_payload, FNV1A64_SEED); - if (last_retryable_serialize_hash != hash) { - log::debug( - logcat, - "Retryable requests dirtied #{:x} => #{:x}, saving {} to DB", - last_retryable_serialize_hash, - hash, - util::get_human_readable_bytes(write.bt.write_payload.size())); - last_retryable_serialize_hash = hash; - db->runtime_state_blob( - BlobType::RetryableRequests, Serialise::Write, write.bt.write_payload); - } - } else { - if (static bool once = true; once) { - once = false; - log::error( - logcat, - "Failed to serialize retryable requests to blob: {}", - write.bt.write_payload); - } - } + check_retry_requests(); } } } // namespace oxenss::snode diff --git a/oxenss/snode/service_node.h b/oxenss/snode/service_node.h index d7fbff355..515a01839 100644 --- a/oxenss/snode/service_node.h +++ 
b/oxenss/snode/service_node.h @@ -60,8 +60,6 @@ inline constexpr hf_revision STORAGE_SERVER_HARDFORK = {19, 6}; inline constexpr std::array NEW_SWARM_MEMBER_HANDSHAKE_VERSION = {2, 10, 0}; inline constexpr std::array SN_DATA_READY_WITH_REQUEST_VERSION = {2, 11, 0}; -class Swarm; - constexpr std::string_view to_string(SnodeStatus status) { switch (status) { case SnodeStatus::UNSTAKED: return "Unstaked"sv; @@ -114,6 +112,13 @@ class ServiceNode { std::string block_hash_; std::weak_ptr http_; + public: + + // bit messy, but Swarm needs db startup version, so db has to init before Swarm + std::unique_ptr db; + + private: + SnodeStatus status_ = SnodeStatus::UNKNOWN; const crypto::legacy_keypair our_keys_; @@ -121,7 +126,7 @@ class ServiceNode { Network network_; - Swarm swarm_{network_, our_keys_.pub}; + Swarm swarm_; server::OMQ& omq_server_; std::vector mq_servers_; @@ -143,10 +148,6 @@ class ServiceNode { // Lock to be taken when interacting with the 'retryable_requests' queue mutable std::mutex retryable_requests_mutex; - // List of requests that will be re-attempted periodically through the - // 'retryable_requests_thread' - std::vector retryable_requests; - std::thread retryable_requests_thread; // The hash of the last swarms blob that was serialised, used for dirty checks before storing to @@ -220,8 +221,6 @@ class ServiceNode { SerialiseSwarmsResult serialize_swarms(Serialise serialise, std::string_view read_data) const; - std::unique_ptr db; - const Network& network() { return network_; } const Swarm& swarm() { return swarm_; } @@ -232,9 +231,6 @@ class ServiceNode { const contact& own_address() { return our_contact_; } - // Enqueue a request to be re-attempted - void add_retryable_request(RequestRetry&& item); - // Adds a MQ server, i.e. QUIC. The OMQ server is added automatically during construction and // should not be added. 
void register_mq_server(server::MQBase* server); @@ -334,6 +330,8 @@ class ServiceNode { std::condition_variable retryable_requests_cv; void retryable_requests_thread_entry_point(); + + void check_retry_requests(); }; struct DataReadyRequest { diff --git a/oxenss/snode/swarm.cpp b/oxenss/snode/swarm.cpp index d028b0cd6..1c02f6fd1 100644 --- a/oxenss/snode/swarm.cpp +++ b/oxenss/snode/swarm.cpp @@ -180,12 +180,14 @@ SwarmEvents Swarm::update_swarms( // from them to ensure we have all the messages they have that we don't. for (auto it : events.new_swarm_members) { auto& pair = members_[it]; - if (oxenss::tmp_init_db_version == 1) { + if (!did_startup_version_check && _db.startup_version() == 1) { if (pair.our_ss_requested_db_dump == SwarmRequestedDBDump::Nil) pair.our_ss_requested_db_dump = SwarmRequestedDBDump::NeedsToRequest; } } + did_startup_version_check = true; + // If the DB was empty on startup then we mark all swarm members as peers that we need to // request a DB dump from. Note we only do this if the swarm matches the initial swarm we // were in when the DB was queried. We might have changed swarms since startup, in which @@ -204,8 +206,6 @@ SwarmEvents Swarm::update_swarms( } } - oxenss::tmp_init_db_version = 1; // Disable after the first swarm update - cur_swarm_id_ = events.our_swarm_id; network.update_swarms(std::move(swarms), new_contacts); diff --git a/oxenss/snode/swarm.h b/oxenss/snode/swarm.h index a6eade761..65598a60a 100644 --- a/oxenss/snode/swarm.h +++ b/oxenss/snode/swarm.h @@ -5,6 +5,7 @@ #include "network.h" #include "oxenss/crypto/keys.h" +#include "oxenss/storage/database.hpp" namespace oxenss::snode { @@ -91,9 +92,13 @@ class Swarm { // Flag that stops the DB initially empty w/ swarm ID from executing more than once. 
bool db_was_initially_empty_handled = false; + Database& _db; + + bool did_startup_version_check = false; + public: - Swarm(Network& network, const crypto::legacy_pubkey& our_pk) : - network{network}, our_pk{our_pk} {} + Swarm(Network& network, const crypto::legacy_pubkey& our_pk, Database& db) : + _db(db), network{network}, our_pk{our_pk} {} ~Swarm(); diff --git a/oxenss/storage/database.cpp b/oxenss/storage/database.cpp index a820cf040..f657016a9 100644 --- a/oxenss/storage/database.cpp +++ b/oxenss/storage/database.cpp @@ -18,6 +18,7 @@ #include #include #include +#include "oxenss/crypto/keys.h" #include #include @@ -283,7 +284,7 @@ class DatabaseImpl { } void initialize_database() { - tmp_init_db_version = db.execAndGet("PRAGMA user_version").getInt(); + parent._startup_version = db.execAndGet("PRAGMA user_version").getInt(); if (!db.tableExists("owners")) { create_schema(); @@ -334,14 +335,71 @@ CREATE TRIGGER IF NOT EXISTS revoked_autoclean )"); } - if (!db.tableExists("runtime_state")) { - log::info(logcat, "Upgrading database schema: adding runtime_state"); + // use version for schema changes from now + if (parent._startup_version == 0) { + log::info(logcat, + "Upgrading database schema: adding runtime state and retryable requests"); db.exec(R"( -CREATE TABLE runtime_state ( - swarms_blob BLOB, - retryable_requests_blob BLOB +CREATE TABLE retry_requests ( + id INTEGER PRIMARY KEY, + command TEXT NOT NULL, + payload BLOB NOT NULL, + created DOUBLE PRECISION NOT NULL DEFAULT (unixepoch('now', 'subsec')) +); +CREATE TABLE retry_pubkeys ( + id INTEGER PRIMARY KEY, + pubkey BLOB NOT NULL, + UNIQUE(pubkey) +); +CREATE TABLE retry_node_requests ( + id INTEGER PRIMARY KEY, + rr_id INTEGER NOT NULL REFERENCES retry_requests(id) ON DELETE CASCADE, + pk_id INTEGER NOT NULL REFERENCES retry_pubkeys(id) ON DELETE CASCADE, + next_retry DOUBLE PRECISION NOT NULL, + UNIQUE(rr_id, pk_id) +); +CREATE INDEX retry_node_requests_pk_idx ON retry_node_requests(pk_id); + +CREATE 
VIEW retry_node_reqs AS + SELECT retry_node_requests.id, retry_requests.command, retry_reqeusts.payload, retry_pubkeys.pubkey, next_retry + FROM retry_node_requests JOIN retry_requests ON rr_id = retry_requests.id JOIN retry_pubkeys ON pk_id = retry_pubkeys.id; + +CREATE TRIGGER retry_node_add +INSTEAD OF INSERT ON retry_node_reqs +BEGIN + -- Allows insertion into the view (with the raw pubkey value) to automatically do the pubkey + -- lookup (with autovivification) for you. + INSERT INTO retry_pubkeys (pubkey) VALUES (NEW.pubkey) ON CONFLICT(pubkey) DO NOTHING; + INSERT INTO retry_node_requests (rr_id, pk_id, next_retry) + VALUES (NEW.rr_id, (SELECT id FROM retry_pubkeys WHERE pubkey = NEW.pubkey), NEW.next_retry); +END; + +CREATE TRIGGER rr_cleanup +AFTER DELETE ON retry_node_requests +BEGIN + -- After deleting a node request record this trigger handles cleaning up any pubkeys or request + -- commands that are no longer referenced. + DELETE FROM retry_pubkeys + WHERE id = OLD.pk_id + AND NOT EXISTS ( + SELECT 1 FROM retry_node_requests WHERE pk_id = OLD.pk_id + ); + DELETE FROM retry_requests + WHERE id = OLD.rr_id + AND NOT EXISTS ( + SELECT 1 FROM retry_node_requests WHERE rr_id = OLD.rr_id + ); +END; + +-- Generic key->value store for the database +-- in future, we may explicitly require TEXT for keys, but arbitrary type for values. +-- store arbitrary persistent state, e.g. 
which swarm were we in before restart +CREATE TABLE state_kv ( + key TEXT NOT NULL, + value TEXT, + UNIQUE(key) ); -INSERT INTO runtime_state VALUES (null, null); + PRAGMA user_version = 1; )"); } @@ -1236,4 +1294,52 @@ std::string Database::runtime_state_blob( } return result; } + +int64_t Database::add_retry_request(const crypto::legacy_pubkey& key, const std::string& cmd, const std::string& payload, int64_t req_id) { + auto impl = get_impl(/*write =*/ true); + + // insert into request table if not present + if (req_id == 0) { + req_id = impl->prepared_get( + "INSERT INTO retry_requests (command, payload) values (?,?) RETURNING id", + cmd, + payload + ); + } + + // first retry 5 seconds after insertion, subsequent retries will be 60 seconds after the last. + impl->prepared_exec("INSERT INTO retry_node_reqs (rr_id, pubkey, next_retry) VALUES(?, ?, unixepoch('now', 'subsec') + 5);", req_id, key.str()); + + return req_id; +} + +void Database::foreach_ready_retry_request(std::function callback) { + auto impl = get_impl(/*write =*/ true); + + auto stmt = impl->prepared_st( + "SELECT * from retry_node_reqs WHERE next_retry < unixepoch('now', 'subsec')"); + + using sql_duration = std::chrono::duration>; + double now = std::chrono::duration_cast(std::chrono::system_clock::now().time_since_epoch()).count(); + + // retry 60 seconds after this retry. Initial retries are staggered (5sec after timeout), + // but it doesn't seem useful to stagger here. Further, it would be a pain to do so after + // restart. Could update this time if/when the retry fails, but here seems more convenient. 
+ auto next_time = now + 60; + while (stmt->executeStep()) { + auto [req_id, key_str, cmd, payload, next_retry] = + get(stmt); + auto key = crypto::legacy_pubkey::from_bytes(key_str); + impl->prepared_exec("UPDATE retry_nodes_requests SET next_retry = ?", next_time); + + callback(key, cmd, payload, req_id); + } + +} + +void Database::remove_node_retry_request(int64_t req_id) { + auto impl = get_impl(/*write =*/ true); + impl->prepared_exec("DELETE FROM retry_node_reqs WHERE id = ?", req_id); +} + } // namespace oxenss diff --git a/oxenss/storage/database.hpp b/oxenss/storage/database.hpp index 8a3e12763..7fef8ea78 100644 --- a/oxenss/storage/database.hpp +++ b/oxenss/storage/database.hpp @@ -17,6 +17,7 @@ #include #include #include +#include "oxenss/crypto/keys.h" namespace oxenss { @@ -57,6 +58,9 @@ class Database { // keep track of db full errors so we don't print them on every store std::atomic db_full_counter = 0; + // database version at startup (before any migration/upgrade) + int _startup_version = 0; + public: // Recommended period for calling clean_expired() static constexpr auto CLEANUP_PERIOD = 10s; @@ -69,6 +73,8 @@ class Database { ~Database(); + int startup_version() const { return _startup_version; } + // if the database is full then print an error only once ever N errors static constexpr int DB_FULL_FREQUENCY = 100; @@ -226,6 +232,19 @@ class Database { std::string runtime_state_blob( BlobType type, Serialise serialise, const std::string& write_blob); + + // Adds a request retry to the database, to be retried later. If req_id is specified, this + // is a subsequent failure on the same request. It's not great to leak database table indices + // into the rest of the code if avoidable, but deduplication would be otherwise tedious. 
+ int64_t add_retry_request(const crypto::legacy_pubkey& key, const std::string& cmd, const std::string& payload, int64_t req_id = 0); + + // executes the provided callback for each request retry in the database which ready to retry. + // The table id is provided so the callback can call remove_retry_request on success. + void foreach_ready_retry_request(std::function); + + // Remove the specified request retry. This is one node's retry request, not the request + // itself -- if no more nodes need the request retried it will be removed as well. + void remove_node_retry_request(int64_t req_id); }; } // namespace oxenss From 0f91acabff40f6f565ae97a0500c063edc1d1fe7 Mon Sep 17 00:00:00 2001 From: Thomas Winget Date: Tue, 3 Mar 2026 18:23:45 -0500 Subject: [PATCH 35/50] Add swarm space/id cache to database Messages can now be queried based on swarm space, and so by finding the swarm space boundaries for a swarm id we can query all messages for that swarm (as opposed to getting *all* messages into RAM and sorting from there). The swarm space is stored as two 32-bit integers rather than as a single 64-bit integer, as sqlite does not support 64-bit unsigned integers and thus queries wouldn't work properly. This is an unfortunate side effect of using such a large type for swarm space/ids. This commit also caches our swarm id so we can know what it was between restarts. Previously all swarms' makeups were cached, but this did not seem particularly useful -- it couldn't be queried as it was stored as a serialized blob, and it appears the only useful value there is our current swarm id. 
--- oxenss/common/pubkey.cpp | 12 +++ oxenss/common/pubkey.h | 10 +- oxenss/snode/network.cpp | 52 +++++++-- oxenss/snode/network.h | 7 +- oxenss/snode/service_node.cpp | 64 +++++------ oxenss/snode/swarm.cpp | 1 + oxenss/storage/database.cpp | 196 +++++++++++++++++++++++++++------- oxenss/storage/database.hpp | 17 +-- 8 files changed, 268 insertions(+), 91 deletions(-) diff --git a/oxenss/common/pubkey.cpp b/oxenss/common/pubkey.cpp index e433b2b5c..db1c2311d 100644 --- a/oxenss/common/pubkey.cpp +++ b/oxenss/common/pubkey.cpp @@ -1,11 +1,23 @@ #include "pubkey.h" #include "mainnet.h" +#include "oxenc/endian.h" #include #include #include namespace oxenss { +uint64_t pubkey_to_swarm_space(const user_pubkey& pk) { + const auto bytes = pk.raw(); + assert(bytes.size() == 32); + + uint64_t res = 0; + for (size_t i = 0; i < bytes.size(); i += 8) + res ^= oxenc::load_big_to_host(bytes.data() + i); + + return res; +} + user_pubkey& user_pubkey::load(std::string_view pk) { if (pk.size() == USER_PUBKEY_SIZE_HEX && oxenc::is_hex(pk)) { uint8_t netid; diff --git a/oxenss/common/pubkey.h b/oxenss/common/pubkey.h index af59b8b6e..f324fc550 100644 --- a/oxenss/common/pubkey.h +++ b/oxenss/common/pubkey.h @@ -1,5 +1,6 @@ #pragma once +#include #include namespace oxenss { @@ -13,14 +14,14 @@ class user_pubkey { int network_ = -1; std::string pubkey_; - user_pubkey(int network, std::string raw_pk) : network_{network}, pubkey_{std::move(raw_pk)} {} - friend class DatabaseImpl; public: // Default constructor; constructs an invalid pubkey user_pubkey() = default; + user_pubkey(int network, std::string raw_pk) : network_{network}, pubkey_{std::move(raw_pk)} {} + // bool conversion: returns true if this object contains a valid pubkey explicit operator bool() const { return !pubkey_.empty(); } @@ -57,6 +58,11 @@ class user_pubkey { std::string prefixed_raw() const; }; + +/// Maps a pubkey into a 64-bit "swarm space" value; the swarm you belong to is whichever one +/// has a swarm id 
closest to this pubkey-derived value. +uint64_t pubkey_to_swarm_space(const user_pubkey& pk); + } // namespace oxenss namespace std { diff --git a/oxenss/snode/network.cpp b/oxenss/snode/network.cpp index d230ce26f..3d0d28c5b 100644 --- a/oxenss/snode/network.cpp +++ b/oxenss/snode/network.cpp @@ -11,15 +11,45 @@ namespace oxenss::snode { Network::Network(oxenmq::OxenMQ& omq) : contacts{omq} {} -uint64_t Network::pubkey_to_swarm_space(const user_pubkey& pk) { - const auto bytes = pk.raw(); - assert(bytes.size() == 32); +std::pair Network::get_swarm_boundaries(const uint64_t swarm) const { + if (swarms_.size() <= 1) return {0,0}; + + const auto it = swarms_.find(swarm); + if (it == swarms_.end()) + throw std::logic_error{"This function should only be called with a current swarm id."}; + + // FIXME: this logic is a little weird, but should work. + uint64_t prev_swarm, next_swarm; + if (it == swarms_.begin()) { + next_swarm = std::next(it)->first; + prev_swarm = std::prev(swarms_.end())->first; + } + else { + prev_swarm = std::prev(it)->first; + auto it2 = std::next(it); + if (it2 == swarms_.end()) + it2 = swarms_.begin(); + next_swarm = it2->first; + } - uint64_t res = 0; - for (size_t i = 0; i < bytes.size(); i += 8) - res ^= oxenc::load_big_to_host(bytes.data() + i); + // now have target swarm id, the one before it, and the one after it + // + // in the event of a distance tie in swarm space (e.g. id 1 and 7 with swarm space 4), + // the "right" (next) swarm loses. 
This means when querying with what we return here, + // we should do x > lower_bound AND x <= upper_bound + + // if there are only 2 swarms somehow, return the average and the average + 1<<63, + // with the average as the lower bound if target is the larger swarm id + if (prev_swarm == next_swarm) { + uint64_t avg = (swarm + prev_swarm) / 2; + uint64_t shift = (uint64_t)1<<63; + if (swarm > prev_swarm) + return {avg, avg + shift}; + else + return {avg + shift, avg}; + } - return res; + return {(swarm + prev_swarm)/2, (swarm + next_swarm)/2}; } swarms_t::const_iterator Network::_find_swarm_for(const user_pubkey& pk) const { @@ -153,4 +183,12 @@ std::shared_ptr> Network::all_nodes_blob() const { return blob; } +std::set Network::get_all_swarm_ids() const { + std::set ret; + + for (const auto& [id, swarm] : swarms_) + ret.emplace(id); + return ret; +} + } // namespace oxenss::snode diff --git a/oxenss/snode/network.h b/oxenss/snode/network.h index ddb2743c2..1bd7ae67b 100644 --- a/oxenss/snode/network.h +++ b/oxenss/snode/network.h @@ -39,6 +39,7 @@ class Network { friend class ServiceNode; + std::pair get_swarm_boundaries(const uint64_t swarm) const; swarms_t::const_iterator _find_swarm_for(const user_pubkey& pk) const; // Cached value of the all_nodes_blob() return value. The cache is cleared whenever swarms or @@ -62,10 +63,6 @@ class Network { // Holds all current contact information for network nodes. Contacts contacts; - /// Maps a pubkey into a 64-bit "swarm space" value; the swarm you belong to is whichever one - /// has a swarm id closest to this pubkey-derived value. - static uint64_t pubkey_to_swarm_space(const user_pubkey& pk); - // Looks up the swarm for a pubkey and returns the swarm_id. Returns nullopt on error (which // will only happen if there are no swarms at all). 
std::optional get_swarm_id_for(const user_pubkey& pk) const; @@ -99,6 +96,8 @@ class Network { // This value is cached and recomputed whenever swarms or contact info of any active node // changes. std::shared_ptr> all_nodes_blob() const; + + std::set get_all_swarm_ids() const; }; } // namespace oxenss::snode diff --git a/oxenss/snode/service_node.cpp b/oxenss/snode/service_node.cpp index 497e6ac94..dad6137b9 100644 --- a/oxenss/snode/service_node.cpp +++ b/oxenss/snode/service_node.cpp @@ -231,6 +231,10 @@ ServiceNode::ServiceNode( all_stats_{*omq_server} { mq_servers_.push_back(&omq_server); + if (auto id = db->get_current_swarm()) { + swarm_.cur_swarm_id_ = *id; + } + // Check if the DB was empty and remember if so for later when talking to swarm members on // handshake that we need to request a DB dump from them to populate our DB. In the edge case // where there _are_ 0 messages, this will request a DB dump of 0 messages and essentially @@ -689,13 +693,16 @@ void ServiceNode::check_new_members() { } if (auto send_now = swarm_.extract_contacts_needing_db_dump(); !send_now.empty()) { - auto msgs = db->retrieve_all(); log::debug( logcat, - "Initiating swarm message dump ({} message) to swarm member(s): {}", - msgs.size(), + "Initiating swarm message dump to swarm member(s): {}", fmt::join(send_now, ", ")); - relay_messages(std::move(msgs), send_now); + auto boundaries = network_.get_swarm_boundaries(swarm_.cur_swarm_id_); + db->foreach_swarm_message([&send_now, this](const std::vector& messages) { + relay_messages(messages, send_now); + }, + boundaries.first, + boundaries.second); } } @@ -805,8 +812,6 @@ static void store_swarms_blob_if_changed( hash, util::get_human_readable_bytes(serialise_result.bt.write_payload.size())); last_hash = hash; - db.runtime_state_blob( - BlobType::Swarms, Serialise::Write, serialise_result.bt.write_payload); } } else { if (static bool once = true; once) { @@ -1247,36 +1252,31 @@ void ServiceNode::report_reachability( void 
ServiceNode::bootstrap_swarms(const std::set& swarms) const { std::lock_guard guard(sn_mutex_); - if (swarms.empty()) + const std::set* swarms_ptr = &swarms; + std::optional> all_swarms; + + if (swarms.empty()) { log::info(logcat, "Bootstrapping all swarms"); + all_swarms = network_.get_all_swarm_ids(); + if (all_swarms->empty()) { + log::warning(logcat, "Bootstrapping all swarms, but there are none?"); + return; + } + swarms_ptr = &*all_swarms; + } else if (logcat->level() <= log::Level::info) - log::info(logcat, "Bootstrapping swarms: [{}]", fmt::join(swarms, ", ")); - - std::unordered_map pk_swarm_cache; - std::unordered_map> to_relay; - - std::vector all_msgs = db->retrieve_all(); - log::debug(logcat, "We have {} messages", all_msgs.size()); - for (auto& entry : all_msgs) { - if (!entry.pubkey) { - log::error(logcat, "Invalid pubkey in a message while bootstrapping other nodes"); - continue; + log::info(logcat, "Bootstrapping swarms: [{}]", fmt::join(*swarms_ptr, ", ")); + + for (const auto& swarm_id : *swarms_ptr) { + if (auto swarm = network_.get_swarm(swarm_id)) { + auto boundaries = network_.get_swarm_boundaries(swarm_id); + db->foreach_swarm_message([&swarm, this](const std::vector& messages) { + relay_messages(messages, *swarm); + }, + boundaries.first, + boundaries.second); } - - auto [it, ins] = pk_swarm_cache.try_emplace(entry.pubkey); - if (ins) - it->second = network_.get_swarm_id_for(entry.pubkey).value_or(INVALID_SWARM_ID); - auto swarm_id = it->second; - - if (swarms.empty() || swarms.count(swarm_id)) - to_relay[swarm_id].push_back(std::move(entry)); } - - log::trace(logcat, "Bootstrapping {} swarms", to_relay.size()); - - for (const auto& [swarm_id, items] : to_relay) - if (auto swarm = network_.get_swarm(swarm_id)) - relay_messages(items, *swarm); } void ServiceNode::relay_messages( diff --git a/oxenss/snode/swarm.cpp b/oxenss/snode/swarm.cpp index 1c02f6fd1..0806f8220 100644 --- a/oxenss/snode/swarm.cpp +++ b/oxenss/snode/swarm.cpp @@ -207,6 
+207,7 @@ SwarmEvents Swarm::update_swarms( } cur_swarm_id_ = events.our_swarm_id; + _db.update_current_swarm(cur_swarm_id_); network.update_swarms(std::move(swarms), new_contacts); diff --git a/oxenss/storage/database.cpp b/oxenss/storage/database.cpp index f657016a9..3e4d28520 100644 --- a/oxenss/storage/database.cpp +++ b/oxenss/storage/database.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -18,6 +19,8 @@ #include #include #include +#include "oxenc/bt_serialize.h" +#include "oxenc/bt_value.h" #include "oxenss/crypto/keys.h" #include @@ -231,6 +234,31 @@ namespace { } // namespace +user_pubkey load_pubkey(uint8_t type, std::string pk) { return {type, std::move(pk)}; } + +void sqlite_swarm_space(sqlite3_context *sqlite_context, int argc, sqlite3_value **argv, bool hi) { + assert(argc == 1); + auto sz = sqlite3_value_bytes(argv[0]); + assert(sz == 32); + auto* key_blob = sqlite3_value_blob(argv[0]); + auto pubkey = load_pubkey(0 /* irrelevant */, {reinterpret_cast(key_blob), 32}); + auto swarm_space = pubkey_to_swarm_space(pubkey); + + if (hi) swarm_space = swarm_space >> 32; + else swarm_space = swarm_space & 0xffffffff; + + sqlite3_result_int64(sqlite_context, swarm_space); +} + +void sqlite_swarm_space_hi(sqlite3_context *sqlite_context, int argc, sqlite3_value **argv) { + sqlite_swarm_space(sqlite_context, argc, argv, true); +} + +void sqlite_swarm_space_lo(sqlite3_context *sqlite_context, int argc, sqlite3_value **argv) { + sqlite_swarm_space(sqlite_context, argc, argv, false); +} + + class DatabaseImpl { public: oxenss::Database& parent; @@ -246,6 +274,11 @@ class DatabaseImpl { SQLite::OPEN_READWRITE | (initialize ? SQLite::OPEN_CREATE : 0) | SQLite::OPEN_NOMUTEX, SQLite_busy_timeout.count()} { + + // intialize sqlite application-defined functions (must be set up per-connection). 
+ db.createFunction("func_swarm_space_hi", 1, true, nullptr, &sqlite_swarm_space_hi); + db.createFunction("func_swarm_space_lo", 1, true, nullptr, &sqlite_swarm_space_lo); + // Don't fail on these because we can still work even if they fail if (int rc = db.tryExec("PRAGMA journal_mode = WAL"); rc != SQLITE_OK) log::error(logcat, "Failed to set journal mode to WAL: {}", sqlite3_errstr(rc)); @@ -338,7 +371,40 @@ CREATE TRIGGER IF NOT EXISTS revoked_autoclean // use version for schema changes from now if (parent._startup_version == 0) { log::info(logcat, - "Upgrading database schema: adding runtime state and retryable requests"); + "Upgrading database schema: adding swarm space cache, runtime state and retryable requests"); + + // swarm space is 64-bit unsigned, which means unfortunately we can't do queries + // on it with arithmetic properly (sqlite INTEGER is 64-bit signed). As such, we + // store it as two separate columns so we can query on it. + // + // The added trigger will automatically populate these two columns on insert, and the + // existing rows will have these columns populated by the UPDATE query after this. 
+ db.exec(R"( +ALTER TABLE owners ADD COLUMN swarm_space_hi INTEGER NOT NULL DEFAULT -1; +ALTER TABLE owners ADD COLUMN swarm_space_lo INTEGER NOT NULL DEFAULT -1; + +CREATE TRIGGER swarm_space_trigger +BEFORE INSERT ON owners +FOR EACH ROW +WHEN NEW.swarm_space_lo = -1 +BEGIN + INSERT INTO owners (id, type, pubkey, swarm_space_hi, swarm_space_lo) + VALUES (NEW.id, NEW.type, NEW.pubkey, func_swarm_space_hi(NEW.pubkey), func_swarm_space_lo(NEW.pubkey); + + SELECT RAISE(IGNORE); -- skips the original insert since we replaced it +END; + )"); + + db.exec(R"( +UPDATE owners +SET swarm_space_hi = func_swarm_space_hi(pubkey), +swarm_space_lo = func_swarm_space_lo(pubkey) +WHERE swarm_space_hi = -1; + )"); + + auto stmt = prepared_st( + "SELECT * from retry_node_reqs WHERE next_retry < unixepoch('now', 'subsec')"); + db.exec(R"( CREATE TABLE retry_requests ( id INTEGER PRIMARY KEY, @@ -533,6 +599,9 @@ CREATE INDEX IF NOT EXISTS messages_expiry ON messages(expiry); CREATE INDEX IF NOT EXISTS messages_owner ON messages(owner, namespace, timestamp); CREATE INDEX IF NOT EXISTS messages_hash ON messages(hash); +CREATE INDEX IF NOT EXISTS owners_swarm_hi ON owners(swarm_space_hi); +CREATE INDEX IF NOT EXISTS owners_swarm_lo ON owners(swarm_space_lo); + CREATE VIEW IF NOT EXISTS owned_messages AS SELECT owners.id AS oid, type, pubkey, messages.id AS mid, hash, namespace, timestamp, expiry, data FROM messages JOIN owners ON messages.owner = owners.id; @@ -578,8 +647,6 @@ DROP TRIGGER IF EXISTS owned_messages_upsert; auto prepared_get(const std::string& query, const Bind&... 
bind) { return exec_and_get(prepared_st(query), bind...); } - - user_pubkey load_pubkey(uint8_t type, std::string pk) { return {type, std::move(pk)}; } }; Database::Database(std::filesystem::path db_path) : db_path_{std::move(db_path)} { @@ -711,7 +778,7 @@ static std::optional get_message(DatabaseImpl& impl, SQLite::Statement& get( st); msg.emplace( - impl.load_pubkey(otype, std::move(opubkey)), + load_pubkey(otype, std::move(opubkey)), std::move(hash), ns, from_epoch_ms(ts), @@ -721,17 +788,6 @@ static std::optional get_message(DatabaseImpl& impl, SQLite::Statement& return msg; } -std::optional Database::retrieve_random() { - clean_expired(); // *Must* be before the below get_impl because otherwise the read-only impl - // would deadlock with the clean_expired write=true get_impl(). - auto impl = get_impl(false); - auto st = impl->prepared_st( - "SELECT hash, type, pubkey, namespace, timestamp, expiry, data" - " FROM owned_messages " - " WHERE mid = (SELECT id FROM messages ORDER BY RANDOM() LIMIT 1)"); - return get_message(*impl, st); -} - std::optional Database::retrieve_by_hash(const std::string& msg_hash) { auto impl = get_impl(false); auto st = impl->prepared_st( @@ -949,7 +1005,7 @@ std::vector Database::retrieve_all() { get( st); results.emplace_back( - impl->load_pubkey(type, pubkey), + load_pubkey(type, pubkey), std::move(hash), ns, from_epoch_ms(ts), @@ -1271,30 +1327,6 @@ void oxenss::Database::test_suite_block_for(std::chrono::milliseconds duration) std::this_thread::sleep_for(duration); } -std::string Database::runtime_state_blob( - BlobType type, Serialise serialise, const std::string& write_blob) { - std::string_view key = {}; - switch (type) { - case BlobType::Swarms: key = "swarms_blob"; break; - case BlobType::RetryableRequests: key = "retryable_requests_blob"; break; - } - - std::string result; - auto impl = get_impl(serialise == Serialise::Write); - if (serialise == Serialise::Read) { - auto stmt = impl->prepared_st("SELECT {} FROM 
runtime_state LIMIT 1"_format(key)); - auto maybe_result = exec_and_maybe_get(stmt); - if (maybe_result) - result = std::move(*maybe_result); - } else { - if (write_blob.size()) { - auto stmt = impl->prepared_st("UPDATE runtime_state SET {} = ?"_format(key)); - exec_query(stmt, write_blob); - } - } - return result; -} - int64_t Database::add_retry_request(const crypto::legacy_pubkey& key, const std::string& cmd, const std::string& payload, int64_t req_id) { auto impl = get_impl(/*write =*/ true); @@ -1337,9 +1369,93 @@ void Database::foreach_ready_retry_request(std::function&)> callback, uint64_t lower_bound, uint64_t upper_bound, bool zero_inclusive) { + + if (lower_bound > upper_bound) { + foreach_swarm_message(callback, lower_bound, std::numeric_limits::max()); + foreach_swarm_message(callback, 0, upper_bound, /*zero_inclusive=*/true); + return; + } + + auto impl = get_impl(/*write =*/ false); + + constexpr size_t batch_size = 100; + + std::optional statement; + + // weird case of their exists exactly one swarm, which should be impossible + if (lower_bound == upper_bound) { + statement = SQLite::Statement{impl->db, + "SELECT type, pubkey, hash, namespace, timestamp, expiry, data" + " FROM owned_messages ORDER BY mid"}; + } + else { + // there's probably a better way to do this, but it should be fine + std::string query = R"( +SELECT type, pubkey, hash, namespace, timestamp, expiry, data +FROM owned_messages ORDER BY mid +JOIN owners ON oid = id +WHERE + )"; + query += R"( + (owners.swarm_space_hi >{0} ?1 OR (owners.swarm_space_hi == ?1 AND owners.swarm_space_lo >{0} ?2)) + AND + (owners.swarm_space_hi <= ?3 OR (owners.swarm_space_hi == ?3 AND owners.swarm_space_lo <= ?4)); + )"_format(zero_inclusive ? 
"=" : ""); + + statement = SQLite::Statement{impl->db, query}; + + int pos = 1; + statement->bind(pos++, (int64_t)(lower_bound >> 32)); + statement->bind(pos++, (int64_t)(lower_bound & 0xffffffff)); + statement->bind(pos++, (int64_t)(upper_bound >> 32)); + statement->bind(pos++, (int64_t)(upper_bound & 0xffffffff)); + } + + auto& st = *statement; + std::vector messages; + while (st.executeStep()) { + auto [type, pubkey, hash, ns, ts, exp, data] = + get( + st); + messages.emplace_back( + load_pubkey(type, pubkey), + std::move(hash), + ns, + from_epoch_ms(ts), + from_epoch_ms(exp), + std::move(data)); + if (messages.size() >= batch_size) { + callback(messages); + messages.clear(); + } + } + if (messages.size()) + callback(messages); +} + void Database::remove_node_retry_request(int64_t req_id) { auto impl = get_impl(/*write =*/ true); impl->prepared_exec("DELETE FROM retry_node_reqs WHERE id = ?", req_id); } +void Database::update_current_swarm(uint64_t swarm_id) { + auto as_hex = oxenc::bt_serialize(swarm_id); + auto impl = get_impl(/*write =*/ true); + impl->prepared_exec("INSERT INTO state_kv (key, value) VALUES ('swarm_id', ?) ON CONFLICT REPLACE;", + as_hex); +} + +std::optional Database::get_current_swarm() { + auto impl = get_impl(/*write =*/ false); + try { + auto as_hex = impl->prepared_get("SELECT value FROM state_kv WHERE key = 'swarm_id';"); + return oxenc::bt_deserialize(as_hex); + } + catch (const std::exception& e) { + return std::nullopt; + } + return std::nullopt; +} + } // namespace oxenss diff --git a/oxenss/storage/database.hpp b/oxenss/storage/database.hpp index 7fef8ea78..a58e6acba 100644 --- a/oxenss/storage/database.hpp +++ b/oxenss/storage/database.hpp @@ -141,9 +141,6 @@ class Database { // bound on actual stored size as there may be partially filled pages. int64_t get_used_bytes(); - // Get random message. Returns nullopt if there are no messages. - std::optional retrieve_random(); - // Get message by `msg_hash`, return true if found. 
Note that this does *not* filter by // pubkey or namespace! std::optional retrieve_by_hash(const std::string& msg_hash); @@ -230,9 +227,6 @@ class Database { std::map get_expiries( const user_pubkey& pubkey, const std::vector& msg_hashes); - std::string runtime_state_blob( - BlobType type, Serialise serialise, const std::string& write_blob); - // Adds a request retry to the database, to be retried later. If req_id is specified, this // is a subsequent failure on the same request. It's not great to leak database table indices // into the rest of the code if avoidable, but deduplication would be otherwise tedious. @@ -242,9 +236,20 @@ class Database { // The table id is provided so the callback can call remove_retry_request on success. void foreach_ready_retry_request(std::function); + // executes the provided callback for every swarm message (in batches) for the swarm with the + // given swarm space boundaries. The lower bound is exclusive; the upper inclusive. + // if the lower bound is higher than the upper bound (i.e. overflow wrapping), will be called + // recursively on both sides of the overflow. In this case, zero as the lower bound *will* + // be inclusive + void foreach_swarm_message(std::function&)> callback, uint64_t lower_bound, uint64_t upper_bound, bool zero_inclusive=false); + // Remove the specified request retry. This is one node's retry request, not the request // itself -- if no more nodes need the request retried it will be removed as well. 
void remove_node_retry_request(int64_t req_id); + + void update_current_swarm(uint64_t swarm_id); + + std::optional get_current_swarm(); }; } // namespace oxenss From def6b8ecf8bd131826479305adcf358fd1554cab Mon Sep 17 00:00:00 2001 From: Thomas Winget Date: Wed, 4 Mar 2026 01:27:07 -0500 Subject: [PATCH 36/50] remove dead code and C++-ify some C-esque code --- oxenss/common/serialize.h | 27 ---- oxenss/server/omq.cpp | 26 ++-- oxenss/server/omq.h | 1 - oxenss/snode/service_node.cpp | 225 ++++------------------------------ oxenss/snode/service_node.h | 22 +--- oxenss/storage/database.cpp | 13 +- oxenss/storage/database.hpp | 3 +- 7 files changed, 40 insertions(+), 277 deletions(-) delete mode 100644 oxenss/common/serialize.h diff --git a/oxenss/common/serialize.h b/oxenss/common/serialize.h deleted file mode 100644 index be3155b0e..000000000 --- a/oxenss/common/serialize.h +++ /dev/null @@ -1,27 +0,0 @@ -#pragma once - -#include -#include -#include - -namespace oxenss { -enum class Serialise { - Read, - Write, -}; - -struct SerialiseBTResult { - bool success; - std::string write_payload; - std::string error; -}; - -constexpr uint64_t FNV1A64_SEED = 14695981039346656037ULL; - -inline uint64_t fnv1a64_hasher(std::string_view bytes, uint64_t hash) { - for (size_t i = 0; i < bytes.size(); i++) - hash = (bytes[i] ^ hash) * 1099511628211 /*FNV Prime*/; - return hash; -} - -}; // namespace oxenss diff --git a/oxenss/server/omq.cpp b/oxenss/server/omq.cpp index cec62a9b7..84b6058b9 100644 --- a/oxenss/server/omq.cpp +++ b/oxenss/server/omq.cpp @@ -69,28 +69,24 @@ void OMQ::handle_sn_data_ready(oxenmq::Message& message) { if (message.data.empty()) return message.send_reply("Request payload missing"); - snode::SerialiseDataReadyRequestResult deserialised = - snode::serialise_data_ready_request(Serialise::Read, message.data[0], {}); - if (!deserialised.bt.success) - return message.send_reply("Request payload malformed {}"_format(deserialised.bt.error)); + bool 
needs_db_dump{false}; + try { + needs_db_dump = snode::deserialise_data_ready_request(message.data[0]); + } + catch (const std::exception& e) { + log::info(logcat, "DataReadyRequest deserialization error: {}", e.what()); + return message.send_reply("Request payload malformed."); + } - const snode::DataReadyRequest& request = deserialised.request; - if (request.needs_db_dump) + if (needs_db_dump) service_node_->set_member_needs_db_dump(crypto::legacy_pubkey{ct->pubkey_ed25519}); if (log::get_level(logcat) <= log::Level::debug) { - std::string label; - if (deserialised.bt.success) - label = "rejected, bad request payload. {})"_format(deserialised.bt.error); - else - label = "rejected due to bad request args"; - log::debug( logcat, - "sn.data ready processed (edpk: {}, db dump: {}): {}", + "sn.data ready processed (edpk: {}, needs db dump: {})", ct->pubkey_ed25519, - request.needs_db_dump, - label); + needs_db_dump); } } diff --git a/oxenss/server/omq.h b/oxenss/server/omq.h index 7e3e947dc..2385cf784 100644 --- a/oxenss/server/omq.h +++ b/oxenss/server/omq.h @@ -10,7 +10,6 @@ #include #include #include -#include namespace oxenss { diff --git a/oxenss/snode/service_node.cpp b/oxenss/snode/service_node.cpp index dad6137b9..04669deca 100644 --- a/oxenss/snode/service_node.cpp +++ b/oxenss/snode/service_node.cpp @@ -48,169 +48,30 @@ constexpr auto OXEND_PING_INTERVAL = 30s; // swarm members and propagate a DB dump if necessary. 
constexpr auto NEW_SWARM_MEMBER_INTERVAL = 10s; -struct SerialiseRetryableRequestsResult { - SerialiseBTResult bt; - std::vector retryable_requests; -}; - -SerialiseDataReadyRequestResult serialise_data_ready_request( - Serialise serialise, std::string_view read_data, const DataReadyRequest& write_data) { - SerialiseDataReadyRequestResult result = {}; - uint32_t version = 0; +// TODO: if these *are* going to be named constants rather than just existing in 2 places +// (where this is serialized and where it is deserialized), they should live in the header +// or something. +namespace data_ready_req { constexpr std::string_view VERSION_KEY = "@"; constexpr std::string_view STATUS_KEY = "s"; constexpr std::string_view NEED_DB_DUMP_KEY = "t"; - static_assert(VERSION_KEY < STATUS_KEY); - static_assert(STATUS_KEY < NEED_DB_DUMP_KEY); - - if (serialise == Serialise::Write) { - oxenc::bt_dict_producer d; - d.append(VERSION_KEY, version); - d.append(NEED_DB_DUMP_KEY, write_data.needs_db_dump); - result.bt.write_payload = d.view(); - result.bt.success = result.bt.error.empty(); - } else { - if (read_data.size()) { - oxenc::bt_dict_consumer d{read_data}; - try { - version = d.require(VERSION_KEY); - } catch (const std::exception& e) { - result.bt.error = - "Failed to parse sn data ready request version: {}"_format(e.what()); - } - - if (result.bt.error.empty()) { - try { - result.request.needs_db_dump = d.require(NEED_DB_DUMP_KEY); - } catch (const std::exception& e) { - result.bt.error = - "Failed to parse sn data ready db dump flag: {}"_format(e.what()); - } - } - } else { - result.bt.error = "Failed to parse data ready payload: no bytes given"; - } - - result.bt.success = result.bt.error.empty(); - } - return result; -} - -SerialiseSwarmsResult ServiceNode::serialize_swarms( - Serialise serialise, std::string_view read_data) const { - SerialiseSwarmsResult result = {}; - - constexpr std::string_view VERSION_KEY = "@"; - constexpr std::string_view NETWORK_SWARMS_KEY = 
"network.swarms"; - constexpr std::string_view SWARM_CUR_SWARM_ID = "swarm.cur_swarm_id"; - constexpr std::string_view SWARM_MEMBERS_KEY = "swarm.members"; +} // namespace data_ready_req +std::string serialise_data_ready_request(bool needs_db_dump) { uint32_t version = 0; - if (serialise == Serialise::Write) { - oxenc::bt_dict_producer d; - d.append(VERSION_KEY, version); - - { - oxenc::bt_list_producer network_swarm_list = d.append_list(NETWORK_SWARMS_KEY); - for (auto it : network_.swarms_) { - auto swarm = network_swarm_list.append_list(); - swarm.append(it.first); // swarm_id_t - - { // Append list of pubkeys for this swarm - for (const crypto::legacy_pubkey& pk : it.second) - swarm.append(pk.view()); - } - } - } - - d.append(SWARM_CUR_SWARM_ID, swarm_.cur_swarm_id_); - - { // Append list of _our_ swarm members - oxenc::bt_list_producer swarm_member_list = d.append_list(SWARM_MEMBERS_KEY); - for (auto it : swarm_.members_) - swarm_member_list.append(it.first); // pk - } - - result.bt.success = true; - result.bt.write_payload = d.view(); - } else { - if (read_data.size()) { - oxenc::bt_dict_consumer d{read_data}; - try { - version = d.require(VERSION_KEY); - } catch (const std::exception& e) { - result.bt.error = "Failed to parse version: {}"_format(e.what()); - } - - if (result.bt.error.empty()) { - // Initially a dummy list that we will std::move the real list into - oxenc::bt_list_consumer swarm_list("l"); - try { - auto [key, list] = d.next_list_consumer(); - assert(key == NETWORK_SWARMS_KEY); - swarm_list = std::move(list); - } catch (const std::exception& e) { - result.bt.error = "Failed to parse network swarms: {}"_format(e.what()); - } - - while (result.bt.error.empty() && !swarm_list.is_finished()) { - auto swarm = swarm_list.consume_list_consumer(); - uint64_t swarm_id = 0; - try { - swarm_id = swarm.consume(); - } catch (const std::exception& e) { - result.bt.error = - "Failed to parse swarm id from swarm list: {}"_format(e.what()); - continue; - } - 
- std::set& keys = result.network_swarms[swarm_id]; - while (result.bt.error.empty() && !swarm.is_finished()) { - try { - auto bytes = swarm.consume(); - keys.insert(keys.end(), crypto::legacy_pubkey::from_bytes(bytes)); - } catch (const std::exception& e) { - result.bt.error = - "Failed to parse swarm pubkey from swarm: {}"_format(e.what()); - } - } - } - } - - if (result.bt.error.empty()) { - try { - result.swarm_cur_swarm_id = d.require(SWARM_CUR_SWARM_ID); - } catch (const std::exception& e) { - result.bt.error = - "Failed to parse swarm's current swarm ID: {}"_format(e.what()); - } - } + static_assert(data_ready_req::VERSION_KEY < data_ready_req::STATUS_KEY); + static_assert(data_ready_req::STATUS_KEY < data_ready_req::NEED_DB_DUMP_KEY); - if (result.bt.error.empty()) { - oxenc::bt_list_consumer swarm_members("l"); - try { - auto [key, list] = d.next_list_consumer(); - assert(key == SWARM_MEMBERS_KEY); - swarm_members = std::move(list); - } catch (const std::exception& e) { - result.bt.error = "Failed to parse swarm members: {}"_format(e.what()); - } - - while (result.bt.error.empty() && !swarm_members.is_finished()) { - try { - auto bytes = swarm_members.consume(); - result.swarm_members[crypto::legacy_pubkey::from_bytes(bytes)] = {}; - } catch (const std::exception& e) { - result.bt.error = - "Failed to parse swarm member from list: {}"_format(e.what()); - } - } - } - } - result.bt.success = result.bt.error.empty(); - } + oxenc::bt_dict_producer d; + d.append(data_ready_req::VERSION_KEY, version); + d.append(data_ready_req::NEED_DB_DUMP_KEY, needs_db_dump); + return std::move(d).str(); +} - return result; +bool deserialise_data_ready_request(std::string_view data) { + oxenc::bt_dict_consumer d{data}; + auto version = d.require(data_ready_req::VERSION_KEY); + return d.require(data_ready_req::NEED_DB_DUMP_KEY); } ServiceNode::ServiceNode( @@ -239,7 +100,7 @@ ServiceNode::ServiceNode( // handshake that we need to request a DB dump from them to populate our DB. 
In the edge case // where there _are_ 0 messages, this will request a DB dump of 0 messages and essentially // no-op. - if (db->get_message_count(Database::GetMessageCount::Owned) == 0) { + if (db->get_message_count() == 0) { // The 'cur_swarm_id' might be INVALID_SWARM_ID. This will be the case if the DB was deletd // (and so the blobs storing our swarms were also deleted). The swarm is then // bootstrapped to a proper swarm when we process the first handshake from a swarm member. @@ -658,33 +519,30 @@ void ServiceNode::check_new_members() { if (c->version >= SN_DATA_READY_WITH_REQUEST_VERSION) { // Build 'data ready' request - snode::DataReadyRequest request = {}; + bool needs_db_dump{false}; { std::lock_guard network_lock{network().mut_}; if (SwarmMemberState* member = swarm_.is_member_locked(pk); member) { SwarmRequestedDBDump& status = member->our_ss_requested_db_dump; if (status == SwarmRequestedDBDump::NeedsToRequest) { status = SwarmRequestedDBDump::RequestUnderway; - request.needs_db_dump = true; + needs_db_dump = true; } } } // Serialise our response and send it off - snode::SerialiseDataReadyRequestResult serialised = - snode::serialise_data_ready_request(Serialise::Write, "", request); - assert(serialised.bt.success); - + auto serialised = snode::serialise_data_ready_request(needs_db_dump); log::debug( logcat, "Initiating contact with new swarm member {}{}", pk, - request.needs_db_dump ? " (requesting DB dump)" : ""); + needs_db_dump ? 
" (requesting DB dump)" : ""); omq_server_->request( c->pubkey_x25519.view(), "sn.data_ready", on_sn_data_ready_response, - std::move(serialised.bt.write_payload)); + std::move(serialised)); } else { log::debug(logcat, "Initiating contact with new swarm member {}", pk); omq_server_->request( @@ -796,40 +654,9 @@ void ServiceNode::save_bulk(const std::vector& msgs) { log::trace(logcat, "saved messages count: {}", msgs.size()); } -static void store_swarms_blob_if_changed( - uint64_t block_height, - const SerialiseSwarmsResult& serialise_result, - Database& db, - uint64_t& last_hash) { - if (serialise_result.bt.success) { - uint64_t hash = fnv1a64_hasher(serialise_result.bt.write_payload, FNV1A64_SEED); - if (last_hash != hash) { - log::debug( - logcat, - "Swarm state dirtied at blk {}; #{:x} => #{:x}, saving {} to DB", - block_height, - last_hash, - hash, - util::get_human_readable_bytes(serialise_result.bt.write_payload.size())); - last_hash = hash; - } - } else { - if (static bool once = true; once) { - once = false; - log::error( - logcat, - "Failed to serialize swarms to blob: {}", - serialise_result.bt.write_payload); - } - } -} - void ServiceNode::on_bootstrap_update(block_update&& bu) { swarm_.update_swarms(bu.height, std::move(bu.swarms), bu.contacts); target_height_ = std::max(target_height_, bu.height); - - snode::SerialiseSwarmsResult write = serialize_swarms(Serialise::Write, ""); - store_swarms_blob_if_changed(block_height_, write, *db, last_swarms_serialize_hash); } void ServiceNode::on_snodes_update(block_update&& bu) { @@ -872,10 +699,6 @@ void ServiceNode::on_snodes_update(block_update&& bu) { auto events = swarm_.update_swarms(bu.height, std::move(bu.swarms), bu.contacts); - // Serialise state to blob and store into DB if dirtied - snode::SerialiseSwarmsResult write = serialize_swarms(Serialise::Write, ""); - store_swarms_blob_if_changed(block_height_, write, *db, last_swarms_serialize_hash); - if (const SnodeStatus status = events.our_swarm_id != 
INVALID_SWARM_ID ? SnodeStatus::ACTIVE : bu.decommed ? SnodeStatus::DECOMMISSIONED : SnodeStatus::UNSTAKED; @@ -1446,7 +1269,7 @@ std::string ServiceNode::get_status_line() const { STORAGE_SERVER_VERSION_STRING, oxenss::is_mainnet ? "" : " (TESTNET)", syncing_ ? "; SYNCING" : "", - db->get_message_count(Database::GetMessageCount::All), + db->get_message_count(), util::get_human_readable_bytes(db->get_used_bytes()), db->get_owner_count(), stats.client_store_requests, diff --git a/oxenss/snode/service_node.h b/oxenss/snode/service_node.h index 515a01839..599f69860 100644 --- a/oxenss/snode/service_node.h +++ b/oxenss/snode/service_node.h @@ -12,7 +12,6 @@ #include #include -#include #include #include "network.h" #include "swarm.h" @@ -91,13 +90,6 @@ struct RequestRetry { std::vector nodes; }; -struct SerialiseSwarmsResult { - SerialiseBTResult bt; - std::map swarm_members; - swarms_t network_swarms; - swarm_id_t swarm_cur_swarm_id; -}; - /// All service node logic that is not network-specific class ServiceNode { bool syncing_ = true; @@ -219,8 +211,6 @@ class ServiceNode { bool force_start, bool skip_bootstrap); - SerialiseSwarmsResult serialize_swarms(Serialise serialise, std::string_view read_data) const; - const Network& network() { return network_; } const Swarm& swarm() { return swarm_; } @@ -334,17 +324,9 @@ class ServiceNode { void check_retry_requests(); }; -struct DataReadyRequest { - bool needs_db_dump; -}; - -struct SerialiseDataReadyRequestResult { - SerialiseBTResult bt; - DataReadyRequest request; -}; +// at the moment we only care about the "needs_db_dump" boolean +bool deserialise_data_ready_request(std::string_view data); -SerialiseDataReadyRequestResult serialise_data_ready_request( - Serialise serialise, std::string_view read_data, const DataReadyRequest& write_data); } // namespace oxenss::snode template <> diff --git a/oxenss/storage/database.cpp b/oxenss/storage/database.cpp index 3e4d28520..fa70cc76f 100644 --- a/oxenss/storage/database.cpp +++ 
b/oxenss/storage/database.cpp @@ -730,17 +730,8 @@ void Database::clean_expired() { to_epoch_ms(std::chrono::system_clock::now())); } -int64_t Database::get_message_count(GetMessageCount get) { - int64_t result = 0; - switch (get) { - case GetMessageCount::All: - result = get_impl(false)->prepared_get("SELECT COUNT(*) FROM messages"); - break; - case GetMessageCount::Owned: - result = get_impl(false)->prepared_get("SELECT COUNT(*) FROM owned_messages"); - break; - } - return result; +int64_t Database::get_message_count() { + return get_impl(false)->prepared_get("SELECT COUNT(*) FROM messages"); } int64_t Database::get_owner_count() { diff --git a/oxenss/storage/database.hpp b/oxenss/storage/database.hpp index a58e6acba..d4e7fac72 100644 --- a/oxenss/storage/database.hpp +++ b/oxenss/storage/database.hpp @@ -1,7 +1,6 @@ #pragma once #include -#include #include #include @@ -120,7 +119,7 @@ class Database { }; // Return the total number of messages stored - int64_t get_message_count(GetMessageCount get); + int64_t get_message_count(); // Returns the per-owner counts of stored messages, for storage statistics purposes. 
std::vector get_message_counts(); From 6bfd9540b945d25a9d2c52e9bccff0b149c61b48 Mon Sep 17 00:00:00 2001 From: Thomas Winget Date: Wed, 4 Mar 2026 01:30:40 -0500 Subject: [PATCH 37/50] format --- network-tests/test_subaccount_auth.py | 7 +-- oxenss/common/pubkey.h | 1 - oxenss/rpc/request_handler.cpp | 9 +-- oxenss/server/omq.cpp | 5 +- oxenss/snode/network.cpp | 10 +-- oxenss/snode/service_node.cpp | 78 +++++++++++++----------- oxenss/snode/service_node.h | 2 - oxenss/storage/database.cpp | 88 +++++++++++++++++---------- oxenss/storage/database.hpp | 18 +++++- 9 files changed, 126 insertions(+), 92 deletions(-) diff --git a/network-tests/test_subaccount_auth.py b/network-tests/test_subaccount_auth.py index ce0a0abe6..4daf5c09f 100644 --- a/network-tests/test_subaccount_auth.py +++ b/network-tests/test_subaccount_auth.py @@ -336,7 +336,7 @@ def test_revoke_subaccount(omq, random_sn, sk, exclude): f"revoked_subaccounts{ts}".encode(), encoder=Base64Encoder ).signature.decode(), } - ).encode(), + ).encode() ], ).get() assert len(r) == 1 @@ -408,7 +408,7 @@ def test_revoke_subaccount(omq, random_sn, sk, exclude): f"revoked_subaccounts{ts}".encode(), encoder=Base64Encoder ).signature.decode(), } - ).encode(), + ).encode() ], ).get() assert len(r) == 1 @@ -416,7 +416,6 @@ def test_revoke_subaccount(omq, random_sn, sk, exclude): assert len(r["revoked_subaccounts"]) == 1 assert r["revoked_subaccounts"][0] == b64(dude_token) - # But the one in the revoked-keys-allowed namespace should work: r = omq.request_future( conn, @@ -511,7 +510,7 @@ def test_revoke_subaccount(omq, random_sn, sk, exclude): f"revoked_subaccounts{ts}".encode(), encoder=Base64Encoder ).signature.decode(), } - ).encode(), + ).encode() ], ).get() assert len(r) == 1 diff --git a/oxenss/common/pubkey.h b/oxenss/common/pubkey.h index f324fc550..e2b19eb98 100644 --- a/oxenss/common/pubkey.h +++ b/oxenss/common/pubkey.h @@ -58,7 +58,6 @@ class user_pubkey { std::string prefixed_raw() const; }; - /// Maps a 
pubkey into a 64-bit "swarm space" value; the swarm you belong to is whichever one /// has a swarm id closest to this pubkey-derived value. uint64_t pubkey_to_swarm_space(const user_pubkey& pk); diff --git a/oxenss/rpc/request_handler.cpp b/oxenss/rpc/request_handler.cpp index 9595cfdca..877cc5c4c 100644 --- a/oxenss/rpc/request_handler.cpp +++ b/oxenss/rpc/request_handler.cpp @@ -465,15 +465,15 @@ static void distribute_command(snode::ServiceNode& sn, std::shared_ptrpending--; - res->db_req_id = sn.db->add_retry_request(peer.first, res->cmd, res->req_payload, res->db_req_id); + res->db_req_id = sn.db->add_retry_request( + peer.first, res->cmd, res->req_payload, res->db_req_id); continue; } sn.omq_server()->request( ct->pubkey_x25519.view(), "sn.storage_cc", - [res, peer, peer_ed = ct->pubkey_ed25519, &sn]( - bool success, auto parts) { + [res, peer, peer_ed = ct->pubkey_ed25519, &sn](bool success, auto parts) { json peer_result; SNStorageCCResult store_result = interpret_sn_storage_cc_response_parts(success, parts); @@ -518,7 +518,8 @@ static void distribute_command(snode::ServiceNode& sn, std::shared_ptrdb_req_id = sn.db->add_retry_request(peer.first, res->cmd, res->req_payload, res->db_req_id); + res->db_req_id = sn.db->add_retry_request( + peer.first, res->cmd, res->req_payload, res->db_req_id); } } else if (res->b64) { if (auto it = peer_result.find("signature"); diff --git a/oxenss/server/omq.cpp b/oxenss/server/omq.cpp index 84b6058b9..975e29c76 100644 --- a/oxenss/server/omq.cpp +++ b/oxenss/server/omq.cpp @@ -72,9 +72,8 @@ void OMQ::handle_sn_data_ready(oxenmq::Message& message) { bool needs_db_dump{false}; try { needs_db_dump = snode::deserialise_data_ready_request(message.data[0]); - } - catch (const std::exception& e) { - log::info(logcat, "DataReadyRequest deserialization error: {}", e.what()); + } catch (const std::exception& e) { + log::info(logcat, "DataReadyRequest deserialization error: {}", e.what()); return message.send_reply("Request payload 
malformed."); } diff --git a/oxenss/snode/network.cpp b/oxenss/snode/network.cpp index 3d0d28c5b..40e287322 100644 --- a/oxenss/snode/network.cpp +++ b/oxenss/snode/network.cpp @@ -12,7 +12,8 @@ namespace oxenss::snode { Network::Network(oxenmq::OxenMQ& omq) : contacts{omq} {} std::pair Network::get_swarm_boundaries(const uint64_t swarm) const { - if (swarms_.size() <= 1) return {0,0}; + if (swarms_.size() <= 1) + return {0, 0}; const auto it = swarms_.find(swarm); if (it == swarms_.end()) @@ -23,8 +24,7 @@ std::pair Network::get_swarm_boundaries(const uint64_t swarm if (it == swarms_.begin()) { next_swarm = std::next(it)->first; prev_swarm = std::prev(swarms_.end())->first; - } - else { + } else { prev_swarm = std::prev(it)->first; auto it2 = std::next(it); if (it2 == swarms_.end()) @@ -42,14 +42,14 @@ std::pair Network::get_swarm_boundaries(const uint64_t swarm // with the average as the lower bound if target is the larger swarm id if (prev_swarm == next_swarm) { uint64_t avg = (swarm + prev_swarm) / 2; - uint64_t shift = (uint64_t)1<<63; + uint64_t shift = (uint64_t)1 << 63; if (swarm > prev_swarm) return {avg, avg + shift}; else return {avg + shift, avg}; } - return {(swarm + prev_swarm)/2, (swarm + next_swarm)/2}; + return {(swarm + prev_swarm) / 2, (swarm + next_swarm) / 2}; } swarms_t::const_iterator Network::_find_swarm_for(const user_pubkey& pk) const { diff --git a/oxenss/snode/service_node.cpp b/oxenss/snode/service_node.cpp index 04669deca..3bf946e04 100644 --- a/oxenss/snode/service_node.cpp +++ b/oxenss/snode/service_node.cpp @@ -556,8 +556,9 @@ void ServiceNode::check_new_members() { "Initiating swarm message dump to swarm member(s): {}", fmt::join(send_now, ", ")); auto boundaries = network_.get_swarm_boundaries(swarm_.cur_swarm_id_); - db->foreach_swarm_message([&send_now, this](const std::vector& messages) { - relay_messages(messages, send_now); + db->foreach_swarm_message( + [&send_now, this](const std::vector& messages) { + 
relay_messages(messages, send_now); }, boundaries.first, boundaries.second); @@ -1086,15 +1087,15 @@ void ServiceNode::bootstrap_swarms(const std::set& swarms) const { return; } swarms_ptr = &*all_swarms; - } - else if (logcat->level() <= log::Level::info) + } else if (logcat->level() <= log::Level::info) log::info(logcat, "Bootstrapping swarms: [{}]", fmt::join(*swarms_ptr, ", ")); for (const auto& swarm_id : *swarms_ptr) { if (auto swarm = network_.get_swarm(swarm_id)) { auto boundaries = network_.get_swarm_boundaries(swarm_id); - db->foreach_swarm_message([&swarm, this](const std::vector& messages) { - relay_messages(messages, *swarm); + db->foreach_swarm_message( + [&swarm, this](const std::vector& messages) { + relay_messages(messages, *swarm); }, boundaries.first, boundaries.second); @@ -1304,37 +1305,40 @@ void ServiceNode::process_push_batch(std::string_view blob, std::string_view sen } void ServiceNode::check_retry_requests() { - db->foreach_ready_retry_request([this](const crypto::legacy_pubkey& key, const std::string& cmd, const std::string& payload, int64_t req_id) { - //FIXME: non-swarm-member retries should be purged automatically - //std::optional is_member = swarm_.is_member(key); - - crypto::x25519_pubkey pubkey_x25519 = {}; - - auto ct = contacts().find(key); - if (ct && *ct) - pubkey_x25519 = ct->pubkey_x25519; - - if (pubkey_x25519) { - auto on_request_done = [this, req_id](bool success, std::vector parts) { - // We cleanup the request in all situations except timeout (timeout - // indicating that the node was non-responsive, maybe offline). In an error - // state we don't know what state the recipient's storage server is in and - // we default to deleting it and ending the retry attempts. 
- rpc::SNStorageCCResult store_result = - rpc::interpret_sn_storage_cc_response_parts(success, parts); - if (store_result.status != rpc::SNStorageCCResultStatus::Timeout) { - db->remove_node_retry_request(req_id); - } - }; - omq_server()->request( - pubkey_x25519.view(), - "sn.storage_cc", - on_request_done, - cmd, - payload, - oxenmq::send_option::request_timeout{5s}); - } - }); + db->foreach_ready_retry_request([this](const crypto::legacy_pubkey& key, + const std::string& cmd, + const std::string& payload, + int64_t req_id) { + // FIXME: non-swarm-member retries should be purged automatically + // std::optional is_member = swarm_.is_member(key); + + crypto::x25519_pubkey pubkey_x25519 = {}; + + auto ct = contacts().find(key); + if (ct && *ct) + pubkey_x25519 = ct->pubkey_x25519; + + if (pubkey_x25519) { + auto on_request_done = [this, req_id](bool success, std::vector parts) { + // We cleanup the request in all situations except timeout (timeout + // indicating that the node was non-responsive, maybe offline). In an error + // state we don't know what state the recipient's storage server is in and + // we default to deleting it and ending the retry attempts. 
+ rpc::SNStorageCCResult store_result = + rpc::interpret_sn_storage_cc_response_parts(success, parts); + if (store_result.status != rpc::SNStorageCCResultStatus::Timeout) { + db->remove_node_retry_request(req_id); + } + }; + omq_server()->request( + pubkey_x25519.view(), + "sn.storage_cc", + on_request_done, + cmd, + payload, + oxenmq::send_option::request_timeout{5s}); + } + }); } void ServiceNode::retryable_requests_thread_entry_point() { diff --git a/oxenss/snode/service_node.h b/oxenss/snode/service_node.h index 599f69860..ee54115e6 100644 --- a/oxenss/snode/service_node.h +++ b/oxenss/snode/service_node.h @@ -105,12 +105,10 @@ class ServiceNode { std::weak_ptr http_; public: - // bit messy, but Swarm needs db startup version, so db has to init before Swarm std::unique_ptr db; private: - SnodeStatus status_ = SnodeStatus::UNKNOWN; const crypto::legacy_keypair our_keys_; diff --git a/oxenss/storage/database.cpp b/oxenss/storage/database.cpp index fa70cc76f..086739800 100644 --- a/oxenss/storage/database.cpp +++ b/oxenss/storage/database.cpp @@ -234,9 +234,11 @@ namespace { } // namespace -user_pubkey load_pubkey(uint8_t type, std::string pk) { return {type, std::move(pk)}; } +user_pubkey load_pubkey(uint8_t type, std::string pk) { + return {type, std::move(pk)}; +} -void sqlite_swarm_space(sqlite3_context *sqlite_context, int argc, sqlite3_value **argv, bool hi) { +void sqlite_swarm_space(sqlite3_context* sqlite_context, int argc, sqlite3_value** argv, bool hi) { assert(argc == 1); auto sz = sqlite3_value_bytes(argv[0]); assert(sz == 32); @@ -244,21 +246,22 @@ void sqlite_swarm_space(sqlite3_context *sqlite_context, int argc, sqlite3_value auto pubkey = load_pubkey(0 /* irrelevant */, {reinterpret_cast(key_blob), 32}); auto swarm_space = pubkey_to_swarm_space(pubkey); - if (hi) swarm_space = swarm_space >> 32; - else swarm_space = swarm_space & 0xffffffff; + if (hi) + swarm_space = swarm_space >> 32; + else + swarm_space = swarm_space & 0xffffffff; 
sqlite3_result_int64(sqlite_context, swarm_space); } -void sqlite_swarm_space_hi(sqlite3_context *sqlite_context, int argc, sqlite3_value **argv) { +void sqlite_swarm_space_hi(sqlite3_context* sqlite_context, int argc, sqlite3_value** argv) { sqlite_swarm_space(sqlite_context, argc, argv, true); } -void sqlite_swarm_space_lo(sqlite3_context *sqlite_context, int argc, sqlite3_value **argv) { +void sqlite_swarm_space_lo(sqlite3_context* sqlite_context, int argc, sqlite3_value** argv) { sqlite_swarm_space(sqlite_context, argc, argv, false); } - class DatabaseImpl { public: oxenss::Database& parent; @@ -370,8 +373,10 @@ CREATE TRIGGER IF NOT EXISTS revoked_autoclean // use version for schema changes from now if (parent._startup_version == 0) { - log::info(logcat, - "Upgrading database schema: adding swarm space cache, runtime state and retryable requests"); + log::info( + logcat, + "Upgrading database schema: adding swarm space cache, runtime state and " + "retryable requests"); // swarm space is 64-bit unsigned, which means unfortunately we can't do queries // on it with arithmetic properly (sqlite INTEGER is 64-bit signed). 
As such, we @@ -402,8 +407,8 @@ swarm_space_lo = func_swarm_space_lo(pubkey) WHERE swarm_space_hi = -1; )"); - auto stmt = prepared_st( - "SELECT * from retry_node_reqs WHERE next_retry < unixepoch('now', 'subsec')"); + auto stmt = prepared_st( + "SELECT * from retry_node_reqs WHERE next_retry < unixepoch('now', 'subsec')"); db.exec(R"( CREATE TABLE retry_requests ( @@ -1318,32 +1323,45 @@ void oxenss::Database::test_suite_block_for(std::chrono::milliseconds duration) std::this_thread::sleep_for(duration); } -int64_t Database::add_retry_request(const crypto::legacy_pubkey& key, const std::string& cmd, const std::string& payload, int64_t req_id) { - auto impl = get_impl(/*write =*/ true); +int64_t Database::add_retry_request( + const crypto::legacy_pubkey& key, + const std::string& cmd, + const std::string& payload, + int64_t req_id) { + auto impl = get_impl(/*write =*/true); // insert into request table if not present if (req_id == 0) { req_id = impl->prepared_get( "INSERT INTO retry_requests (command, payload) values (?,?) RETURNING id", cmd, - payload - ); + payload); } // first retry 5 seconds after insertion, subsequent retries will be 60 seconds after the last. 
- impl->prepared_exec("INSERT INTO retry_node_reqs (rr_id, pubkey, next_retry) VALUES(?, ?, unixepoch('now', 'subsec') + 5);", req_id, key.str()); + impl->prepared_exec( + "INSERT INTO retry_node_reqs (rr_id, pubkey, next_retry) VALUES(?, ?, unixepoch('now', " + "'subsec') + 5);", + req_id, + key.str()); return req_id; } -void Database::foreach_ready_retry_request(std::function callback) { - auto impl = get_impl(/*write =*/ true); +void Database::foreach_ready_retry_request(std::function< + void(const crypto::legacy_pubkey& key, + const std::string& cmd, + const std::string& payload, + int64_t req_id)> callback) { + auto impl = get_impl(/*write =*/true); auto stmt = impl->prepared_st( - "SELECT * from retry_node_reqs WHERE next_retry < unixepoch('now', 'subsec')"); + "SELECT * from retry_node_reqs WHERE next_retry < unixepoch('now', 'subsec')"); using sql_duration = std::chrono::duration>; - double now = std::chrono::duration_cast(std::chrono::system_clock::now().time_since_epoch()).count(); + double now = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); // retry 60 seconds after this retry. Initial retries are staggered (5sec after timeout), // but it doesn't seem useful to stagger here. 
Further, it would be a pain to do so after @@ -1357,10 +1375,13 @@ void Database::foreach_ready_retry_request(std::function&)> callback, uint64_t lower_bound, uint64_t upper_bound, bool zero_inclusive) { +void Database::foreach_swarm_message( + std::function&)> callback, + uint64_t lower_bound, + uint64_t upper_bound, + bool zero_inclusive) { if (lower_bound > upper_bound) { foreach_swarm_message(callback, lower_bound, std::numeric_limits::max()); @@ -1368,7 +1389,7 @@ void Database::foreach_swarm_message(std::functiondb, + statement = SQLite::Statement{ + impl->db, "SELECT type, pubkey, hash, namespace, timestamp, expiry, data" " FROM owned_messages ORDER BY mid"}; - } - else { + } else { // there's probably a better way to do this, but it should be fine std::string query = R"( SELECT type, pubkey, hash, namespace, timestamp, expiry, data @@ -1426,24 +1447,25 @@ WHERE } void Database::remove_node_retry_request(int64_t req_id) { - auto impl = get_impl(/*write =*/ true); + auto impl = get_impl(/*write =*/true); impl->prepared_exec("DELETE FROM retry_node_reqs WHERE id = ?", req_id); } void Database::update_current_swarm(uint64_t swarm_id) { auto as_hex = oxenc::bt_serialize(swarm_id); - auto impl = get_impl(/*write =*/ true); - impl->prepared_exec("INSERT INTO state_kv (key, value) VALUES ('swarm_id', ?) ON CONFLICT REPLACE;", + auto impl = get_impl(/*write =*/true); + impl->prepared_exec( + "INSERT INTO state_kv (key, value) VALUES ('swarm_id', ?) 
ON CONFLICT REPLACE;", as_hex); } std::optional Database::get_current_swarm() { - auto impl = get_impl(/*write =*/ false); + auto impl = get_impl(/*write =*/false); try { - auto as_hex = impl->prepared_get("SELECT value FROM state_kv WHERE key = 'swarm_id';"); + auto as_hex = impl->prepared_get( + "SELECT value FROM state_kv WHERE key = 'swarm_id';"); return oxenc::bt_deserialize(as_hex); - } - catch (const std::exception& e) { + } catch (const std::exception& e) { return std::nullopt; } return std::nullopt; diff --git a/oxenss/storage/database.hpp b/oxenss/storage/database.hpp index d4e7fac72..af3414d7f 100644 --- a/oxenss/storage/database.hpp +++ b/oxenss/storage/database.hpp @@ -229,18 +229,30 @@ class Database { // Adds a request retry to the database, to be retried later. If req_id is specified, this // is a subsequent failure on the same request. It's not great to leak database table indices // into the rest of the code if avoidable, but deduplication would be otherwise tedious. - int64_t add_retry_request(const crypto::legacy_pubkey& key, const std::string& cmd, const std::string& payload, int64_t req_id = 0); + int64_t add_retry_request( + const crypto::legacy_pubkey& key, + const std::string& cmd, + const std::string& payload, + int64_t req_id = 0); // executes the provided callback for each request retry in the database which ready to retry. // The table id is provided so the callback can call remove_retry_request on success. - void foreach_ready_retry_request(std::function); + void foreach_ready_retry_request(std::function< + void(const crypto::legacy_pubkey& key, + const std::string& cmd, + const std::string& payload, + int64_t req_id)>); // executes the provided callback for every swarm message (in batches) for the swarm with the // given swarm space boundaries. The lower bound is exclusive; the upper inclusive. // if the lower bound is higher than the upper bound (i.e. overflow wrapping), will be called // recursively on both sides of the overflow. 
In this case, zero as the lower bound *will* // be inclusive - void foreach_swarm_message(std::function&)> callback, uint64_t lower_bound, uint64_t upper_bound, bool zero_inclusive=false); + void foreach_swarm_message( + std::function&)> callback, + uint64_t lower_bound, + uint64_t upper_bound, + bool zero_inclusive = false); // Remove the specified request retry. This is one node's retry request, not the request // itself -- if no more nodes need the request retried it will be removed as well. From 13fde68e9599f07054475d507307ad24be205945 Mon Sep 17 00:00:00 2001 From: Thomas Winget Date: Wed, 4 Mar 2026 14:28:07 -0500 Subject: [PATCH 38/50] Do not store public outbox message if newer present --- oxenss/storage/database.cpp | 17 +++++++++++++++-- oxenss/storage/database.hpp | 1 + 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/oxenss/storage/database.cpp b/oxenss/storage/database.cpp index 086739800..fa0c5d15b 100644 --- a/oxenss/storage/database.cpp +++ b/oxenss/storage/database.cpp @@ -811,9 +811,22 @@ StoreResult Database::store(const message& msg, std::chrono::system_clock::time_ owner_id = impl->prepared_get( "INSERT INTO owners (pubkey, type) VALUES (?, ?) RETURNING id", msg.pubkey); - // When storing to a public namespace we clear anything there (except for a duplicate, to - // avoid unnecessary storage churn). + // When storing to a public namespace we replace any earlier message (except for a + // duplicate, to avoid unnecessary storage churn). If the stored message is newer, return + // early if (is_public_outbox_namespace(msg.msg_namespace)) { + if (auto maybe_times = exec_and_maybe_get( + impl->prepared_st("SELECT timestamp, expiry FROM messages" + " WHERE owner = ? 
AND namespace = ?"), + owner_id, + msg.msg_namespace)) { + if (maybe_times->first > to_epoch_ms(msg.timestamp)) { + log::trace(logcat, "Not storing message; newer public outbox message present."); + if (expiry) + *expiry = from_epoch_ms(maybe_times->second); + return StoreResult::Obsolete; + } + } impl->prepared_exec( "DELETE FROM messages" " WHERE owner = ? AND namespace = ? AND hash != ?", diff --git a/oxenss/storage/database.hpp b/oxenss/storage/database.hpp index af3414d7f..2e352d205 100644 --- a/oxenss/storage/database.hpp +++ b/oxenss/storage/database.hpp @@ -30,6 +30,7 @@ enum class StoreResult { New, // Message did not exist and was inserted. Extended, // Message existed, but the expiry was extended to match the stored timestamp. Exists, // Message exists and already has an expiry >= the stored one. + Obsolete, // Newer message exists and message type is singleton (e.g. public outbox) Full, // Can't insert right now because the database is full. }; From 726f0826ff2e220ebb20abcdffcd81b5550ea593 Mon Sep 17 00:00:00 2001 From: Thomas Winget Date: Wed, 4 Mar 2026 16:38:11 -0500 Subject: [PATCH 39/50] add limit (and order) to query While public namespaces should be limit 1 per 'owner', if somehow that is not the case we still don't want this query to fail, but rather return the newest matching row. --- oxenss/storage/database.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/oxenss/storage/database.cpp b/oxenss/storage/database.cpp index fa0c5d15b..086363ec9 100644 --- a/oxenss/storage/database.cpp +++ b/oxenss/storage/database.cpp @@ -817,7 +817,8 @@ StoreResult Database::store(const message& msg, std::chrono::system_clock::time_ if (is_public_outbox_namespace(msg.msg_namespace)) { if (auto maybe_times = exec_and_maybe_get( impl->prepared_st("SELECT timestamp, expiry FROM messages" - " WHERE owner = ? AND namespace = ?"), + " WHERE owner = ? AND namespace = ?" 
+ " ORDER BY timestamp DESC LIMIT 1;"), owner_id, msg.msg_namespace)) { if (maybe_times->first > to_epoch_ms(msg.timestamp)) { From 2bcc0c6f6b4eb31e7757f27abfc68e72c73a8962 Mon Sep 17 00:00:00 2001 From: Thomas Winget Date: Wed, 4 Mar 2026 17:24:04 -0500 Subject: [PATCH 40/50] Add unique index on public namespace and owner Letting the database handle this constraint is more correct and allows for simpler code anyway. --- oxenss/storage/database.cpp | 57 ++++++++++++++++++------------------- 1 file changed, 27 insertions(+), 30 deletions(-) diff --git a/oxenss/storage/database.cpp b/oxenss/storage/database.cpp index 086363ec9..c411da67d 100644 --- a/oxenss/storage/database.cpp +++ b/oxenss/storage/database.cpp @@ -375,8 +375,8 @@ CREATE TRIGGER IF NOT EXISTS revoked_autoclean if (parent._startup_version == 0) { log::info( logcat, - "Upgrading database schema: adding swarm space cache, runtime state and " - "retryable requests"); + "Upgrading database schema: adding swarm space cache, runtime state, " + "retryable requests, and public namespace unique constraint"); // swarm space is 64-bit unsigned, which means unfortunately we can't do queries // on it with arithmetic properly (sqlite INTEGER is 64-bit signed). As such, we @@ -471,6 +471,14 @@ CREATE TABLE state_kv ( UNIQUE(key) ); +-- public namespaces are at most used for testing before this migration, so clear them before +-- adding the unique owner/namespace index +DELETE FROM messages WHERE namespace < 0 AND namespace % 20 = -1; + +CREATE UNIQUE INDEX message_outbox_singleton +ON messages(owner, namespace) +WHERE namespace < 0 AND namespace % 20 = -1; + PRAGMA user_version = 1; )"); } @@ -811,31 +819,6 @@ StoreResult Database::store(const message& msg, std::chrono::system_clock::time_ owner_id = impl->prepared_get( "INSERT INTO owners (pubkey, type) VALUES (?, ?) 
RETURNING id", msg.pubkey); - // When storing to a public namespace we replace any earlier message (except for a - // duplicate, to avoid unnecessary storage churn). If the stored message is newer, return - // early - if (is_public_outbox_namespace(msg.msg_namespace)) { - if (auto maybe_times = exec_and_maybe_get( - impl->prepared_st("SELECT timestamp, expiry FROM messages" - " WHERE owner = ? AND namespace = ?" - " ORDER BY timestamp DESC LIMIT 1;"), - owner_id, - msg.msg_namespace)) { - if (maybe_times->first > to_epoch_ms(msg.timestamp)) { - log::trace(logcat, "Not storing message; newer public outbox message present."); - if (expiry) - *expiry = from_epoch_ms(maybe_times->second); - return StoreResult::Obsolete; - } - } - impl->prepared_exec( - "DELETE FROM messages" - " WHERE owner = ? AND namespace = ? AND hash != ?", - owner_id, - msg.msg_namespace, - msg.hash); - } - auto new_exp = to_epoch_ms(msg.expiry); if (auto existing = exec_and_maybe_get( @@ -852,15 +835,25 @@ StoreResult Database::store(const message& msg, std::chrono::system_clock::time_ if (expiry) *expiry = from_epoch_ms(exp); } else { - impl->prepared_exec( + auto rows = impl->prepared_exec( "INSERT INTO messages (owner, hash, namespace, timestamp, expiry, data)" - " VALUES (?, ?, ?, ?, ?, ?)", + " VALUES (?, ?, ?, ?, ?, ?)" + " ON CONFLICT (owner, namespace) WHERE namespace < 0 AND namespace % 20 = -1" + " DO UPDATE SET" + " hash = EXCLUDED.hash, timestamp = EXCLUDED.timestamp," + " expiry = EXCLUDED.expiry, data = EXCLUDED.data" + " WHERE EXCLUDED.timestamp > messages.timestamp;", owner_id, msg.hash, msg.msg_namespace, to_epoch_ms(msg.timestamp), to_epoch_ms(msg.expiry), blob_binder{msg.data}); + + // did not insert, which means public namespace and not newer + if (rows == 0) + return StoreResult::Obsolete; + ret = StoreResult::New; if (expiry) @@ -914,7 +907,11 @@ void Database::bulk_store(const std::vector& items) { auto insert_message = impl->prepared_st( "INSERT INTO messages (owner, hash, 
namespace, timestamp, expiry, data)" " VALUES (?, ?, ?, ?, ?, ?)" - " ON CONFLICT DO NOTHING"); + " ON CONFLICT (owner, namespace) WHERE namespace < 0 AND namespace % 20 = -1" + " DO UPDATE SET" + " hash = EXCLUDED.hash, timestamp = EXCLUDED.timestamp," + " expiry = EXCLUDED.expiry, data = EXCLUDED.data" + " WHERE EXCLUDED.timestamp > messages.timestamp;"); for (auto& m : items) { if (!m.pubkey) From 3aa276283b481afa833c44de55b6a91b41d6f872 Mon Sep 17 00:00:00 2001 From: Thomas Winget Date: Thu, 5 Mar 2026 13:16:12 -0500 Subject: [PATCH 41/50] fix unused variable errors --- oxenss/snode/service_node.cpp | 2 +- oxenss/storage/database.cpp | 24 ++++++++++-------------- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/oxenss/snode/service_node.cpp b/oxenss/snode/service_node.cpp index 3bf946e04..510fae7e4 100644 --- a/oxenss/snode/service_node.cpp +++ b/oxenss/snode/service_node.cpp @@ -70,7 +70,7 @@ std::string serialise_data_ready_request(bool needs_db_dump) { bool deserialise_data_ready_request(std::string_view data) { oxenc::bt_dict_consumer d{data}; - auto version = d.require(data_ready_req::VERSION_KEY); + [[maybe_unused]] auto version = d.require(data_ready_req::VERSION_KEY); return d.require(data_ready_req::NEED_DB_DUMP_KEY); } diff --git a/oxenss/storage/database.cpp b/oxenss/storage/database.cpp index c411da67d..279c7d06d 100644 --- a/oxenss/storage/database.cpp +++ b/oxenss/storage/database.cpp @@ -238,10 +238,10 @@ user_pubkey load_pubkey(uint8_t type, std::string pk) { return {type, std::move(pk)}; } -void sqlite_swarm_space(sqlite3_context* sqlite_context, int argc, sqlite3_value** argv, bool hi) { +void sqlite_swarm_space( + sqlite3_context* sqlite_context, [[maybe_unused]] int argc, sqlite3_value** argv, bool hi) { assert(argc == 1); - auto sz = sqlite3_value_bytes(argv[0]); - assert(sz == 32); + assert(sqlite3_value_bytes(argv[0])); auto* key_blob = sqlite3_value_blob(argv[0]); auto pubkey = load_pubkey(0 /* irrelevant */, 
{reinterpret_cast(key_blob), 32}); auto swarm_space = pubkey_to_swarm_space(pubkey); @@ -774,9 +774,14 @@ int64_t Database::get_used_bytes() { impl->prepared_get("PRAGMA freelist_count") * impl->page_size; } -static std::optional get_message(DatabaseImpl& impl, SQLite::Statement& st) { +std::optional Database::retrieve_by_hash(const std::string& msg_hash) { + auto impl = get_impl(false); + auto st = impl->prepared_st( + "SELECT hash, type, pubkey, namespace, timestamp, expiry, data" + " FROM owned_messages WHERE hash = ?"); + st->bindNoCopy(1, msg_hash); std::optional msg; - while (st.executeStep()) { + while (st->executeStep()) { assert(!msg); auto [hash, otype, opubkey, ns, ts, exp, data] = get( @@ -792,15 +797,6 @@ static std::optional get_message(DatabaseImpl& impl, SQLite::Statement& return msg; } -std::optional Database::retrieve_by_hash(const std::string& msg_hash) { - auto impl = get_impl(false); - auto st = impl->prepared_st( - "SELECT hash, type, pubkey, namespace, timestamp, expiry, data" - " FROM owned_messages WHERE hash = ?"); - st->bindNoCopy(1, msg_hash); - return get_message(*impl, st); -} - StoreResult Database::store(const message& msg, std::chrono::system_clock::time_point* expiry) { auto impl = get_impl(true); From e1e4be21b688e7a56006990dbf385b06a02d9e01 Mon Sep 17 00:00:00 2001 From: Thomas Winget Date: Thu, 5 Mar 2026 16:16:00 -0500 Subject: [PATCH 42/50] fix unit tests, including broken db queries --- oxenss/storage/database.cpp | 28 +++++++++++++--------------- unit_test/storage.cpp | 26 +++++++++++++------------- unit_test/swarm.cpp | 30 ++++++++++++++++++------------ 3 files changed, 44 insertions(+), 40 deletions(-) diff --git a/oxenss/storage/database.cpp b/oxenss/storage/database.cpp index 279c7d06d..852a2301b 100644 --- a/oxenss/storage/database.cpp +++ b/oxenss/storage/database.cpp @@ -389,14 +389,13 @@ ALTER TABLE owners ADD COLUMN swarm_space_hi INTEGER NOT NULL DEFAULT -1; ALTER TABLE owners ADD COLUMN swarm_space_lo INTEGER NOT 
NULL DEFAULT -1; CREATE TRIGGER swarm_space_trigger -BEFORE INSERT ON owners +AFTER INSERT ON owners FOR EACH ROW -WHEN NEW.swarm_space_lo = -1 +WHEN NEW.swarm_space_hi = -1 OR NEW.swarm_space_lo = -1 BEGIN - INSERT INTO owners (id, type, pubkey, swarm_space_hi, swarm_space_lo) - VALUES (NEW.id, NEW.type, NEW.pubkey, func_swarm_space_hi(NEW.pubkey), func_swarm_space_lo(NEW.pubkey); - - SELECT RAISE(IGNORE); -- skips the original insert since we replaced it + UPDATE owners SET + swarm_space_hi = func_swarm_space_hi(NEW.pubkey), swarm_space_lo = func_swarm_space_lo(NEW.pubkey) + WHERE owners.id = NEW.id; END; )"); @@ -407,9 +406,6 @@ swarm_space_lo = func_swarm_space_lo(pubkey) WHERE swarm_space_hi = -1; )"); - auto stmt = prepared_st( - "SELECT * from retry_node_reqs WHERE next_retry < unixepoch('now', 'subsec')"); - db.exec(R"( CREATE TABLE retry_requests ( id INTEGER PRIMARY KEY, @@ -838,7 +834,7 @@ StoreResult Database::store(const message& msg, std::chrono::system_clock::time_ " DO UPDATE SET" " hash = EXCLUDED.hash, timestamp = EXCLUDED.timestamp," " expiry = EXCLUDED.expiry, data = EXCLUDED.data" - " WHERE EXCLUDED.timestamp > messages.timestamp;", + " WHERE EXCLUDED.timestamp > messages.timestamp", owner_id, msg.hash, msg.msg_namespace, @@ -903,11 +899,14 @@ void Database::bulk_store(const std::vector& items) { auto insert_message = impl->prepared_st( "INSERT INTO messages (owner, hash, namespace, timestamp, expiry, data)" " VALUES (?, ?, ?, ?, ?, ?)" + " ON CONFLICT (hash)" + " DO UPDATE SET" + " expiry = MAX(EXCLUDED.expiry, messages.expiry)" " ON CONFLICT (owner, namespace) WHERE namespace < 0 AND namespace % 20 = -1" " DO UPDATE SET" " hash = EXCLUDED.hash, timestamp = EXCLUDED.timestamp," " expiry = EXCLUDED.expiry, data = EXCLUDED.data" - " WHERE EXCLUDED.timestamp > messages.timestamp;"); + " WHERE EXCLUDED.timestamp > messages.timestamp"); for (auto& m : items) { if (!m.pubkey) @@ -1348,7 +1347,7 @@ int64_t Database::add_retry_request( // first retry 
5 seconds after insertion, subsequent retries will be 60 seconds after the last. impl->prepared_exec( "INSERT INTO retry_node_reqs (rr_id, pubkey, next_retry) VALUES(?, ?, unixepoch('now', " - "'subsec') + 5);", + "'subsec') + 5)", req_id, key.str()); @@ -1462,15 +1461,14 @@ void Database::update_current_swarm(uint64_t swarm_id) { auto as_hex = oxenc::bt_serialize(swarm_id); auto impl = get_impl(/*write =*/true); impl->prepared_exec( - "INSERT INTO state_kv (key, value) VALUES ('swarm_id', ?) ON CONFLICT REPLACE;", - as_hex); + "INSERT OR REPLACE INTO state_kv (key, value) VALUES ('swarm_id', ?)", as_hex); } std::optional Database::get_current_swarm() { auto impl = get_impl(/*write =*/false); try { auto as_hex = impl->prepared_get( - "SELECT value FROM state_kv WHERE key = 'swarm_id';"); + "SELECT value FROM state_kv WHERE key = 'swarm_id'"); return oxenc::bt_deserialize(as_hex); } catch (const std::exception& e) { return std::nullopt; diff --git a/unit_test/storage.cpp b/unit_test/storage.cpp index cce142655..96460481b 100644 --- a/unit_test/storage.cpp +++ b/unit_test/storage.cpp @@ -42,7 +42,7 @@ TEST_CASE("storage - data persistence", "[storage]") { CHECK(storage.store({pubkey, hash, ns, now, now + ttl, bytes}) == StoreResult::New); CHECK(storage.get_owner_count() == 1); - CHECK(storage.get_message_count(Database::GetMessageCount::All) == 1); + CHECK(storage.get_message_count() == 1); // the database is closed when storage goes out of scope } @@ -51,7 +51,7 @@ TEST_CASE("storage - data persistence", "[storage]") { Database storage{"."}; CHECK(storage.get_owner_count() == 1); - CHECK(storage.get_message_count(Database::GetMessageCount::All) == 1); + CHECK(storage.get_message_count() == 1); auto [items, more] = storage.retrieve(pubkey, namespace_id::Default, ""); @@ -79,7 +79,7 @@ TEST_CASE("storage - data persistence, namespace", "[storage][namespace]") { CHECK(storage.store({pubkey, hash, ns, now, now + ttl, bytes}) == StoreResult::New); 
CHECK(storage.get_owner_count() == 1); - CHECK(storage.get_message_count(Database::GetMessageCount::All) == 1); + CHECK(storage.get_message_count() == 1); // the database is closed when storage goes out of scope } @@ -88,7 +88,7 @@ TEST_CASE("storage - data persistence, namespace", "[storage][namespace]") { Database storage{"."}; CHECK(storage.get_owner_count() == 1); - CHECK(storage.get_message_count(Database::GetMessageCount::All) == 1); + CHECK(storage.get_message_count() == 1); auto [items, more] = storage.retrieve(pubkey, ns, ""); @@ -131,7 +131,7 @@ TEST_CASE("storage - re-storing existing hash", "[storage]") { CHECK(ins == StoreResult::Exists); CHECK(storage.get_owner_count() == 1); - CHECK(storage.get_message_count(Database::GetMessageCount::All) == 1); + CHECK(storage.get_message_count() == 1); } TEST_CASE("storage - only return entries for specified pubkey", "[storage]") { @@ -152,7 +152,7 @@ TEST_CASE("storage - only return entries for specified pubkey", "[storage]") { StoreResult::New); CHECK(storage.get_owner_count() == 2); - CHECK(storage.get_message_count(Database::GetMessageCount::All) == 2); + CHECK(storage.get_message_count() == 2); const auto lastHash = ""; { @@ -184,7 +184,7 @@ TEST_CASE("storage - return entries older than lasthash", "[storage]") { } CHECK(storage.get_owner_count() == 1); - CHECK(storage.get_message_count(Database::GetMessageCount::All) == 100); + CHECK(storage.get_message_count() == 100); { const auto lastHash = "hash0"; @@ -228,7 +228,7 @@ TEST_CASE("storage - remove expired entries", "[storage]") { StoreResult::New); CHECK(storage.get_owner_count() == 3); - CHECK(storage.get_message_count(Database::GetMessageCount::All) == 6); + CHECK(storage.get_message_count() == 6); { const auto lastHash = ""; @@ -245,7 +245,7 @@ TEST_CASE("storage - remove expired entries", "[storage]") { } CHECK(storage.get_owner_count() == 2); - CHECK(storage.get_message_count(Database::GetMessageCount::All) == 2); + CHECK(storage.get_message_count() 
== 2); } TEST_CASE("storage - bulk data storage", "[storage]") { @@ -284,7 +284,7 @@ TEST_CASE("storage - bulk data storage", "[storage]") { } CHECK(storage.get_owner_count() == 1); - CHECK(storage.get_message_count(Database::GetMessageCount::All) == num_items); + CHECK(storage.get_message_count() == num_items); } TEST_CASE("storage - bulk storage with overlap", "[storage]") { @@ -307,7 +307,7 @@ TEST_CASE("storage - bulk storage with overlap", "[storage]") { StoreResult::New); CHECK(storage.get_owner_count() == 1); - CHECK(storage.get_message_count(Database::GetMessageCount::All) == 2); + CHECK(storage.get_message_count() == 2); // bulk store { @@ -326,7 +326,7 @@ TEST_CASE("storage - bulk storage with overlap", "[storage]") { } CHECK(storage.get_owner_count() == 1); - CHECK(storage.get_message_count(Database::GetMessageCount::All) == num_items); + CHECK(storage.get_message_count() == num_items); // retrieve { @@ -359,7 +359,7 @@ TEST_CASE("storage - retrieve limit", "[storage]") { } CHECK(storage.get_owner_count() == 2); - CHECK(storage.get_message_count(Database::GetMessageCount::All) == num_entries + 5); + CHECK(storage.get_message_count() == num_entries + 5); CHECK(storage.retrieve(pubkey, namespace_id::Default, "").first.size() == num_entries); CHECK(storage.retrieve(pubkey, namespace_id::Default, "", 10).first.size() == 10); diff --git a/unit_test/swarm.cpp b/unit_test/swarm.cpp index 0521a084a..909d501ad 100644 --- a/unit_test/swarm.cpp +++ b/unit_test/swarm.cpp @@ -19,35 +19,41 @@ using oxenss::snode::Swarm; TEST_CASE("swarm - pubkey to swarm space", "[swarm]") { oxenss::user_pubkey pk; REQUIRE(pk.load("053506f4a71324b7dd114eddbf4e311f39dde243e1f2cb97c40db1961f70ebaae8")); - CHECK(Network::pubkey_to_swarm_space(pk) == 17589930838143112648ULL); + CHECK(oxenss::pubkey_to_swarm_space(pk) == 17589930838143112648ULL); REQUIRE(pk.load("05cf27da303a50ac8c4b2d43d27259505c9bcd73fc21cf2a57902c3d050730b604")); - CHECK(Network::pubkey_to_swarm_space(pk) == 
10370619079776428163ULL); + CHECK(oxenss::pubkey_to_swarm_space(pk) == 10370619079776428163ULL); REQUIRE(pk.load("03d3511706b8b34f6e8411bf07bd22ba6b2435ca56846fbccf6eb1e166a6cd15cc")); - CHECK(Network::pubkey_to_swarm_space(pk) == 2144983569669512198ULL); + CHECK(oxenss::pubkey_to_swarm_space(pk) == 2144983569669512198ULL); REQUIRE(pk.load("ff0f06693428fca9102a451e3f28d9cc743d8ea60a89ab6aa69eb119470c11cbd3")); - CHECK(Network::pubkey_to_swarm_space(pk) == 9690840703409570833ULL); + CHECK(oxenss::pubkey_to_swarm_space(pk) == 9690840703409570833ULL); REQUIRE(pk.load("05ffba630924aa1224bb930dde21c0d11bf004608f2812217f8ac812d6c7e3ad48")); - CHECK(Network::pubkey_to_swarm_space(pk) == 4532060000165252872ULL); + CHECK(oxenss::pubkey_to_swarm_space(pk) == 4532060000165252872ULL); REQUIRE(pk.load("05eeeeeeeeeeeeeeee777777777777777711111111111111118888888888888888")); - CHECK(Network::pubkey_to_swarm_space(pk) == 0); + CHECK(oxenss::pubkey_to_swarm_space(pk) == 0); REQUIRE(pk.load("050123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef")); - CHECK(Network::pubkey_to_swarm_space(pk) == 0); + CHECK(oxenss::pubkey_to_swarm_space(pk) == 0); REQUIRE(pk.load("05fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffe")); - CHECK(Network::pubkey_to_swarm_space(pk) == 1); + CHECK(oxenss::pubkey_to_swarm_space(pk) == 1); REQUIRE(pk.load("05ffffffffffffffffffffffffffffffffffffffffffffffff7fffffffffffffff")); - CHECK(Network::pubkey_to_swarm_space(pk) == 1ULL << 63); + CHECK(oxenss::pubkey_to_swarm_space(pk) == 1ULL << 63); REQUIRE(pk.load("05000000000000000000000000000000000000000000000000ffffffffffffffff")); - CHECK(Network::pubkey_to_swarm_space(pk) == (uint64_t)-1); + CHECK(oxenss::pubkey_to_swarm_space(pk) == (uint64_t)-1); REQUIRE(pk.load("050000000000000000000000000000000000000000000000000123456789abcdef")); - CHECK(Network::pubkey_to_swarm_space(pk) == 0x0123456789abcdefULL); + CHECK(oxenss::pubkey_to_swarm_space(pk) == 0x0123456789abcdefULL); } +struct 
StorageDeleter { + StorageDeleter() { std::filesystem::remove("storage.db"); } + ~StorageDeleter() { std::filesystem::remove("storage.db"); } +}; + TEST_CASE("service nodes - pubkey to swarm id") { const auto fake_pk = oxenss::crypto::legacy_pubkey::from_hex( "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef"); oxenmq::OxenMQ omq; Network network{omq}; - Swarm swarm{network, fake_pk}; + oxenss::Database db{"."}; // unused here, but required by Swarm + Swarm swarm{network, fake_pk, db}; using oxenss::snode::swarms_t; swarms_t swarms; From bb52e7e7127c65b76fea5e77afa8fb0eabe4d431 Mon Sep 17 00:00:00 2001 From: Thomas Winget Date: Thu, 5 Mar 2026 17:12:15 -0500 Subject: [PATCH 43/50] unused parameter --- oxenss/rpc/request_handler.cpp | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/oxenss/rpc/request_handler.cpp b/oxenss/rpc/request_handler.cpp index 877cc5c4c..9a6fb6b5f 100644 --- a/oxenss/rpc/request_handler.cpp +++ b/oxenss/rpc/request_handler.cpp @@ -413,7 +413,7 @@ struct swarm_response { // swarm entries returned things with "failed" in them or in the case of a non-recursive request, // the top-level object has a "failed" in it then we send back an INTERNAL_SERVER_ERROR // along with the response. 
-static void reply_or_fail(snode::ServiceNode& sn, const std::shared_ptr& res) { +static void reply_or_fail(const std::shared_ptr& res) { auto res_code = http::INTERNAL_SERVER_ERROR; if (auto swarm_obj = res->result.find("swarm"); swarm_obj != res->result.end()) { for (const auto& [sn_pkey, obj] : swarm_obj->items()) { @@ -529,7 +529,7 @@ static void distribute_command(snode::ServiceNode& sn, std::shared_ptrresult["swarm"][peer_ed.hex()] = std::move(peer_result); if (send_reply) - reply_or_fail(sn, res); + reply_or_fail(res); }, res->cmd, res->req_payload, @@ -681,7 +681,7 @@ void RequestHandler::process_client_req(rpc::store&& req, std::functionpending == 0) - reply_or_fail(service_node_, std::move(res)); + reply_or_fail(std::move(res)); } void RequestHandler::process_client_req( @@ -954,7 +954,7 @@ void RequestHandler::process_client_req( add_misc_response_fields(res->result, service_node_, now); if (--res->pending == 0) - reply_or_fail(service_node_, std::move(res)); + reply_or_fail(std::move(res)); } void RequestHandler::process_client_req(rpc::delete_msgs&& req, std::function cb) { @@ -1016,7 +1016,7 @@ void RequestHandler::process_client_req(rpc::delete_msgs&& req, std::functionresult, service_node_); if (--res->pending == 0) - reply_or_fail(service_node_, std::move(res)); + reply_or_fail(std::move(res)); } void RequestHandler::process_client_req( @@ -1067,7 +1067,7 @@ void RequestHandler::process_client_req( add_misc_response_fields(res->result, service_node_); if (--res->pending == 0) - reply_or_fail(service_node_, std::move(res)); + reply_or_fail(std::move(res)); } void RequestHandler::process_client_req( @@ -1121,7 +1121,7 @@ void RequestHandler::process_client_req( add_misc_response_fields(res->result, service_node_); if (--res->pending == 0) - reply_or_fail(service_node_, std::move(res)); + reply_or_fail(std::move(res)); } void RequestHandler::process_client_req( @@ -1248,7 +1248,7 @@ void RequestHandler::process_client_req( 
add_misc_response_fields(res->result, service_node_, now); if (--res->pending == 0) - reply_or_fail(service_node_, std::move(res)); + reply_or_fail(std::move(res)); } void RequestHandler::process_client_req(rpc::expire_all&& req, std::function cb) { @@ -1314,7 +1314,7 @@ void RequestHandler::process_client_req(rpc::expire_all&& req, std::functionresult, service_node_, now); if (--res->pending == 0) - reply_or_fail(service_node_, std::move(res)); + reply_or_fail(std::move(res)); } void RequestHandler::process_client_req(rpc::expire_msgs&& req, std::function cb) { @@ -1455,7 +1455,7 @@ void RequestHandler::process_client_req(rpc::expire_msgs&& req, std::functionresult, service_node_, now); if (--res->pending == 0) - reply_or_fail(service_node_, std::move(res)); + reply_or_fail(std::move(res)); } void RequestHandler::process_client_req(rpc::get_expiries&& req, std::function cb) { From d015af360a77fed74b169eca236dd65cceca90cc Mon Sep 17 00:00:00 2001 From: Thomas Winget Date: Wed, 25 Mar 2026 20:20:44 -0400 Subject: [PATCH 44/50] new tests and fixes found from them --- oxenss/rpc/request_handler.cpp | 2 +- oxenss/snode/network.cpp | 22 +++++------- oxenss/snode/network.h | 7 ++-- oxenss/snode/service_node.cpp | 2 ++ oxenss/storage/database.cpp | 28 ++++++++++++--- oxenss/storage/database.hpp | 8 +++++ oxenss/utils/time.hpp | 4 +++ unit_test/storage.cpp | 65 +++++++++++++++++++++++++++++++++- unit_test/swarm.cpp | 33 +++++++++++++++++ 9 files changed, 149 insertions(+), 22 deletions(-) diff --git a/oxenss/rpc/request_handler.cpp b/oxenss/rpc/request_handler.cpp index 9a6fb6b5f..a83ad5ffa 100644 --- a/oxenss/rpc/request_handler.cpp +++ b/oxenss/rpc/request_handler.cpp @@ -403,9 +403,9 @@ struct swarm_response { bool b64; nlohmann::json result; std::function cb; - std::vector retry_nodes; std::string cmd; std::string req_payload; + std::chrono::system_clock::time_point expiry; int64_t db_req_id{0}; }; diff --git a/oxenss/snode/network.cpp b/oxenss/snode/network.cpp index 
40e287322..11ad3b2d0 100644 --- a/oxenss/snode/network.cpp +++ b/oxenss/snode/network.cpp @@ -37,19 +37,11 @@ std::pair Network::get_swarm_boundaries(const uint64_t swarm // in the event of a distance tie in swarm space (e.g. id 1 and 7 with swarm space 4), // the "right" (next) swarm loses. This means when querying with what we return here, // we should do x > lower_bound AND x <= upper_bound - - // if there are only 2 swarms somehow, return the average and the average + 1<<63, - // with the average as the lower bound if target is the larger swarm id - if (prev_swarm == next_swarm) { - uint64_t avg = (swarm + prev_swarm) / 2; - uint64_t shift = (uint64_t)1 << 63; - if (swarm > prev_swarm) - return {avg, avg + shift}; - else - return {avg + shift, avg}; - } - - return {(swarm + prev_swarm) / 2, (swarm + next_swarm) / 2}; + auto left_diff = swarm - prev_swarm; + if (left_diff % 2) + left_diff += 1; // round the average up on the left side + auto right_diff = next_swarm - swarm; + return {swarm - (left_diff / 2), swarm + (right_diff / 2)}; } swarms_t::const_iterator Network::_find_swarm_for(const user_pubkey& pk) const { @@ -59,6 +51,10 @@ swarms_t::const_iterator Network::_find_swarm_for(const user_pubkey& pk) const { return swarms_.begin(); const uint64_t swarm_pos = pubkey_to_swarm_space(pk); + return _find_swarm_for_swarm_space(swarm_pos); +} + +swarms_t::const_iterator Network::_find_swarm_for_swarm_space(const swarm_id_t swarm_pos) const { // Find the right boundary, i.e. first swarm with swarm_id >= res auto right_it = swarms_.lower_bound(swarm_pos); diff --git a/oxenss/snode/network.h b/oxenss/snode/network.h index 1bd7ae67b..58dcc5972 100644 --- a/oxenss/snode/network.h +++ b/oxenss/snode/network.h @@ -39,9 +39,6 @@ class Network { friend class ServiceNode; - std::pair get_swarm_boundaries(const uint64_t swarm) const; - swarms_t::const_iterator _find_swarm_for(const user_pubkey& pk) const; - // Cached value of the all_nodes_blob() return value. 
The cache is cleared whenever swarms or // any contact info changes. mutable std::shared_ptr> all_nodes_blob_; @@ -56,6 +53,10 @@ class Network { swarms_t&& new_swarms, const std::map& new_contacts); public: + std::pair get_swarm_boundaries(const uint64_t swarm) const; + swarms_t::const_iterator _find_swarm_for(const user_pubkey& pk) const; + swarms_t::const_iterator _find_swarm_for_swarm_space(const swarm_id_t swarm_pos) const; + /// Constructs a Network object. The omq instance will be passed to `contacts` so that any /// x25519 pubkey list changes are automatically propagated to oxenmq for SN authentication. Network(oxenmq::OxenMQ& omq); diff --git a/oxenss/snode/service_node.cpp b/oxenss/snode/service_node.cpp index 510fae7e4..b1377e93d 100644 --- a/oxenss/snode/service_node.cpp +++ b/oxenss/snode/service_node.cpp @@ -1305,6 +1305,8 @@ void ServiceNode::process_push_batch(std::string_view blob, std::string_view sen } void ServiceNode::check_retry_requests() { + db->remove_expired_retry_requests(); + db->foreach_ready_retry_request([this](const crypto::legacy_pubkey& key, const std::string& cmd, const std::string& payload, diff --git a/oxenss/storage/database.cpp b/oxenss/storage/database.cpp index 852a2301b..ef9b2fb65 100644 --- a/oxenss/storage/database.cpp +++ b/oxenss/storage/database.cpp @@ -413,11 +413,13 @@ CREATE TABLE retry_requests ( payload BLOB NOT NULL, created DOUBLE PRECISION NOT NULL DEFAULT (unixepoch('now', 'subsec')) ); + CREATE TABLE retry_pubkeys ( id INTEGER PRIMARY KEY, pubkey BLOB NOT NULL, UNIQUE(pubkey) ); + CREATE TABLE retry_node_requests ( id INTEGER PRIMARY KEY, rr_id INTEGER NOT NULL REFERENCES retry_requests(id) ON DELETE CASCADE, @@ -425,20 +427,24 @@ CREATE TABLE retry_node_requests ( next_retry DOUBLE PRECISION NOT NULL, UNIQUE(rr_id, pk_id) ); + CREATE INDEX retry_node_requests_pk_idx ON retry_node_requests(pk_id); CREATE VIEW retry_node_reqs AS - SELECT retry_node_requests.id, retry_requests.command, retry_reqeusts.payload, 
retry_pubkeys.pubkey, next_retry - FROM retry_node_requests JOIN retry_requests ON rr_id = retry_requests.id JOIN retry_pubkeys ON pk_id = retry_pubkeys.id; + SELECT retry_node_requests.id AS rr_id, retry_requests.command, retry_requests.payload, + retry_pubkeys.pubkey AS pubkey, next_retry + FROM retry_node_requests + JOIN retry_requests ON retry_node_requests.rr_id = retry_requests.id + JOIN retry_pubkeys ON pk_id = retry_pubkeys.id; CREATE TRIGGER retry_node_add INSTEAD OF INSERT ON retry_node_reqs BEGIN -- Allows insertion into the view (with the raw pubkey value) to automatically do the pubkey -- lookup (with autovivification) for you. - INSERT INTO retry_pubkeys (pubkey) VALUES (NEW.pubkey) ON CONFLICT(pubkey) DO NOTHING; + INSERT OR IGNORE INTO retry_pubkeys (pubkey) VALUES (NEW.pubkey); INSERT INTO retry_node_requests (rr_id, pk_id, next_retry) - VALUES (NEW.rr_id, (SELECT id FROM retry_pubkeys WHERE pubkey = NEW.pubkey), NEW.next_retry); + VALUES (NEW.rr_id, (SELECT id FROM retry_pubkeys WHERE retry_pubkeys.pubkey = NEW.pubkey), NEW.next_retry); END; CREATE TRIGGER rr_cleanup @@ -1383,6 +1389,11 @@ void Database::foreach_ready_retry_request(std::function< } } +int64_t Database::retry_request_count() { + auto impl = get_impl(/*write =*/false); + return impl->prepared_get("SELECT COUNT(*) from retry_node_reqs"); +} + void Database::foreach_swarm_message( std::function&)> callback, uint64_t lower_bound, @@ -1457,6 +1468,15 @@ void Database::remove_node_retry_request(int64_t req_id) { impl->prepared_exec("DELETE FROM retry_node_reqs WHERE id = ?", req_id); } +void Database::remove_expired_retry_requests(std::chrono::system_clock::time_point now) { + auto impl = get_impl(/*write =*/true); + + // FIXME: retry requests don't have an expiry, so we need to pick a good expiration time + // for these retries. For now, using 4 hours ago. Tests will pass 4 hours from + // now. 
+ impl->prepared_exec("DELETE FROM retry_requests WHERE created < ?", to_epoch_double(now - 4h)); +} + void Database::update_current_swarm(uint64_t swarm_id) { auto as_hex = oxenc::bt_serialize(swarm_id); auto impl = get_impl(/*write =*/true); diff --git a/oxenss/storage/database.hpp b/oxenss/storage/database.hpp index 2e352d205..a4520d44b 100644 --- a/oxenss/storage/database.hpp +++ b/oxenss/storage/database.hpp @@ -244,6 +244,10 @@ class Database { const std::string& payload, int64_t req_id)>); + // This is just for the test suite, as using "ready retry requests" as above would require it + // to take several seconds longer to execute, per call. + int64_t retry_request_count(); + // executes the provided callback for every swarm message (in batches) for the swarm with the // given swarm space boundaries. The lower bound is exclusive; the upper inclusive. // if the lower bound is higher than the upper bound (i.e. overflow wrapping), will be called @@ -259,6 +263,10 @@ class Database { // itself -- if no more nodes need the request retried it will be removed as well. void remove_node_retry_request(int64_t req_id); + // the `now` argument here only exists for the test suite; do not use it. 
+ void remove_expired_retry_requests( + std::chrono::system_clock::time_point now = std::chrono::system_clock::now()); + void update_current_swarm(uint64_t swarm_id); std::optional get_current_swarm(); diff --git a/oxenss/utils/time.hpp b/oxenss/utils/time.hpp index 59c40b6dc..8a096a8b0 100644 --- a/oxenss/utils/time.hpp +++ b/oxenss/utils/time.hpp @@ -9,6 +9,10 @@ inline int64_t to_epoch_ms(std::chrono::system_clock::time_point t) { return std::chrono::duration_cast<std::chrono::milliseconds>(t.time_since_epoch()).count(); } +inline double to_epoch_double(std::chrono::system_clock::time_point t) { + return std::chrono::duration<double>{t.time_since_epoch()}.count(); +} + inline std::chrono::system_clock::time_point from_epoch_ms(int64_t t) { return std::chrono::system_clock::time_point{std::chrono::milliseconds{t}}; } diff --git a/unit_test/storage.cpp b/unit_test/storage.cpp index 96460481b..3ca4b1edf 100644 --- a/unit_test/storage.cpp +++ b/unit_test/storage.cpp @@ -1,3 +1,4 @@ +#include #include #include @@ -10,14 +11,19 @@ #include #include +#include "oxenss/utils/time.hpp" using namespace oxenss; using namespace std::literals; struct StorageDeleter { + bool delete_it = true; StorageDeleter() { std::filesystem::remove("storage.db"); } - ~StorageDeleter() { std::filesystem::remove("storage.db"); } + ~StorageDeleter() { + if (delete_it) + std::filesystem::remove("storage.db"); + } }; TEST_CASE("storage - database file creation", "[storage]") { @@ -427,3 +433,60 @@ TEST_CASE("storage - connection pool", "[storage][pool]") { // returned to the pool: CHECK(oxenss::TestSuiteHacks::db_pool_size(storage) == 1 + n_blocked_threads); } + +TEST_CASE("storage - current swarm", "[storage]") { + StorageDeleter fixture; + + Database storage{"."}; + + // new db has no current swarm + CHECK(storage.get_current_swarm() == std::nullopt); + + storage.update_current_swarm(12345); + + CHECK(*(storage.get_current_swarm()) == 12345); +} + +TEST_CASE("storage - retry requests", "[storage]") { + StorageDeleter fixture; +
fixture.delete_it = false; + oxenss::crypto::legacy_pubkey pubkey, pubkey2; + pubkey.load_from_hex("0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef"); + pubkey2.load_from_hex("0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcde0"); + + Database storage{"."}; + + // first row id = 1 + CHECK(storage.add_retry_request(pubkey, "foo", "bar") == 1); + + // unique on retry id and pubkey + CHECK_THROWS(storage.add_retry_request(pubkey, "foo", "bar", 1)); + // this must throw too (presumably because retry id 2 does not exist yet); it should never be done. + CHECK_THROWS(storage.add_retry_request(pubkey, "foo", "bar", 2)); + + auto req_count = storage.retry_request_count(); + CHECK(req_count == 1); + + storage.add_retry_request(pubkey2, "foo", "bar"); + req_count = storage.retry_request_count(); + CHECK(req_count == 2); + + storage.add_retry_request(pubkey, "bits", "bits"); + req_count = storage.retry_request_count(); + CHECK(req_count == 3); + + std::this_thread::sleep_for(500ms); + // FIXME: "expiry" is currently 4h, this is incredibly arbitrary and should be considered + // further.
+ auto the_future = std::chrono::system_clock::now() + 4h; + + std::this_thread::sleep_for( + 500ms); // the following insert should *not* be considered "expired" + CHECK_NOTHROW(storage.add_retry_request(pubkey, "fools", "barred") == 4); + req_count = storage.retry_request_count(); + CHECK(req_count == 4); + // remove expired, pretending it's 4h (minus the sleep) from now + CHECK_NOTHROW(storage.remove_expired_retry_requests(the_future)); + req_count = storage.retry_request_count(); + CHECK(req_count == 1); +} diff --git a/unit_test/swarm.cpp b/unit_test/swarm.cpp index 909d501ad..6f5f747e5 100644 --- a/unit_test/swarm.cpp +++ b/unit_test/swarm.cpp @@ -172,3 +172,36 @@ TEST_CASE("service nodes - pubkey to swarm id") { REQUIRE(pk.load("05000000000000000000000000000000000000000000000000fffffffffffffffe")); CHECK(network.get_swarm_id_for(pk).value() == 0); } + +TEST_CASE("service nodes - swarm id to swarm space (pubkey range)") { + const auto fake_pk = oxenss::crypto::legacy_pubkey::from_hex( + "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef"); + oxenmq::OxenMQ omq; + Network network{omq}; + oxenss::Database db{"."}; // unused here, but required by Swarm + Swarm swarm{network, fake_pk, db}; + + using oxenss::snode::swarms_t; + swarms_t swarms; + for (oxenss::snode::swarm_id_t s : {100, 200, 300, 399, 498, 596, 694}) + swarms[s]; + swarm.update_swarms(0, swarms_t{swarms}, {}); + + oxenss::user_pubkey pk; + + auto boundaries = network.get_swarm_boundaries(200); + REQUIRE(boundaries.first == 150); + REQUIRE(boundaries.second == 250); // we lose this tie, but this boundary is exclusive + + boundaries = network.get_swarm_boundaries(300); + REQUIRE(boundaries.first == 250); + REQUIRE(boundaries.second == 349); + + boundaries = network.get_swarm_boundaries(399); + REQUIRE(boundaries.first == 349); + REQUIRE(boundaries.second == 448); + + boundaries = network.get_swarm_boundaries(100); + REQUIRE(boundaries.first == (0x18d + 0x8000000000000000)); + 
REQUIRE(boundaries.second == 150); +} From 118b0f1a695741925ae41e3ec8e7305a26374822 Mon Sep 17 00:00:00 2001 From: Thomas Winget Date: Wed, 25 Mar 2026 21:29:02 -0400 Subject: [PATCH 45/50] more thorough swarm id -> space testing --- unit_test/swarm.cpp | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/unit_test/swarm.cpp b/unit_test/swarm.cpp index 6f5f747e5..9ae604e15 100644 --- a/unit_test/swarm.cpp +++ b/unit_test/swarm.cpp @@ -173,6 +173,11 @@ TEST_CASE("service nodes - pubkey to swarm id") { CHECK(network.get_swarm_id_for(pk).value() == 0); } +// A round-trip test against "service nodes - pubkey to swarm id" is not needed here. +// Both get_swarm_boundaries and _find_swarm_for_swarm_space use consistent uint64_t modular +// arithmetic, and the wrapping range case (lo > hi, crossing UINT64_MAX) is already exercised +// by swarm 100's boundaries below. UINT64_MAX is a valid swarm space position but is assumed +// (and enforced elsewhere) to never be a swarm id, so no additional edge cases exist. 
TEST_CASE("service nodes - swarm id to swarm space (pubkey range)") { const auto fake_pk = oxenss::crypto::legacy_pubkey::from_hex( "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef"); @@ -191,7 +196,7 @@ TEST_CASE("service nodes - swarm id to swarm space (pubkey range)") { auto boundaries = network.get_swarm_boundaries(200); REQUIRE(boundaries.first == 150); - REQUIRE(boundaries.second == 250); // we lose this tie, but this boundary is exclusive + REQUIRE(boundaries.second == 250); // swarm 300 loses this tie; 250 is inclusive for swarm 200 boundaries = network.get_swarm_boundaries(300); REQUIRE(boundaries.first == 250); @@ -201,7 +206,24 @@ TEST_CASE("service nodes - swarm id to swarm space (pubkey range)") { REQUIRE(boundaries.first == 349); REQUIRE(boundaries.second == 448); - boundaries = network.get_swarm_boundaries(100); - REQUIRE(boundaries.first == (0x18d + 0x8000000000000000)); - REQUIRE(boundaries.second == 150); + boundaries = network.get_swarm_boundaries(498); + REQUIRE(boundaries.first == 448); // left_diff=99 (odd, rounded up to 100), so lo = 498 - 50 + REQUIRE(boundaries.second == 547); // right_diff=98, so hi = 498 + 49 + + boundaries = network.get_swarm_boundaries(596); + REQUIRE(boundaries.first == 547); + REQUIRE(boundaries.second == 645); + + auto boundaries_100 = network.get_swarm_boundaries(100); + REQUIRE(boundaries_100.first == (0x18d + 0x8000000000000000)); + REQUIRE(boundaries_100.second == 150); + + // 694 is the last element; its successor wraps around to 100. + // right_diff = 100 - 694 (uint64 wraparound) = 0xFFFFFFFFFFFFFDAE + // hi = 694 + 0x7FFFFFFFFFFFFED7 = 0x800000000000018D + // 694's upper bound and 100's lower bound must be the same value (shared boundary). 
+ auto boundaries_694 = network.get_swarm_boundaries(694); + REQUIRE(boundaries_694.first == 645); + REQUIRE(boundaries_694.second == (0x18d + 0x8000000000000000)); + REQUIRE(boundaries_694.second == boundaries_100.first); } From b65fd431624857505331b0a049f7a990463e88b9 Mon Sep 17 00:00:00 2001 From: Thomas Winget Date: Thu, 26 Mar 2026 00:09:59 -0400 Subject: [PATCH 46/50] reformat some values and comments --- unit_test/swarm.cpp | 107 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 98 insertions(+), 9 deletions(-) diff --git a/unit_test/swarm.cpp b/unit_test/swarm.cpp index 9ae604e15..abf2073d4 100644 --- a/unit_test/swarm.cpp +++ b/unit_test/swarm.cpp @@ -15,6 +15,7 @@ using ip_ports = std::tuple; using oxenss::snode::Network; using oxenss::snode::Swarm; +using oxenss::snode::INVALID_SWARM_ID; TEST_CASE("swarm - pubkey to swarm space", "[swarm]") { oxenss::user_pubkey pk; @@ -37,7 +38,7 @@ TEST_CASE("swarm - pubkey to swarm space", "[swarm]") { REQUIRE(pk.load("05ffffffffffffffffffffffffffffffffffffffffffffffff7fffffffffffffff")); CHECK(oxenss::pubkey_to_swarm_space(pk) == 1ULL << 63); REQUIRE(pk.load("05000000000000000000000000000000000000000000000000ffffffffffffffff")); - CHECK(oxenss::pubkey_to_swarm_space(pk) == (uint64_t)-1); + CHECK(oxenss::pubkey_to_swarm_space(pk) == INVALID_SWARM_ID); REQUIRE(pk.load("050000000000000000000000000000000000000000000000000123456789abcdef")); CHECK(oxenss::pubkey_to_swarm_space(pk) == 0x0123456789abcdefULL); } @@ -88,7 +89,7 @@ TEST_CASE("service nodes - pubkey to swarm id") { REQUIRE(pk.load("05000000000000000000000000000000000000000000000000a000000000000000")); CHECK(network.get_swarm_id_for(pk).value() == 100); - // This is the invalid swarm id for swarms, but should still work for a client + // A pubkey whose swarm space == INVALID_SWARM_ID is not a valid swarm id, but *is* a valid swarm space value REQUIRE(pk.load("05000000000000000000000000000000000000000000000000ffffffffffffffff")); 
CHECK(network.get_swarm_id_for(pk).value() == 100); @@ -161,10 +162,11 @@ TEST_CASE("service nodes - pubkey to swarm id") { REQUIRE(pk.load("050000000000000000000000000000000000000000000000000000000000000029")); CHECK(network.get_swarm_id_for(pk).value() == swarms.begin()->first); - // The code used to have a broken edge case if we have a swarm at zero and a client at max-u64 - // because of an overflow in how the distance is calculated (the first swarm will be calculated - // as max-u64 away, rather than 1 away), and so the id always maps to the highest swarm (even - // though 0xfff...fe maps to the lowest swarm; the first check here, then, would fail. + // The code used to have a broken edge case if we have a swarm at zero and a client at + // INVALID_SWARM_ID (UINT64_MAX) because of an overflow in how the distance is calculated (the + // first swarm will be calculated as UINT64_MAX away (i.e. -1), rather than 1 away), and so the id + // always maps to the highest swarm (even though 0xfff...fe maps to the lowest swarm); the first + // check here, then, would fail. swarms[0]; swarm.update_swarms(0, swarms_t{swarms}, {}); REQUIRE(pk.load("05000000000000000000000000000000000000000000000000ffffffffffffffff")); @@ -173,11 +175,98 @@ TEST_CASE("service nodes - pubkey to swarm id") { CHECK(network.get_swarm_id_for(pk).value() == 0); } +TEST_CASE("service nodes - swarm id to swarm space, boundaries near 0 and INVALID_SWARM_ID", "[swarm]") { + const auto fake_pk = oxenss::crypto::legacy_pubkey::from_hex( + "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef"); + oxenmq::OxenMQ omq; + Network network{omq}; + oxenss::Database db{"."}; + Swarm swarm{network, fake_pk, db}; + + using oxenss::snode::swarms_t; + using oxenss::snode::swarm_id_t; + + // INVALID_SWARM_ID (UINT64_MAX) cannot be a swarm id; INVALID_SWARM_ID - 1 can. 
+ const swarm_id_t near_max = INVALID_SWARM_ID - 1; + swarms_t swarms; + for (swarm_id_t s : {swarm_id_t{1}, swarm_id_t{100}, swarm_id_t{694}, near_max}) + swarms[s]; + swarm.update_swarms(0, swarms_t{swarms}, {}); + + // swarm 1 (prev=near_max, next=100): + // left_diff = 3 (odd, rounded up to 4); lo = 1 - 2 = INVALID_SWARM_ID (UINT64_MAX) + // right_diff = 99; hi = 50 + // lo == INVALID_SWARM_ID (UINT64_MAX), which is a valid swarm space position. + // Range wraps: (INVALID_SWARM_ID, 50] = [0, 50] since nothing exceeds INVALID_SWARM_ID. + auto b_1 = network.get_swarm_boundaries(1); + REQUIRE(b_1.first == INVALID_SWARM_ID); + REQUIRE(b_1.second == 50); + + // swarm 100 (prev=1, next=694): + // left_diff = 99 (odd, rounded up to 100); lo = 50; right_diff = 594; hi = 397 + auto b_100 = network.get_swarm_boundaries(100); + REQUIRE(b_100.first == 50); + REQUIRE(b_100.second == 397); + + // swarm 694 (prev=100, next=near_max): + // left_diff = 594 (even); lo = 397 + // right_diff = near_max - 694 (large, even); hi = 0x800000000000015A + auto b_694 = network.get_swarm_boundaries(694); + REQUIRE(b_694.first == 397); + REQUIRE(b_694.second == 0x800000000000015AULL); + + // swarm near_max (prev=694, next=1): + // left_diff = near_max - 694 (large, even); lo = 0x800000000000015A + // right_diff = 3; hi = near_max + 1 = INVALID_SWARM_ID (UINT64_MAX) + // hi lands exactly on INVALID_SWARM_ID (UINT64_MAX): a valid swarm space position even + // though it cannot be a swarm id. near_max owns INVALID_SWARM_ID as a swarm space position. 
+ auto b_near_max = network.get_swarm_boundaries(near_max); + REQUIRE(b_near_max.first == 0x800000000000015AULL); + REQUIRE(b_near_max.second == INVALID_SWARM_ID); + + // Shared boundaries + REQUIRE(b_694.second == b_near_max.first); // 0x800000000000015A + REQUIRE(b_near_max.second == b_1.first); // INVALID_SWARM_ID (UINT64_MAX): near_max's hi == swarm 1's lo +} + +TEST_CASE("service nodes - swarm id to swarm space, minimal 0 and INVALID_SWARM_ID - 1", "[swarm]") { + const auto fake_pk = oxenss::crypto::legacy_pubkey::from_hex( + "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef"); + oxenmq::OxenMQ omq; + Network network{omq}; + oxenss::Database db{"."}; + Swarm swarm{network, fake_pk, db}; + + using oxenss::snode::swarms_t; + using oxenss::snode::swarm_id_t; + + swarms_t swarms; + for (swarm_id_t s : {swarm_id_t{0}, INVALID_SWARM_ID - 1}) + swarms[s]; + swarm.update_swarms(0, swarms_t{swarms}, {}); + + // Two swarms split the space exactly in half. + // swarm 0: left_diff=2 (even); lo = 0 - 1 = INVALID_SWARM_ID (UINT64_MAX); hi = 0x7FFFFFFFFFFFFFFF + // swarm INVALID_SWARM_ID-1: lo = 0x7FFFFFFFFFFFFFFF; hi = INVALID_SWARM_ID - 1 + 1 = INVALID_SWARM_ID + // INVALID_SWARM_ID is a valid swarm space position, owned here by swarm INVALID_SWARM_ID - 1. + auto b_0 = network.get_swarm_boundaries(0); + REQUIRE(b_0.first == INVALID_SWARM_ID); + REQUIRE(b_0.second == 0x7FFFFFFFFFFFFFFFULL); + + auto b_max = network.get_swarm_boundaries(INVALID_SWARM_ID - 1); + REQUIRE(b_max.first == 0x7FFFFFFFFFFFFFFFULL); + REQUIRE(b_max.second == INVALID_SWARM_ID); + + REQUIRE(b_0.second == b_max.first); // shared midpoint + REQUIRE(b_max.second == b_0.first); // shared boundary at INVALID_SWARM_ID (UINT64_MAX) +} + // A round-trip test against "service nodes - pubkey to swarm id" is not needed here. 
// Both get_swarm_boundaries and _find_swarm_for_swarm_space use consistent uint64_t modular -// arithmetic, and the wrapping range case (lo > hi, crossing UINT64_MAX) is already exercised -// by swarm 100's boundaries below. UINT64_MAX is a valid swarm space position but is assumed -// (and enforced elsewhere) to never be a swarm id, so no additional edge cases exist. +// arithmetic, and the wrapping range case (lo > hi, crossing INVALID_SWARM_ID (UINT64_MAX)) is +// already exercised by swarm 100's boundaries below. INVALID_SWARM_ID is a valid swarm space +// position but is assumed (and enforced elsewhere) to never be a swarm id, so no additional edge +// cases exist. TEST_CASE("service nodes - swarm id to swarm space (pubkey range)") { const auto fake_pk = oxenss::crypto::legacy_pubkey::from_hex( "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef"); From ba54fa0312e685adba6055146fe3cce1e16b757f Mon Sep 17 00:00:00 2001 From: Thomas Winget Date: Thu, 26 Mar 2026 23:52:28 -0400 Subject: [PATCH 47/50] Replace startup_version with had_swarm_state_on_open Detect schema migration state via table presence (state_kv) rather than PRAGMA user_version, and replace the numeric _startup_version member with a bool _had_swarm_state_on_open that captures the actual semantic: whether swarm state was already persisted when the database was opened. --- oxenss/snode/swarm.cpp | 4 ++-- oxenss/snode/swarm.h | 2 +- oxenss/storage/database.cpp | 6 ++---- oxenss/storage/database.hpp | 9 ++++++--- 4 files changed, 11 insertions(+), 10 deletions(-) diff --git a/oxenss/snode/swarm.cpp b/oxenss/snode/swarm.cpp index 0806f8220..ac4e8258a 100644 --- a/oxenss/snode/swarm.cpp +++ b/oxenss/snode/swarm.cpp @@ -180,13 +180,13 @@ SwarmEvents Swarm::update_swarms( // from them to ensure we have all the messages they have that we don't. 
for (auto it : events.new_swarm_members) { auto& pair = members_[it]; - if (!did_startup_version_check && _db.startup_version() == 1) { + if (!did_swarm_space_check && _db.had_swarm_state_on_open()) { if (pair.our_ss_requested_db_dump == SwarmRequestedDBDump::Nil) pair.our_ss_requested_db_dump = SwarmRequestedDBDump::NeedsToRequest; } } - did_startup_version_check = true; + did_swarm_space_check = true; // If the DB was empty on startup then we mark all swarm members as peers that we need to // request a DB dump from. Note we only do this if the swarm matches the initial swarm we diff --git a/oxenss/snode/swarm.h b/oxenss/snode/swarm.h index 65598a60a..0876ccb8b 100644 --- a/oxenss/snode/swarm.h +++ b/oxenss/snode/swarm.h @@ -94,7 +94,7 @@ class Swarm { Database& _db; - bool did_startup_version_check = false; + bool did_swarm_space_check = false; public: Swarm(Network& network, const crypto::legacy_pubkey& our_pk, Database& db) : diff --git a/oxenss/storage/database.cpp b/oxenss/storage/database.cpp index ef9b2fb65..d3c2716d1 100644 --- a/oxenss/storage/database.cpp +++ b/oxenss/storage/database.cpp @@ -320,7 +320,7 @@ class DatabaseImpl { } void initialize_database() { - parent._startup_version = db.execAndGet("PRAGMA user_version").getInt(); + parent._had_swarm_state_on_open = db.tableExists("state_kv"); if (!db.tableExists("owners")) { create_schema(); @@ -371,8 +371,7 @@ CREATE TRIGGER IF NOT EXISTS revoked_autoclean )"); } - // use version for schema changes from now - if (parent._startup_version == 0) { + if (!parent._had_swarm_state_on_open) { log::info( logcat, "Upgrading database schema: adding swarm space cache, runtime state, " @@ -481,7 +480,6 @@ CREATE UNIQUE INDEX message_outbox_singleton ON messages(owner, namespace) WHERE namespace < 0 AND namespace % 20 = -1; -PRAGMA user_version = 1; )"); } diff --git a/oxenss/storage/database.hpp b/oxenss/storage/database.hpp index a4520d44b..324e9af0f 100644 --- a/oxenss/storage/database.hpp +++ 
b/oxenss/storage/database.hpp @@ -58,8 +58,11 @@ class Database { // keep track of db full errors so we don't print them on every store std::atomic db_full_counter = 0; - // database version at startup (before any migration/upgrade) - int _startup_version = 0; + // True if swarm state was already persisted in the database when it was opened. + // On the first swarm update after startup, this prevents spurious DB dump requests + // to peers who only appear as new members because swarm state was not persisted + // in pre-migration databases. + bool _had_swarm_state_on_open = false; public: // Recommended period for calling clean_expired() @@ -73,7 +76,7 @@ class Database { ~Database(); - int startup_version() const { return _startup_version; } + bool had_swarm_state_on_open() const { return _had_swarm_state_on_open; } // if the database is full then print an error only once ever N errors static constexpr int DB_FULL_FREQUENCY = 100; From f048263762082cb7613c004ecab15611fc8e39a8 Mon Sep 17 00:00:00 2001 From: Thomas Winget Date: Fri, 27 Mar 2026 00:15:26 -0400 Subject: [PATCH 48/50] Add migration test binary and historical database fixture migrate_test opens a database at a given path and runs the Database constructor, triggering any pending migrations. create_old_db.py creates a historical database schema with representative test data for manually verifying migrations; currently supports migrating from v2.11.3. --- unit_test/CMakeLists.txt | 4 ++ unit_test/create_old_db.py | 137 +++++++++++++++++++++++++++++++++++++ unit_test/migrate_test.cpp | 22 ++++++ 3 files changed, 163 insertions(+) create mode 100644 unit_test/create_old_db.py create mode 100644 unit_test/migrate_test.cpp diff --git a/unit_test/CMakeLists.txt b/unit_test/CMakeLists.txt index 79eb1db82..6fc1ad8a4 100644 --- a/unit_test/CMakeLists.txt +++ b/unit_test/CMakeLists.txt @@ -19,3 +19,7 @@ target_link_libraries(Test Catch2::Catch2) target_include_directories(Test PRIVATE ..) 
+ +add_executable(migrate_test migrate_test.cpp) +target_link_libraries(migrate_test PRIVATE common storage utils crypto snode rpc server) +target_include_directories(migrate_test PRIVATE ..) diff --git a/unit_test/create_old_db.py b/unit_test/create_old_db.py new file mode 100644 index 000000000..c2be547e5 --- /dev/null +++ b/unit_test/create_old_db.py @@ -0,0 +1,137 @@ +#!/usr/bin/env python3 +""" +Creates a historical test database for exercising schema migrations. +Run this first, then run migrate_test against the output path. + +Usage: python3 create_old_db.py [schema] [output_dir] + schema : which historical schema to create (default: pre-swarm-space) + output_dir : where to write storage.db (default: /tmp/test_migration_<schema>) + +Available schemas: + + pre-swarm-space (as of v2.11.3) + owners(id INTEGER PK, type INTEGER, pubkey BLOB, UNIQUE(pubkey, type)) + messages(id INTEGER PK, hash TEXT UNIQUE, owner→owners, namespace INTEGER, + timestamp INTEGER, expiry INTEGER, data BLOB) + revoked_subaccounts(owner→owners, token BLOB, timestamp INTEGER) + + post-swarm-space (current) + owners: added swarm_space_hi INTEGER, swarm_space_lo INTEGER + (upper/lower 32-bit halves of pubkey_to_swarm_space(); populated on migration + via custom SQLite functions func_swarm_space_hi/lo registered by C++ at open time) + new trigger: swarm_space_trigger auto-populates these on INSERT + new indices: owners_swarm_hi, owners_swarm_lo + messages: public outbox namespaces (namespace < 0 AND namespace % 20 = -1, i.e.
-1,-21,-41,…) + cleared entirely, then UNIQUE INDEX message_outbox_singleton added on + (owner, namespace) — enforces singleton behaviour going forward + new tables: retry_requests(id, command, payload, created) + retry_pubkeys(id, pubkey UNIQUE) + retry_node_requests(id, rr_id→retry_requests, pk_id→retry_pubkeys, + next_retry, UNIQUE(rr_id,pk_id)) + new view+triggers: retry_node_reqs (insert view), retry_node_add, rr_cleanup + new table: state_kv(key TEXT UNIQUE, value TEXT) — generic persistent key/value store +""" + +import sqlite3, os, sys, time + + +def create_pre_swarm_space(c): + c.executescript(""" +CREATE TABLE owners ( + id INTEGER PRIMARY KEY, + type INTEGER NOT NULL, + pubkey BLOB NOT NULL, + UNIQUE(pubkey, type) +); + +CREATE TABLE messages ( + id INTEGER PRIMARY KEY, + hash TEXT NOT NULL, + owner INTEGER NOT NULL REFERENCES owners(id), + namespace INTEGER NOT NULL DEFAULT 0, + timestamp INTEGER NOT NULL, + expiry INTEGER NOT NULL, + data BLOB NOT NULL, + UNIQUE(hash) +); + +CREATE TABLE revoked_subaccounts ( + owner INTEGER REFERENCES owners(id) ON DELETE CASCADE, + token BLOB NOT NULL, + timestamp INTEGER NOT NULL DEFAULT (CAST((julianday('now') - 2440587.5)*86400000 AS INTEGER)) +); +""") + + # Pubkeys: 32-byte blobs (type prefix stored separately in the type column). + # swarm_space = XOR of four big-endian uint64 chunks of the pubkey bytes. 
+ pk_100 = bytes(31) + bytes([0x64]) # swarm_space=100 (hi=0, lo=100) + pk_1 = bytes(31) + bytes([0x01]) # swarm_space=1 (hi=0, lo=1) + pk_0 = bytes(32) # swarm_space=0 (hi=0, lo=0) + pk_maxu64 = bytes(24) + bytes([0xff]*8) # swarm_space=UINT64_MAX + + now_ms = int(time.time() * 1000) + future_ms = now_ms + 86400_000 + + def ins_owner(pk, t=5): + c.execute("INSERT INTO owners (type, pubkey) VALUES (?, ?)", (t, pk)) + return c.lastrowid + + def ins_msg(owner_id, ns, h, d=b"data"): + c.execute("INSERT INTO messages (hash, owner, namespace, timestamp, expiry, data)" + " VALUES (?, ?, ?, ?, ?, ?)", (h, owner_id, ns, now_ms, future_ms, d)) + + o1 = ins_owner(pk_100) # swarm_space=100 + o2 = ins_owner(pk_1) # swarm_space=1 + o3 = ins_owner(pk_0) # swarm_space=0 + o4 = ins_owner(pk_maxu64) # swarm_space=UINT64_MAX + + # o1: regular namespaces only — all messages survive migration + ins_msg(o1, 0, "o1_ns0") + ins_msg(o1, 5, "o1_ns5") + + # o2: one public outbox message (ns=-1) — deleted by migration + ins_msg(o2, -1, "o2_ns-1") + + # o3: multiple public outbox messages + non-outbox negative ns + ins_msg(o3, -1, "o3_ns-1_a") # deleted (public outbox) + ins_msg(o3, -1, "o3_ns-1_b") # deleted (public outbox, same ns) + ins_msg(o3, -21, "o3_ns-21") # deleted (also public outbox: -21 % 20 = -1) + ins_msg(o3, -2, "o3_ns-2") # survives (-2 % 20 = -2, not public outbox) + + # o4: mix of outbox and non-outbox + ins_msg(o4, -1, "o4_ns-1_a") # deleted + ins_msg(o4, -1, "o4_ns-1_b") # deleted + ins_msg(o4, 10, "o4_ns10") # survives + + +SCHEMAS = { + 'pre-swarm-space': create_pre_swarm_space, +} + +schema = sys.argv[1] if len(sys.argv) > 1 else 'pre-swarm-space' +db_dir = sys.argv[2] if len(sys.argv) > 2 else f'/tmp/test_migration_{schema}' +db_path = os.path.join(db_dir, 'storage.db') + +if schema not in SCHEMAS: + print(f"Unknown schema '{schema}'. 
Available: {', '.join(SCHEMAS)}") + sys.exit(1) + +os.makedirs(db_dir, exist_ok=True) +if os.path.exists(db_path): + os.remove(db_path) + +conn = sqlite3.connect(db_path) +c = conn.cursor() + +SCHEMAS[schema](c) + +conn.commit() + +print(f"=== PRE-MIGRATION STATE ({schema}) ===") +print(f"owners columns : {[r[1] for r in c.execute('PRAGMA table_info(owners)')]}") +print(f"owners : {c.execute('SELECT id, type FROM owners').fetchall()}") +print(f"messages : {[(r[0], r[1]) for r in c.execute('SELECT namespace, hash FROM messages ORDER BY hash')]}") +print(f"\nDatabase written to: {db_path}") +print(f"Now run: ./migrate_test {db_dir}") + +conn.close() diff --git a/unit_test/migrate_test.cpp b/unit_test/migrate_test.cpp new file mode 100644 index 000000000..e6edf4ab6 --- /dev/null +++ b/unit_test/migrate_test.cpp @@ -0,0 +1,22 @@ +// Standalone migration test: open a pre-existing database, trigger migration, print results. +#include +#include +#include + +int main(int argc, char* argv[]) { + if (argc < 2) { + std::cerr << "Usage: migrate_test <db_dir>\n"; + return 1; + } + std::string path = argv[1]; + std::cout << "Opening: " << path << "\n"; + try { + oxenss::Database db{path}; + std::cout << "had_swarm_state_on_open: " << db.had_swarm_state_on_open() << "\n"; + std::cout << "Migration complete.\n"; + } catch (const std::exception& e) { + std::cerr << "Error: " << e.what() << "\n"; + return 1; + } + return 0; +} From 6ee099c51c9af787fd7626f12442a225fa0e73dc Mon Sep 17 00:00:00 2001 From: Thomas Winget Date: Fri, 27 Mar 2026 00:20:54 -0400 Subject: [PATCH 49/50] clang-format --- unit_test/swarm.cpp | 38 ++++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/unit_test/swarm.cpp b/unit_test/swarm.cpp index abf2073d4..55c809055 100644 --- a/unit_test/swarm.cpp +++ b/unit_test/swarm.cpp @@ -13,9 +13,9 @@ using namespace std::literals; using ip_ports = std::tuple; +using oxenss::snode::INVALID_SWARM_ID; using oxenss::snode::Network;
using oxenss::snode::Swarm; -using oxenss::snode::INVALID_SWARM_ID; TEST_CASE("swarm - pubkey to swarm space", "[swarm]") { oxenss::user_pubkey pk; @@ -89,7 +89,8 @@ TEST_CASE("service nodes - pubkey to swarm id") { REQUIRE(pk.load("05000000000000000000000000000000000000000000000000a000000000000000")); CHECK(network.get_swarm_id_for(pk).value() == 100); - // A pubkey whose swarm space == INVALID_SWARM_ID is not a valid swarm id, but *is* a valid swarm space value + // A pubkey whose swarm space == INVALID_SWARM_ID is not a valid swarm id, but *is* a valid + // swarm space value REQUIRE(pk.load("05000000000000000000000000000000000000000000000000ffffffffffffffff")); CHECK(network.get_swarm_id_for(pk).value() == 100); @@ -164,9 +165,9 @@ TEST_CASE("service nodes - pubkey to swarm id") { // The code used to have a broken edge case if we have a swarm at zero and a client at // INVALID_SWARM_ID (UINT64_MAX) because of an overflow in how the distance is calculated (the - // first swarm will be calculated as UINT64_MAX away (i.e. -1), rather than 1 away), and so the id - // always maps to the highest swarm (even though 0xfff...fe maps to the lowest swarm); the first - // check here, then, would fail. + // first swarm will be calculated as UINT64_MAX away (i.e. -1), rather than 1 away), and so the + // id always maps to the highest swarm (even though 0xfff...fe maps to the lowest swarm); the + // first check here, then, would fail. 
swarms[0]; swarm.update_swarms(0, swarms_t{swarms}, {}); REQUIRE(pk.load("05000000000000000000000000000000000000000000000000ffffffffffffffff")); @@ -175,7 +176,9 @@ TEST_CASE("service nodes - pubkey to swarm id") { CHECK(network.get_swarm_id_for(pk).value() == 0); } -TEST_CASE("service nodes - swarm id to swarm space, boundaries near 0 and INVALID_SWARM_ID", "[swarm]") { +TEST_CASE( + "service nodes - swarm id to swarm space, boundaries near 0 and INVALID_SWARM_ID", + "[swarm]") { const auto fake_pk = oxenss::crypto::legacy_pubkey::from_hex( "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef"); oxenmq::OxenMQ omq; @@ -183,8 +186,8 @@ TEST_CASE("service nodes - swarm id to swarm space, boundaries near 0 and INVALI oxenss::Database db{"."}; Swarm swarm{network, fake_pk, db}; - using oxenss::snode::swarms_t; using oxenss::snode::swarm_id_t; + using oxenss::snode::swarms_t; // INVALID_SWARM_ID (UINT64_MAX) cannot be a swarm id; INVALID_SWARM_ID - 1 can. const swarm_id_t near_max = INVALID_SWARM_ID - 1; @@ -225,11 +228,13 @@ TEST_CASE("service nodes - swarm id to swarm space, boundaries near 0 and INVALI REQUIRE(b_near_max.second == INVALID_SWARM_ID); // Shared boundaries - REQUIRE(b_694.second == b_near_max.first); // 0x800000000000015A - REQUIRE(b_near_max.second == b_1.first); // INVALID_SWARM_ID (UINT64_MAX): near_max's hi == swarm 1's lo + REQUIRE(b_694.second == b_near_max.first); // 0x800000000000015A + REQUIRE(b_near_max.second == + b_1.first); // INVALID_SWARM_ID (UINT64_MAX): near_max's hi == swarm 1's lo } -TEST_CASE("service nodes - swarm id to swarm space, minimal 0 and INVALID_SWARM_ID - 1", "[swarm]") { +TEST_CASE( + "service nodes - swarm id to swarm space, minimal 0 and INVALID_SWARM_ID - 1", "[swarm]") { const auto fake_pk = oxenss::crypto::legacy_pubkey::from_hex( "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef"); oxenmq::OxenMQ omq; @@ -237,8 +242,8 @@ TEST_CASE("service nodes - swarm id to swarm space, minimal 0 and 
INVALID_SWARM_ oxenss::Database db{"."}; Swarm swarm{network, fake_pk, db}; - using oxenss::snode::swarms_t; using oxenss::snode::swarm_id_t; + using oxenss::snode::swarms_t; swarms_t swarms; for (swarm_id_t s : {swarm_id_t{0}, INVALID_SWARM_ID - 1}) @@ -246,9 +251,10 @@ TEST_CASE("service nodes - swarm id to swarm space, minimal 0 and INVALID_SWARM_ swarm.update_swarms(0, swarms_t{swarms}, {}); // Two swarms split the space exactly in half. - // swarm 0: left_diff=2 (even); lo = 0 - 1 = INVALID_SWARM_ID (UINT64_MAX); hi = 0x7FFFFFFFFFFFFFFF - // swarm INVALID_SWARM_ID-1: lo = 0x7FFFFFFFFFFFFFFF; hi = INVALID_SWARM_ID - 1 + 1 = INVALID_SWARM_ID - // INVALID_SWARM_ID is a valid swarm space position, owned here by swarm INVALID_SWARM_ID - 1. + // swarm 0: left_diff=2 (even); lo = 0 - 1 = INVALID_SWARM_ID (UINT64_MAX); hi = + // 0x7FFFFFFFFFFFFFFF swarm INVALID_SWARM_ID-1: lo = 0x7FFFFFFFFFFFFFFF; hi = INVALID_SWARM_ID - + // 1 + 1 = INVALID_SWARM_ID INVALID_SWARM_ID is a valid swarm space position, owned here by + // swarm INVALID_SWARM_ID - 1. auto b_0 = network.get_swarm_boundaries(0); REQUIRE(b_0.first == INVALID_SWARM_ID); REQUIRE(b_0.second == 0x7FFFFFFFFFFFFFFFULL); @@ -257,8 +263,8 @@ TEST_CASE("service nodes - swarm id to swarm space, minimal 0 and INVALID_SWARM_ REQUIRE(b_max.first == 0x7FFFFFFFFFFFFFFFULL); REQUIRE(b_max.second == INVALID_SWARM_ID); - REQUIRE(b_0.second == b_max.first); // shared midpoint - REQUIRE(b_max.second == b_0.first); // shared boundary at INVALID_SWARM_ID (UINT64_MAX) + REQUIRE(b_0.second == b_max.first); // shared midpoint + REQUIRE(b_max.second == b_0.first); // shared boundary at INVALID_SWARM_ID (UINT64_MAX) } // A round-trip test against "service nodes - pubkey to swarm id" is not needed here. 
From 87e6e4c915b1dd272178f759cfa8d76ddddfc2da Mon Sep 17 00:00:00 2001 From: Thomas Winget Date: Wed, 1 Apr 2026 16:17:56 -0400 Subject: [PATCH 50/50] fix simple query mistakes --- oxenss/storage/database.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/oxenss/storage/database.cpp b/oxenss/storage/database.cpp index d3c2716d1..0b92c0f55 100644 --- a/oxenss/storage/database.cpp +++ b/oxenss/storage/database.cpp @@ -1381,7 +1381,7 @@ void Database::foreach_ready_retry_request(std::function< auto [req_id, key_str, cmd, payload, next_retry] = get(stmt); auto key = crypto::legacy_pubkey::from_bytes(key_str); - impl->prepared_exec("UPDATE retry_nodes_requests SET next_retry = ?", next_time); + impl->prepared_exec("UPDATE retry_node_requests SET next_retry = ?", next_time); callback(key, cmd, payload, req_id); } @@ -1420,14 +1420,15 @@ void Database::foreach_swarm_message( // there's probably a better way to do this, but it should be fine std::string query = R"( SELECT type, pubkey, hash, namespace, timestamp, expiry, data -FROM owned_messages ORDER BY mid +FROM owned_messages JOIN owners ON oid = id WHERE )"; query += R"( (owners.swarm_space_hi >{0} ?1 OR (owners.swarm_space_hi == ?1 AND owners.swarm_space_lo >{0} ?2)) AND - (owners.swarm_space_hi <= ?3 OR (owners.swarm_space_hi == ?3 AND owners.swarm_space_lo <= ?4)); + (owners.swarm_space_hi <= ?3 OR (owners.swarm_space_hi == ?3 AND owners.swarm_space_lo <= ?4)) +ORDER BY mid; )"_format(zero_inclusive ? "=" : ""); statement = SQLite::Statement{impl->db, query};