From 35550f2dd8f0cfb41d6533e029dbd4c3a117c982 Mon Sep 17 00:00:00 2001 From: Colby Nyce Date: Thu, 7 Aug 2025 15:28:23 -0500 Subject: [PATCH 01/30] Refactor Sparta checkpointers --- sparta/simdb | 2 +- .../serialization/checkpoint/Checkpoint.hpp | 168 +++------------ .../checkpoint/CheckpointBase.hpp | 202 ++++++++++++++++++ .../serialization/checkpoint/Checkpointer.hpp | 110 ++-------- .../checkpoint/DeltaCheckpoint.hpp | 29 +++ .../checkpoint/FastCheckpointer.hpp | 140 +++++++++--- .../FastCheckpoint/FastCheckpoint_test.cpp | 18 +- 7 files changed, 403 insertions(+), 266 deletions(-) create mode 100644 sparta/sparta/serialization/checkpoint/CheckpointBase.hpp diff --git a/sparta/simdb b/sparta/simdb index aa5127d367..d531b6ebc5 160000 --- a/sparta/simdb +++ b/sparta/simdb @@ -1 +1 @@ -Subproject commit aa5127d3679e1efcb1fb768d3851ce6ea391097a +Subproject commit d531b6ebc5a0ec643c5f1f5b91e1e0dca13d0319 diff --git a/sparta/sparta/serialization/checkpoint/Checkpoint.hpp b/sparta/sparta/serialization/checkpoint/Checkpoint.hpp index a166a893a4..0ef611d567 100644 --- a/sparta/sparta/serialization/checkpoint/Checkpoint.hpp +++ b/sparta/sparta/serialization/checkpoint/Checkpoint.hpp @@ -2,21 +2,10 @@ #pragma once -#include -#include - -#include "sparta/functional/ArchData.hpp" -#include "sparta/utils/SpartaException.hpp" -#include "sparta/utils/SpartaAssert.hpp" -#include "sparta/kernel/Scheduler.hpp" - -#include "sparta/serialization/checkpoint/CheckpointExceptions.hpp" - +#include "sparta/serialization/checkpoint/CheckpointBase.hpp" namespace sparta::serialization::checkpoint { - class FastCheckpointer; - /*! * \brief Single checkpoint object interface with a tick number and an ID * unique to the owning Checkpointer instance @@ -25,35 +14,10 @@ namespace sparta::serialization::checkpoint * checkpoint data in memory or on disk at construction which can be * restored with load() */ - class Checkpoint + class Checkpoint : public CheckpointBase { public: - //! 
\name Local Types - //! @{ - //////////////////////////////////////////////////////////////////////// - - //! \brief tick_t Tick type to which checkpoints will refer - typedef sparta::Scheduler::Tick tick_t; - - //! \brief tick_t Tick type to which checkpoints will refer - typedef uint64_t chkpt_id_t; - - //////////////////////////////////////////////////////////////////////// - //! @} - - /*! - * \brief Indicates the smallest valid checkpoint id - */ - static const chkpt_id_t MIN_CHECKPOINT = 0; - - /*! - * \brief Indicates unidentified checkpoint (could mean 'invalid' or - * 'any') depending on context - */ - static const chkpt_id_t UNIDENTIFIED_CHECKPOINT = ~(chkpt_id_t)0; - - //! \name Construction & Initialization //! @{ //////////////////////////////////////////////////////////////////////// @@ -65,7 +29,13 @@ namespace sparta::serialization::checkpoint Checkpoint(const Checkpoint&) = delete; //! \brief Non-assignable - const Checkpoint& operator=(const Checkpoint&) = delete; + Checkpoint& operator=(const Checkpoint&) = delete; + + //! \brief Not move constructable + Checkpoint(Checkpoint&&) = delete; + + //! \brief Not move assignable + Checkpoint& operator=(Checkpoint&&) = delete; protected: @@ -75,8 +45,7 @@ namespace sparta::serialization::checkpoint Checkpoint(chkpt_id_t id, tick_t tick, Checkpoint* prev) : - tick_(tick), - chkpt_id_(id), + CheckpointBase(id, tick), prev_(prev) { } @@ -107,66 +76,6 @@ namespace sparta::serialization::checkpoint //////////////////////////////////////////////////////////////////////// //! @} - /*! - * \brief Returns a string describing this object - */ - virtual std::string stringize() const { - std::stringstream ss; - ss << "'; - return ss.str(); - } - - /*! - * \brief Writes all checkpoint raw data to an ostream - * \param o ostream to which raw data will be written - * \note No newlines or other extra characters will be appended - */ - virtual void dumpData(std::ostream& o) const = 0; - - /*! 
- * \brief Returns memory usage by this checkpoint including any - * framework data structures - */ - virtual uint64_t getTotalMemoryUse() const noexcept = 0; - - /*! - * \brief Returns memory usage by this checkpoint solely for the - * checkpointed content. - */ - virtual uint64_t getContentMemoryUse() const noexcept = 0; - - //! \name Checkpoint Actions - //! @{ - //////////////////////////////////////////////////////////////////////// - - /*! - * \brief Attempts to restore this checkpoint state to the simulation - * state (ArchData) objects given to this Checkpoint at construction - */ - virtual void load(const std::vector& dats) = 0; - - /*! - * \brief Returns the tick number at which this checkpoint was taken. - */ - tick_t getTick() const noexcept { return tick_; } - - /*! - * \brief Returns the ID of this checkpoint - * \note Number has no sequential meaning - it is effectively a random - * ID. - */ - chkpt_id_t getID() const noexcept { return chkpt_id_; } - - /*! - * \brief Gets the representation of this deleted checkpoint as part of - * a checkpoint chain (if that checkpointer supports deletion) - */ - virtual std::string getDeletedRepr() const { - return "*"; - } - /*! * \brief Returns the previous checkpoint. If this checkpoint is a * snapshot, it has no previous checkpoint. @@ -177,7 +86,7 @@ namespace sparta::serialization::checkpoint /*! * \brief Sets the previous checkpoint of this checkpoint to \a prev - * \param prev New previou checkpoint. Overwrites previous + * \param prev New previous checkpoint. Overwrites previous * This will often be accompanied by a call to addNext on the * \a prev argument */ @@ -185,6 +94,15 @@ namespace sparta::serialization::checkpoint prev_ = prev; } + /*! + * \brief Get the ID of our previous checkpoint. Returns UNIDENTIFIED_CHECKPOINT + * if we have no previous checkpoint, as is the case with the head checkpoint + * and snapshots. + */ + chkpt_id_t getPrevID() const override { + return prev_ ? 
prev_->getID() : UNIDENTIFIED_CHECKPOINT; + } + /*! * \brief Adds another next checkpoint following *this. * \param next Next checkpoint (later in simulator ticks) than @@ -242,22 +160,22 @@ namespace sparta::serialization::checkpoint */ const std::vector& getNexts() const noexcept { return nexts_; } - //////////////////////////////////////////////////////////////////////// - //! @} - - protected: - /*! - * \brief Sets the checkpoint ID. + * \brief Returns next checkpoint following *this. May be an empty + * vector if there are no later checkpoints. */ - void setID_(chkpt_id_t id) { - chkpt_id_ = id; + std::vector getNextIDs() const override { + std::vector next_ids; + for (const auto chkpt : getNexts()) { + next_ids.push_back(chkpt->getID()); + } + return next_ids; } - private: + //////////////////////////////////////////////////////////////////////// + //! @} - const tick_t tick_; //!< Tick number for this checkpoint. - chkpt_id_t chkpt_id_; //!< This checkpoint's ID. Guaranteed to be unique from other checkpoints' + private: /*! * \brief Next checkpoint (later tick numbers in same forward stream of @@ -270,27 +188,3 @@ namespace sparta::serialization::checkpoint }; } // namespace sparta::serialization::checkpoint - - -//! ostream insertion operator for Checkpoint -inline std::ostream& operator<<(std::ostream& o, const sparta::serialization::checkpoint::Checkpoint& dcp){ - o << dcp.stringize(); - return o; -} - -//! ostream insertion operator for Checkpoint -inline std::ostream& operator<<(std::ostream& o, const sparta::serialization::checkpoint::Checkpoint* dcp){ - if(dcp == 0){ - o << "null"; - }else{ - o << dcp->stringize(); - } - return o; -} - -//! \brief Required in simulator source to define some globals. 
-#define SPARTA_CHECKPOINT_BODY \ - namespace sparta{ namespace serialization { namespace checkpoint { \ - const Checkpoint::chkpt_id_t Checkpoint::MIN_CHECKPOINT; \ - const Checkpoint::chkpt_id_t Checkpoint::UNIDENTIFIED_CHECKPOINT; \ - }}} diff --git a/sparta/sparta/serialization/checkpoint/CheckpointBase.hpp b/sparta/sparta/serialization/checkpoint/CheckpointBase.hpp new file mode 100644 index 0000000000..9c73f93fd7 --- /dev/null +++ b/sparta/sparta/serialization/checkpoint/CheckpointBase.hpp @@ -0,0 +1,202 @@ +// -*- C++ -*- + +#pragma once + +#include +#include + +#include "sparta/functional/ArchData.hpp" +#include "sparta/utils/SpartaException.hpp" +#include "sparta/utils/SpartaAssert.hpp" +#include "sparta/kernel/Scheduler.hpp" + +#include "sparta/serialization/checkpoint/CheckpointExceptions.hpp" + +namespace sparta::serialization::checkpoint +{ + /*! + * \brief Single checkpoint object interface with a tick number and an ID + * unique to the owning Checkpointer instance + * + * A subclass of Checkpointer is expected to hold or refer to some + * checkpoint data in memory or on disk at construction which can be + * restored with load() + */ + class CheckpointBase + { + public: + + //! \name Local Types + //! @{ + //////////////////////////////////////////////////////////////////////// + + //! \brief tick_t Tick type to which checkpoints will refer + typedef sparta::Scheduler::Tick tick_t; + + //! \brief tick_t Checkpoint ID type to which checkpoints will refer + typedef uint64_t chkpt_id_t; + + //////////////////////////////////////////////////////////////////////// + //! @} + + /*! + * \brief Indicates the smallest valid checkpoint id + */ + static const chkpt_id_t MIN_CHECKPOINT = 0; + + /*! + * \brief Indicates unidentified checkpoint (could mean 'invalid' or + * 'any') depending on context + */ + static const chkpt_id_t UNIDENTIFIED_CHECKPOINT = ~(chkpt_id_t)0; + + + //! \name Construction & Initialization + //! 
@{ + //////////////////////////////////////////////////////////////////////// + + //! \brief Not default constructable + CheckpointBase() = delete; + + //! \brief Not copy constructable + CheckpointBase(const CheckpointBase&) = delete; + + //! \brief Non-assignable + const CheckpointBase& operator=(const CheckpointBase&) = delete; + + //! \brief Default move construction + CheckpointBase(CheckpointBase&&) = default; + + //! \brief Default move assignment + CheckpointBase& operator=(CheckpointBase&&) = default; + + protected: + + /*! + * \note Should only be constructed by subclasses + */ + CheckpointBase(chkpt_id_t id, tick_t tick) : + tick_(tick), + chkpt_id_(id) + { } + + public: + + /*! + * \brief Destructor + */ + virtual ~CheckpointBase() = default; + + /*! + * \brief Returns a string describing this object + */ + virtual std::string stringize() const { + std::stringstream ss; + ss << "'; + return ss.str(); + } + + /*! + * \brief Writes all checkpoint raw data to an ostream + * \param o ostream to which raw data will be written + * \note No newlines or other extra characters will be appended + */ + virtual void dumpData(std::ostream& o) const = 0; + + /*! + * \brief Returns memory usage by this checkpoint including any + * framework data structures + */ + virtual uint64_t getTotalMemoryUse() const noexcept = 0; + + /*! + * \brief Returns memory usage by this checkpoint solely for the + * checkpointed content. + */ + virtual uint64_t getContentMemoryUse() const noexcept = 0; + + //! \name Checkpoint Actions + //! @{ + //////////////////////////////////////////////////////////////////////// + + /*! + * \brief Attempts to restore this checkpoint state to the simulation + * state (ArchData) objects given to this Checkpoint at construction + */ + virtual void load(const std::vector& dats) = 0; + + /*! + * \brief Returns the tick number at which this checkpoint was taken. + */ + tick_t getTick() const noexcept { return tick_; } + + /*! 
+ * \brief Returns the ID of this checkpoint + * \note Number has no sequential meaning - it is effectively a random + * ID. + */ + chkpt_id_t getID() const noexcept { return chkpt_id_; } + + /*! + * \brief Get the ID of our previous checkpoint. Returns UNIDENTIFIED_CHECKPOINT + * if we have no previous checkpoint, as is the case with the head checkpoint + * and snapshots. + */ + virtual chkpt_id_t getPrevID() const = 0; + + /*! + * \brief Returns next checkpoint following *this. May be an empty + * vector if there are no later checkpoints. + */ + virtual std::vector getNextIDs() const = 0; + + /*! + * \brief Gets the representation of this deleted checkpoint as part of + * a checkpoint chain (if that checkpointer supports deletion) + */ + virtual std::string getDeletedRepr() const { + return "*"; + } + + //////////////////////////////////////////////////////////////////////// + //! @} + + protected: + /*! + * \brief Sets the checkpoint ID. + */ + void setID_(chkpt_id_t id) { + chkpt_id_ = id; + } + + private: + const tick_t tick_; //!< Tick number for this checkpoint. + chkpt_id_t chkpt_id_; //!< This checkpoint's ID. Guaranteed to be unique from other checkpoints' + }; + +} // namespace sparta::serialization::checkpoint + +//! ostream insertion operator for Checkpoint +inline std::ostream& operator<<(std::ostream& o, const sparta::serialization::checkpoint::CheckpointBase& dcp){ + o << dcp.stringize(); + return o; +} + +//! ostream insertion operator for Checkpoint +inline std::ostream& operator<<(std::ostream& o, const sparta::serialization::checkpoint::CheckpointBase* dcp){ + if(dcp == 0){ + o << "null"; + }else{ + o << dcp->stringize(); + } + return o; +} + +//! \brief Required in simulator source to define some globals. 
+#define SPARTA_CHECKPOINT_BODY \ + namespace sparta{ namespace serialization { namespace checkpoint { \ + const CheckpointBase::chkpt_id_t CheckpointBase::MIN_CHECKPOINT; \ + const CheckpointBase::chkpt_id_t CheckpointBase::UNIDENTIFIED_CHECKPOINT; \ + }}} diff --git a/sparta/sparta/serialization/checkpoint/Checkpointer.hpp b/sparta/sparta/serialization/checkpoint/Checkpointer.hpp index 60643de77c..1b51ae53a3 100644 --- a/sparta/sparta/serialization/checkpoint/Checkpointer.hpp +++ b/sparta/sparta/serialization/checkpoint/Checkpointer.hpp @@ -127,25 +127,13 @@ namespace sparta::serialization::checkpoint * \note This is an approxiation and does not include some of * minimal dynamic overhead from stl containers. */ - uint64_t getTotalMemoryUse() const noexcept { - uint64_t mem = 0; - for(auto& cp : chkpts_){ - mem += cp.second->getTotalMemoryUse(); - } - return mem; - } + virtual uint64_t getTotalMemoryUse() const noexcept = 0; /*! * \brief Computes and returns the memory usage by this checkpointer at * this moment purely for the checkpoint state being held */ - uint64_t getContentMemoryUse() const noexcept { - uint64_t mem = 0; - for(auto& cp : chkpts_){ - mem += cp.second->getContentMemoryUse(); - } - return mem; - } + virtual uint64_t getContentMemoryUse() const noexcept = 0; /*! * \brief Returns the total number of checkpoints which have been @@ -361,36 +349,6 @@ namespace sparta::serialization::checkpoint */ virtual std::deque getCheckpointChain(chkpt_id_t id) const = 0; - /*! - * \brief Finds the latest checkpoint at or before the given tick - * starting at the \a from checkpoint and working backward. - * If no checkpoints before or at tick are found, returns nullptr. - * \param tick Tick to search for - * \param from Checkpoint at which to begin searching for a tick. - * Must be a valid checkpoint known by this checkpointer. - * See hasCheckpoint. - * \return The latest checkpoint with a tick number less than or equal - * to the \a tick argument. 
Returns nullptr if no checkpoints before \a - * tick were found. It is possible for the checkpoint identified by \a - * from could be returned. - * \warning This is not a high-performance method. Generally, - * a client of this interface knows a paticular ID. - * \throw CheckpointError if \a from does not refer to a valid - * checkpoint. - */ - virtual Checkpoint* findLatestCheckpointAtOrBefore(tick_t tick, - chkpt_id_t from) = 0; - - /*! - * \brief Finds a checkpoint by its ID - * \param id ID of checkpoint to find. Guaranteed not to be flagged as - * deleted - * \return Checkpoint with ID of \a id if found or nullptr if not found - */ - Checkpoint* findCheckpoint(chkpt_id_t id) noexcept { - return findCheckpoint_(id); - } - /*! * \brief Tests whether this checkpoint manager has a checkpoint with * the given id. @@ -398,9 +356,7 @@ namespace sparta::serialization::checkpoint * and false if not. If id == Checkpoint::UNIDENTIFIED_CHECKPOINT, * always returns false */ - virtual bool hasCheckpoint(chkpt_id_t id) const noexcept { - return findCheckpoint_(id) != nullptr; - } + virtual bool hasCheckpoint(chkpt_id_t id) const noexcept = 0; /*! * \brief Returns the head checkpoint which is equivalent to the @@ -505,23 +461,14 @@ namespace sparta::serialization::checkpoint * ostream with a newline following each checkpoint * \param o ostream to dump to */ - void dumpList(std::ostream& o) const { - for(auto& cp : chkpts_){ - o << cp.second->stringize() << std::endl; - } - } + virtual void dumpList(std::ostream& o) const = 0; /*! * \brief Dumps this checkpointer's data to an ostream with a newline * following each checkpoint * \param o ostream to dump to */ - void dumpData(std::ostream& o) const { - for(auto& cp : chkpts_){ - cp.second->dumpData(o); - o << std::endl; - } - } + virtual void dumpData(std::ostream& o) const = 0; /*! 
* \brief Dumps this checkpointer's data to an @@ -529,13 +476,7 @@ namespace sparta::serialization::checkpoint * following each checkpoint description and each checkpoint data dump * \param o ostream to dump to */ - void dumpAnnotatedData(std::ostream& o) const { - for(auto& cp : chkpts_){ - o << cp.second->stringize() << std::endl; - cp.second->dumpData(o); - o << std::endl; - } - } + virtual void dumpAnnotatedData(std::ostream& o) const = 0; /*! * \brief Debugging utility which dumps values in some bytes across a @@ -561,7 +502,7 @@ namespace sparta::serialization::checkpoint */ void dumpTree(std::ostream& o) const { std::deque c; - dumpBranch(o, head_, 0, 0, c); + dumpBranch(o, getHeadID(), 0, 0, c); o << '\n'; } @@ -578,7 +519,7 @@ namespace sparta::serialization::checkpoint * expected in directory-like tree-view displays */ void dumpBranch(std::ostream& o, - const Checkpoint* chkpt, + const chkpt_id_t chkpt, uint32_t indent, uint32_t pos, std::deque& continues) const { @@ -601,7 +542,7 @@ namespace sparta::serialization::checkpoint } } - auto nexts = chkpt->getNexts(); + auto nexts = getNextIDs_(chkpt); std::stringstream ss; // Draw separator between prev checkpoint and this @@ -612,14 +553,14 @@ namespace sparta::serialization::checkpoint } // Draw box around object if it is current - if(current_ == chkpt){ + if(current_ && current_->getID() == chkpt){ ss << "[ "; } dumpCheckpointNode_(chkpt, ss); ss << ' '; - if(current_ == chkpt){ + if(current_ && current_->getID() == chkpt){ ss << ']'; } @@ -653,20 +594,6 @@ namespace sparta::serialization::checkpoint protected: - /*! - * \brief Attempts to find a checkpoint within this checkpointer by ID. - * \param id Checkpoint ID to search for - * \return Pointer to found checkpoint with matchind ID. If not found, - * returns nullptr. - * \todo Faster lookup? - */ - virtual Checkpoint* findCheckpoint_(chkpt_id_t id) noexcept = 0; - - /*! 
- * \brief const variant of findCheckpoint_ - */ - virtual const Checkpoint* findCheckpoint_(chkpt_id_t id) const noexcept = 0; - /*! * \brief Create a head node. * \pre ArchDatas for tree root are already enumerated @@ -689,8 +616,8 @@ namespace sparta::serialization::checkpoint */ virtual chkpt_id_t createCheckpoint_(bool force_snapshot=false) = 0; - virtual void dumpCheckpointNode_(const Checkpoint* chkpt, std::ostream& o) const { - o << chkpt->getID(); + virtual void dumpCheckpointNode_(const chkpt_id_t id, std::ostream& o) const { + o << id; } /*! @@ -749,14 +676,9 @@ namespace sparta::serialization::checkpoint } /*! - * \brief All checkpoints sorted by ascending tick number (or - * equivalently ascending checkpoint ID since both are monotonically - * increasing) - * - * This map must still be explicitly torn down in reverse order by a - * subclass of Checkpointer + * \brief Returns IDs of the checkpoints immediately following the given checkpoint. */ - std::map> chkpts_; + virtual std::vector getNextIDs_(chkpt_id_t id) const = 0; /*! * \brief Scheduler whose tick count will be set and read. Cannnot be @@ -818,7 +740,7 @@ namespace sparta::serialization::checkpoint /*! * \brief Head checkpoint. This is the first checkpoint taken but cannot - * be deleted. Head checkpoint memory is owned by chkpts_. + * be deleted. Head checkpoint memory is owned by checkpointer subclass. */ Checkpoint* head_; diff --git a/sparta/sparta/serialization/checkpoint/DeltaCheckpoint.hpp b/sparta/sparta/serialization/checkpoint/DeltaCheckpoint.hpp index 4700dacfb0..6188064a40 100644 --- a/sparta/sparta/serialization/checkpoint/DeltaCheckpoint.hpp +++ b/sparta/sparta/serialization/checkpoint/DeltaCheckpoint.hpp @@ -677,6 +677,35 @@ namespace sparta::serialization::checkpoint return dcps; } + /*! + * \brief Get the ID of our previous checkpoint. Returns UNIDENTIFIED_CHECKPOINT + * if we have no previous checkpoint, as is the case with the head checkpoint + * and snapshots. 
+ */ + chkpt_id_t getPrevID() const override { + if (auto prev = static_cast(getPrev())) { + if (!prev->isFlaggedDeleted()) { + return prev->getID(); + } + } + return UNIDENTIFIED_CHECKPOINT; + } + + /*! + * \brief Returns next checkpoint following *this. May be an empty + * vector if there are no later checkpoints. + */ + std::vector getNextIDs() const override { + std::vector next_ids; + for (const auto chkpt : getNexts()) { + const auto dcp = static_cast(chkpt); + if (!dcp->isFlaggedDeleted()) { + next_ids.push_back(chkpt->getID()); + } + } + return next_ids; + } + /*! * \brief Attempts to restore this checkpoint including any previous * deltas (dependencies). diff --git a/sparta/sparta/serialization/checkpoint/FastCheckpointer.hpp b/sparta/sparta/serialization/checkpoint/FastCheckpointer.hpp index aeac93f2fb..fc07bf9193 100644 --- a/sparta/sparta/serialization/checkpoint/FastCheckpointer.hpp +++ b/sparta/sparta/serialization/checkpoint/FastCheckpointer.hpp @@ -140,6 +140,32 @@ namespace sparta::serialization::checkpoint snap_thresh_ = thresh; } + /*! + * \brief Computes and returns the memory usage by this checkpointer at + * this moment including any framework overhead + * \note This is an approxiation and does not include some of + * minimal dynamic overhead from stl containers. + */ + uint64_t getTotalMemoryUse() const noexcept override { + uint64_t mem = 0; + for(auto& cp : chkpts_){ + mem += cp.second->getTotalMemoryUse(); + } + return mem; + } + + /*! + * \brief Computes and returns the memory usage by this checkpointer at + * this moment purely for the checkpoint state being held + */ + uint64_t getContentMemoryUse() const noexcept override { + uint64_t mem = 0; + for(auto& cp : chkpts_){ + mem += cp.second->getContentMemoryUse(); + } + return mem; + } + //////////////////////////////////////////////////////////////////////// //! @} @@ -232,18 +258,6 @@ namespace sparta::serialization::checkpoint cleanupChain_(rmv); } - /*! 
- * \brief Queries a specific checkpoint by ID - */ - bool checkpointExists(chkpt_id_t id) { - bool exists = false; - checkpoint_type* d = findCheckpoint_(id); - if(d){ - exists = true; - } - return exists; - } - /*! * \brief Gets all checkpoints taken at tick t on any timeline. * \param t Tick number at which checkpoints should found. @@ -362,7 +376,7 @@ namespace sparta::serialization::checkpoint * checkpoint. */ checkpoint_type* findLatestCheckpointAtOrBefore(tick_t tick, - chkpt_id_t from) override { + chkpt_id_t from) { checkpoint_type* d = findCheckpoint_(from); if(!d){ throw CheckpointError("There is no checkpoint with ID ") << from; @@ -379,13 +393,29 @@ namespace sparta::serialization::checkpoint return d; } + /*! + * \brief Finds a checkpoint by its ID + * \param id ID of checkpoint to find. Guaranteed not to be flagged as + * deleted + * \return Checkpoint with ID of \a id if found or nullptr if not found + */ + checkpoint_type* findCheckpoint(chkpt_id_t id) noexcept { + auto it = chkpts_.find(id); + if (it != chkpts_.end()) { + return static_cast(it->second.get()); + } + return nullptr; + } /*! - * \brief Gets a checkpoint through findCheckpoint interface casted to - * the type of Checkpoint subclass used by this class. + * \brief Tests whether this checkpoint manager has a checkpoint with + * the given id. + * \return True if id refers to a checkpoint held by this checkpointer + * and false if not. If id == Checkpoint::UNIDENTIFIED_CHECKPOINT, + * always returns false */ - checkpoint_type* findInternalCheckpoint(chkpt_id_t id) { - return static_cast(findCheckpoint_(id)); + bool hasCheckpoint(chkpt_id_t id) const noexcept override { + return chkpts_.find(id) != chkpts_.end(); } //////////////////////////////////////////////////////////////////////// @@ -404,6 +434,43 @@ namespace sparta::serialization::checkpoint return ss.str(); } + /*! 
+ * \brief Dumps this checkpointer's flat list of checkpoints to an + * ostream with a newline following each checkpoint + * \param o ostream to dump to + */ + void dumpList(std::ostream& o) const override { + for(auto& cp : chkpts_){ + o << cp.second->stringize() << std::endl; + } + } + + /*! + * \brief Dumps this checkpointer's data to an ostream with a newline + * following each checkpoint + * \param o ostream to dump to + */ + void dumpData(std::ostream& o) const override { + for(auto& cp : chkpts_){ + cp.second->dumpData(o); + o << std::endl; + } + } + + /*! + * \brief Dumps this checkpointer's data to an + * ostream with annotations between each ArchData and a newline + * following each checkpoint description and each checkpoint data dump + * \param o ostream to dump to + */ + void dumpAnnotatedData(std::ostream& o) const override { + for(auto& cp : chkpts_){ + o << cp.second->stringize() << std::endl; + cp.second->dumpData(o); + o << std::endl; + } + } + /*! * \brief Forwards debug/trace info onto checkpoint by ID */ @@ -553,7 +620,7 @@ namespace sparta::serialization::checkpoint * returns nullptr. * \todo Faster lookup? */ - checkpoint_type* findCheckpoint_(chkpt_id_t id) noexcept override { + checkpoint_type* findCheckpoint_(chkpt_id_t id) noexcept { auto itr = chkpts_.find(id); if (itr != chkpts_.end()) { return static_cast(itr->second.get()); @@ -564,7 +631,7 @@ namespace sparta::serialization::checkpoint /*! * \brief const variant of findCheckpoint_ */ - const checkpoint_type* findCheckpoint_(chkpt_id_t id) const noexcept override { + const checkpoint_type* findCheckpoint_(chkpt_id_t id) const noexcept { auto itr = chkpts_.find(id); if (itr != chkpts_.end()) { return static_cast(itr->second.get()); @@ -575,17 +642,15 @@ namespace sparta::serialization::checkpoint /*! 
* \brief Implements Checkpointer::dumpCheckpointNode_ */ - void dumpCheckpointNode_(const Checkpoint* chkpt, std::ostream& o) const override { + void dumpCheckpointNode_(const chkpt_id_t id, std::ostream& o) const override { static std::string SNAPSHOT_NOTICE = "(s)"; - - // checkpoint_type is a known direct base class of Checkpoint - const checkpoint_type* cp = static_cast(chkpt); + auto cp = findCheckpoint_(id); // Draw data for this checkpoint if(cp->isFlaggedDeleted()){ - o << chkpt->getDeletedRepr(); + o << cp->getDeletedRepr(); }else{ - o << chkpt->getID(); + o << cp->getID(); } // Show that this is a snapshot if(cp->isSnapshot()){ @@ -695,6 +760,31 @@ namespace sparta::serialization::checkpoint return dcp->getID(); } + /*! + * \brief Returns IDs of the checkpoints immediately following the given checkpoint. + */ + std::vector getNextIDs_(chkpt_id_t id) const override final { + std::vector next_ids; + if (const auto chkpt = findCheckpoint_(id)) { + for (const auto next : chkpt->getNexts()) { + const auto dcp = static_cast(next); + if (!dcp->isFlaggedDeleted()) { + next_ids.push_back(next->getID()); + } + } + } + return next_ids; + } + + /*! + * \brief All checkpoints sorted by ascending tick number (or + * equivalently ascending checkpoint ID since both are monotonically + * increasing) + * + * This map must still be explicitly torn down in reverse order by a + * subclass of Checkpointer + */ + std::map> chkpts_; /*! * \brief Snapshot generation threshold. 
Every n checkpoints in a chain diff --git a/sparta/test/FastCheckpoint/FastCheckpoint_test.cpp b/sparta/test/FastCheckpoint/FastCheckpoint_test.cpp index 485803764b..833ac322c2 100644 --- a/sparta/test/FastCheckpoint/FastCheckpoint_test.cpp +++ b/sparta/test/FastCheckpoint/FastCheckpoint_test.cpp @@ -364,7 +364,7 @@ void generalTest() // Look at a restore chain - auto* cp20 = (fcp.findInternalCheckpoint(20)); + auto* cp20 = (fcp.findCheckpoint(20)); auto rc20 = cp20->getRestoreChain(); EXPECT_EQUAL(rc20.size(), 6); // 0 -> 16 -> 17 -> * -> 19 -> 20 std::cout << "\nRestore chain for cp 20:" << std::endl; @@ -394,11 +394,11 @@ void generalTest() auto cpA = fcp.createCheckpoint(); ////r1->write(0xbbbb); std::cout << "Dumping restore chain for cpA (" << cpA << ")" << std::endl; - fcp.findInternalCheckpoint(cpA)->dumpRestoreChain(std::cout); + fcp.findCheckpoint(cpA)->dumpRestoreChain(std::cout); std::cout << std::endl; continues.clear(); fcp.dumpBranch(std::cout, - fcp.findCheckpoint(cpP), + cpP, 0, 0, continues); @@ -407,11 +407,11 @@ void generalTest() auto cpC = fcp.createCheckpoint(); //////fcp.deleteCheckpoint(cpA); std::cout << "Dumping restore chain for cpC (" << cpC << ")" << std::endl; - fcp.findInternalCheckpoint(cpC)->dumpRestoreChain(std::cout); + fcp.findCheckpoint(cpC)->dumpRestoreChain(std::cout); std::cout << std::endl; continues.clear(); fcp.dumpBranch(std::cout, - fcp.findCheckpoint(cpP), + cpP, 0, 0, continues); @@ -420,7 +420,7 @@ void generalTest() fcp.deleteCheckpoint(cpC); continues.clear(); fcp.dumpBranch(std::cout, - fcp.findCheckpoint(cpP), + cpP, 0, 0, continues); @@ -429,7 +429,7 @@ void generalTest() fcp.deleteCheckpoint(cpA); continues.clear(); fcp.dumpBranch(std::cout, - fcp.findCheckpoint(cpP), + cpP, 0, 0, continues); @@ -438,11 +438,11 @@ void generalTest() auto cpB = fcp.createCheckpoint(); fcp.loadCheckpoint(cpB); std::cout << "Dumping restore chain for cpB (" << cpB << ")" << std::endl; - 
fcp.findInternalCheckpoint(cpB)->dumpRestoreChain(std::cout); + fcp.findCheckpoint(cpB)->dumpRestoreChain(std::cout); std::cout << std::endl; continues.clear(); fcp.dumpBranch(std::cout, - fcp.findCheckpoint(cpP), + cpP, 0, 0, continues); From e15c30d2e679c11355044654525f902fab718466 Mon Sep 17 00:00:00 2001 From: Colby Nyce Date: Fri, 8 Aug 2025 17:29:57 -0500 Subject: [PATCH 02/30] Database-backed checkpointer --- sparta/CMakeLists.txt | 2 + .../serialization/checkpoint/Checkpoint.hpp | 7 +- .../checkpoint/CheckpointBase.hpp | 16 +- .../serialization/checkpoint/Checkpointer.hpp | 16 +- .../checkpoint/DatabaseCheckpoint.hpp | 265 +++++++++ .../checkpoint/DatabaseCheckpointer.hpp | 523 +++++++++++++++++ .../checkpoint/DeltaCheckpoint.hpp | 332 +---------- .../checkpoint/FastCheckpointer.hpp | 1 + .../checkpoint/StringStreamStorage.hpp | 124 ++++ .../checkpoint/VectorStorage.hpp | 229 ++++++++ sparta/src/DatabaseCheckpoint.cpp | 225 ++++++++ sparta/src/DatabaseCheckpointer.cpp | 536 ++++++++++++++++++ sparta/test/FastCheckpoint/CMakeLists.txt | 2 +- .../DatabaseCheckpoint/CMakeLists.txt | 5 + .../DatabaseCheckpoint_test.cpp | 181 ++++++ 15 files changed, 2116 insertions(+), 348 deletions(-) create mode 100644 sparta/sparta/serialization/checkpoint/DatabaseCheckpoint.hpp create mode 100644 sparta/sparta/serialization/checkpoint/DatabaseCheckpointer.hpp create mode 100644 sparta/sparta/serialization/checkpoint/StringStreamStorage.hpp create mode 100644 sparta/sparta/serialization/checkpoint/VectorStorage.hpp create mode 100644 sparta/src/DatabaseCheckpoint.cpp create mode 100644 sparta/src/DatabaseCheckpointer.cpp create mode 100644 sparta/test/FastCheckpoint/DatabaseCheckpoint/CMakeLists.txt create mode 100644 sparta/test/FastCheckpoint/DatabaseCheckpoint/DatabaseCheckpoint_test.cpp diff --git a/sparta/CMakeLists.txt b/sparta/CMakeLists.txt index 07259f7556..e604953fc8 100644 --- a/sparta/CMakeLists.txt +++ b/sparta/CMakeLists.txt @@ -40,6 +40,8 @@ list (APPEND 
SourceCppFiles src/CounterBase.cpp src/CsvFormatter.cpp src/DAG.cpp + src/DatabaseCheckpoint.cpp + src/DatabaseCheckpointer.cpp src/Destination.cpp src/EdgeFactory.cpp src/EventNode.cpp diff --git a/sparta/sparta/serialization/checkpoint/Checkpoint.hpp b/sparta/sparta/serialization/checkpoint/Checkpoint.hpp index 0ef611d567..f5753b9b68 100644 --- a/sparta/sparta/serialization/checkpoint/Checkpoint.hpp +++ b/sparta/sparta/serialization/checkpoint/Checkpoint.hpp @@ -53,12 +53,10 @@ namespace sparta::serialization::checkpoint /*! - * \brief Destructor - * - * Removes this checkpoint from the chain and patches chain between prev + * \brief Removes this checkpoint from the chain and patches chain between prev * and each item in the nexts list */ - virtual ~Checkpoint() { + virtual void disconnect() { if(getPrev()){ getPrev()->removeNext(this); } @@ -70,7 +68,6 @@ namespace sparta::serialization::checkpoint getPrev()->addNext(d); } } - } //////////////////////////////////////////////////////////////////////// diff --git a/sparta/sparta/serialization/checkpoint/CheckpointBase.hpp b/sparta/sparta/serialization/checkpoint/CheckpointBase.hpp index 9c73f93fd7..9a44e75920 100644 --- a/sparta/sparta/serialization/checkpoint/CheckpointBase.hpp +++ b/sparta/sparta/serialization/checkpoint/CheckpointBase.hpp @@ -55,9 +55,6 @@ namespace sparta::serialization::checkpoint //! @{ //////////////////////////////////////////////////////////////////////// - //! \brief Not default constructable - CheckpointBase() = delete; - //! \brief Not copy constructable CheckpointBase(const CheckpointBase&) = delete; @@ -80,6 +77,8 @@ namespace sparta::serialization::checkpoint chkpt_id_(id) { } + CheckpointBase() = default; + public: /*! @@ -87,6 +86,15 @@ namespace sparta::serialization::checkpoint */ virtual ~CheckpointBase() = default; + /*! 
+ * \brief boost::serialization support + */ + template + void serialize(Archive& ar, const unsigned int /*version*/) { + ar & tick_; + ar & chkpt_id_; + } + /*! * \brief Returns a string describing this object */ @@ -172,7 +180,7 @@ namespace sparta::serialization::checkpoint } private: - const tick_t tick_; //!< Tick number for this checkpoint. + tick_t tick_; //!< Tick number for this checkpoint. chkpt_id_t chkpt_id_; //!< This checkpoint's ID. Guaranteed to be unique from other checkpoints' }; diff --git a/sparta/sparta/serialization/checkpoint/Checkpointer.hpp b/sparta/sparta/serialization/checkpoint/Checkpointer.hpp index 1b51ae53a3..eaab6a77e7 100644 --- a/sparta/sparta/serialization/checkpoint/Checkpointer.hpp +++ b/sparta/sparta/serialization/checkpoint/Checkpointer.hpp @@ -368,7 +368,7 @@ namespace sparta::serialization::checkpoint * The head checkpoint has an ID of * Checkpoint::UNIDENTIFIED_CHECKPOINT and can never be deleted. */ - const Checkpoint* getHead() const noexcept { + const CheckpointBase* getHead() const noexcept { return head_; } @@ -631,14 +631,14 @@ namespace sparta::serialization::checkpoint /*! * \brief Non-const variant of getHead_ */ - Checkpoint* getHead_() noexcept { + CheckpointBase* getHead_() noexcept { return head_; } /*! * \brief Gets the head checkpoint. Returns nullptr if none created yet */ - const Checkpoint* getHead_() const noexcept { + const CheckpointBase* getHead_() const noexcept { return head_; } @@ -649,7 +649,7 @@ namespace sparta::serialization::checkpoint * \pre Internal head pointer must be nullptr. 
* \note This can only be done once */ - void setHead_(Checkpoint* head) { + virtual void setHead_(CheckpointBase* head) { sparta_assert(head != nullptr, "head argument in setHead_ cannot be nullptr"); sparta_assert(head_ == nullptr, "Cannot setHead_ again on a Checkpointer once heas is already set"); head_ = head; @@ -659,7 +659,7 @@ namespace sparta::serialization::checkpoint * \brief Gets the current checkpointer pointer. Returns nullptr if * there is no current checkpoint object */ - Checkpoint* getCurrent_() const noexcept { + CheckpointBase* getCurrent_() const noexcept { return current_; } @@ -669,7 +669,7 @@ namespace sparta::serialization::checkpoint * checkpoint created will follow the current checkpoint set here. * Cannot be nullptr */ - void setCurrent_(Checkpoint* current) { + virtual void setCurrent_(CheckpointBase* current) { sparta_assert(current != nullptr, "Can never setCurrent_ to nullptr except. A null current is a valid state at initialization only") current_ = current; @@ -742,7 +742,7 @@ namespace sparta::serialization::checkpoint * \brief Head checkpoint. This is the first checkpoint taken but cannot * be deleted. Head checkpoint memory is owned by checkpointer subclass. */ - Checkpoint* head_; + CheckpointBase* head_; /*! * \brief ArchDatas required to checkpoint for this checkpointiner based @@ -753,7 +753,7 @@ namespace sparta::serialization::checkpoint /*! * \brief Most recent checkpoint created or loaded */ - Checkpoint* current_; + CheckpointBase* current_; /*! * \brief Total checkpoint ever created by this instance. 
Monotonically diff --git a/sparta/sparta/serialization/checkpoint/DatabaseCheckpoint.hpp b/sparta/sparta/serialization/checkpoint/DatabaseCheckpoint.hpp new file mode 100644 index 0000000000..de284b571e --- /dev/null +++ b/sparta/sparta/serialization/checkpoint/DatabaseCheckpoint.hpp @@ -0,0 +1,265 @@ +// -*- C++ -*- + +#pragma once + +#include "sparta/serialization/checkpoint/CheckpointBase.hpp" +#include "sparta/serialization/checkpoint/CheckpointExceptions.hpp" +#include "sparta/serialization/checkpoint/VectorStorage.hpp" + +#include + +namespace sparta::serialization::checkpoint +{ + class DatabaseCheckpointer; + + /*! + * \brief Checkpoint class optimized for use with database-backed + * checkpointers. + */ + class DatabaseCheckpoint : public CheckpointBase + { + public: + + //! \name Construction & Initialization + //! @{ + //////////////////////////////////////////////////////////////////////// + + //! \brief Default constructable required for boost::serialization + DatabaseCheckpoint() = default; + + //! \brief Not copy constructable + DatabaseCheckpoint(const DatabaseCheckpoint&) = delete; + + //! \brief Non-assignable + DatabaseCheckpoint& operator=(const DatabaseCheckpoint&) = delete; + + //! \brief Move constructor + DatabaseCheckpoint(DatabaseCheckpoint&&) = default; + + //! \brief Not move-assignable + DatabaseCheckpoint& operator=(DatabaseCheckpoint&&) = delete; + + private: + + //! \brief Construction to be performed by friend class DatabaseCheckpointer + DatabaseCheckpoint(TreeNode& root, + const std::vector& dats, + chkpt_id_t id, + tick_t tick, + chkpt_id_t prev_id, + bool is_snapshot, + DatabaseCheckpointer* checkpointer); + + //! \brief This constructor is called during checkpoint cloning + DatabaseCheckpoint(chkpt_id_t prev_id, + chkpt_id_t deleted_id_, + bool is_snapshot_, + const storage::VectorStorage& storage, + DatabaseCheckpointer* checkpointer); + + //////////////////////////////////////////////////////////////////////// + //!
@} + + friend class DatabaseCheckpointer; + + public: + + template + void serialize(Archive& ar, const unsigned int version) { + CheckpointBase::serialize(ar, version); + ar & prev_id_; + ar & deleted_id_; + ar & is_snapshot_; + ar & data_; + } + + /*! + * \brief Returns a string describing this object + */ + std::string stringize() const override; + + /*! + * \brief Writes all checkpoint raw data to an ostream + * \param o ostream to which raw data will be written + * \note No newlines or other extra characters will be appended + */ + void dumpData(std::ostream& o) const override; + + /*! + * \brief Dumps the restore chain for this checkpoint. + * \see getRestoreChain() + * \param o ostream to which chain data will be dumped + */ + void dumpRestoreChain(std::ostream& o) const; + + /*! + * \brief Returns memory usage by this checkpoint + */ + uint64_t getTotalMemoryUse() const noexcept override; + + /*! + * \brief Returns memory usage by the content of this checkpoint + */ + uint64_t getContentMemoryUse() const noexcept override; + + /*! + * \brief Implement trace of a value across the restore chain as described in Checkpointer::traceValue + */ + void traceValue(std::ostream& o, const std::vector& dats, + const ArchData* container, uint32_t offset, uint32_t size); + + /*! + * \brief Returns a stack of checkpoints from this checkpoint as far + * back as possible until no previous link is found. This is a superset + * of getRestoreChain and contains checkpoints that do not actually need + * to be inspected for restoring this checkpoint's data. This may reach + * the head checkpoint if no gaps are encountered. + */ + std::stack getHistoryChain() const; + + /*! + * \brief Returns a stack of checkpoints that must be restored from + * top-to-bottom to fully restore the state associated with this + * checkpoint. + */ + std::stack getRestoreChain() const; + + /*! + * \brief Get the ID of our previous checkpoint. 
Returns UNIDENTIFIED_CHECKPOINT + * if we have no previous checkpoint, as is the case with the head checkpoint + * and snapshots. + */ + chkpt_id_t getPrevID() const override; + + /*! + * \brief Returns next checkpoint following *this. May be an empty + * vector if there are no later checkpoints. + */ + std::vector getNextIDs() const override; + + /*! + * \brief Attempts to restore this checkpoint including any previous + * deltas (dependencies). + * + * Uses loadState to restore state from each checkpoint in the + * restore chain. + */ + void load(const std::vector& dats) override; + + /*! + * \brief Can this checkpoint be deleted + * Cannot be deleted if: + * \li This checkpoint has any ancestors which are not deletable and not snapshots + * \li This checkpoint was not flagged for deletion with flagDeleted + * \warning This is a recursive search of a checkpoint tree which has potentially many + * branches and could have high time cost + */ + bool canDelete() const noexcept; + + /*! + * \brief Allows this checkpoint to be deleted if it is no longer a + * previous delta of some other delta (i.e. getNexts() returns an + * empty vector). Sets the checkpoint ID to invalid. Calling multiple + * times has no effect + * \pre Must not already be flagged deleted + * \post isFlaggedDeleted() will return true + * \post getDeletedID() will return the current ID (if any) + * \see canDelete + * \see isFlaggedDeleted + */ + void flagDeleted(); + + /*! + * \brief Indicates whether this checkpoint has been flagged deleted. + * \note Does not imply that the checkpoint can safely be deleted; + * only that it was flagged for deletion. + * \note If false, Checkpoint ID will also be UNIDENTIFIED_CHECKPOINT + * \see flagDeleted() + */ + bool isFlaggedDeleted() const noexcept; + + /*! 
+ * \brief Return the ID had by this checkpoint before it was deleted + * If this checkpoint has not been flagged for deletion, this will be + * UNIDENTIFIED_CHECKPOINT + */ + chkpt_id_t getDeletedID() const noexcept; + + /*! + * \brief Gets the representation of this deleted checkpoint as part of + * a checkpoint chain (if that checkpointer supports deletion) + * \return "D-" concatenated with ID copied when being deleted. Returns + * the ID if not yet deleted + */ + std::string getDeletedRepr() const override; + + /*! + * \brief Is this checkpoint a snapshot (contains ALL simulator state) + */ + bool isSnapshot() const noexcept; + + /*! + * \brief Determines how many checkpoints away the closest, earlier + * snapshot is. + * \return distance to closest snapshot. If this node is a snapshot, + * returns 0; if immediate getPrev() is a snapshot, returns 1; and + * so on. + * + * \note This is a noexcept function, which means that the exception if + * no snapshot is encountered is uncatchable. This is intentional. + */ + uint32_t getDistanceToPrevSnapshot() const noexcept; + + /*! + * \brief Loads delta state of this checkpoint to root. + * Does not look at any other checkpoints. + * \see load + */ + void loadState(const std::vector& dats); + + /*! + * \brief Create a deep copy of this checkpoint. + */ + std::unique_ptr clone() const; + + private: + + /*! + * \brief Writes checkpoint data starting from current root to + * checkpoint storage + * \pre Must not have already stored data for this checkpoint + * This should only be called at construction + */ + void storeSnapshot_(const std::vector& dats); + + /*! + * \brief Writes checkpoint data starting from current root to + * checkpoint storage + * \pre Must not have already stored data for this checkpoint + * This should only be called at construction + */ + void storeDelta_(const std::vector& dats); + + /*! + * \brief ID of the previous checkpoint. + */ + chkpt_id_t prev_id_; + + /*!
+ * \brief ID of the checkpoint before it was deleted. This is invalid + * until deletion. Prevents misuse of checkpoint ID or any confusion + * about whether it is deleted or not. + */ + chkpt_id_t deleted_id_; + + //! \brief Is this node a snapshot? + bool is_snapshot_; + + //! \brief Storage implementation + storage::VectorStorage data_; + + //! \brief Checkpointer who created us + DatabaseCheckpointer* checkpointer_ = nullptr; + }; + +} // namespace sparta::serialization::checkpoint diff --git a/sparta/sparta/serialization/checkpoint/DatabaseCheckpointer.hpp b/sparta/sparta/serialization/checkpoint/DatabaseCheckpointer.hpp new file mode 100644 index 0000000000..5c8388f82a --- /dev/null +++ b/sparta/sparta/serialization/checkpoint/DatabaseCheckpointer.hpp @@ -0,0 +1,523 @@ +// -*- C++ -*- + +#pragma once + +#include "sparta/serialization/checkpoint/Checkpointer.hpp" +#include "sparta/serialization/checkpoint/DatabaseCheckpoint.hpp" +#include "simdb/apps/AppRegistration.hpp" +#include "simdb/apps/App.hpp" +#include "simdb/utils/ConcurrentQueue.hpp" +#include + +//! Default threshold for creating snapshots +#ifndef DEFAULT_SNAPSHOT_THRESH +#define DEFAULT_SNAPSHOT_THRESH 20 +#endif + +namespace sparta::serialization::checkpoint +{ + +/*! + * \brief Implementation of the FastCheckpointer which only holds + * a "window" of checkpoints in memory at any given time, and sends + * checkpoints outside this window to/from SimDB. + */ +class DatabaseCheckpointer : public simdb::App, public Checkpointer +{ +public: + static constexpr auto NAME = "db-checkpointer"; + + using checkpoint_type = DatabaseCheckpoint; + + /*! + * \brief FastCheckpointer Constructor + * + * \param db_mgr SimDB instance to use as a backing store for all checkpoints. + * + * \param root TreeNode at which checkpoints will be taken. + * This cannot be changed later. This does not + * necessarily need to be a RootTreeNode. 
Before + * the first checkpoint is taken, this node must + * be finalized (see + * sparta::TreeNode::isFinalized). At this point, + * the node does not need to be finalized + * + * \param sched Scheduler to read and restart on checkpoint restore (if + * not nullptr) + */ + DatabaseCheckpointer(simdb::DatabaseManager* db_mgr, TreeNode& root, Scheduler* sched=nullptr) : + Checkpointer(root, sched), + db_mgr_(db_mgr), + snap_thresh_(DEFAULT_SNAPSHOT_THRESH), + next_chkpt_id_(checkpoint_type::MIN_CHECKPOINT), + num_alive_checkpoints_(0), + num_alive_snapshots_(0), + num_dead_checkpoints_(0) + { } + + /*! + * \brief Define the SimDB schema for this checkpointer. + */ + static void defineSchema(simdb::Schema& schema); + + /*! + * \brief Instantiate the async processing pipeline to save/load checkpoints. + */ + std::unique_ptr createPipeline( + simdb::pipeline::AsyncDatabaseAccessor* db_accessor) override; + + /*! + * \brief Returns the next-shapshot threshold. + * + * This represents the distance between two checkpoints required for the + * checkpointer to automatically place a snapshot checkpoint instead of + * a delta. A threshold of 0 or 1 results in all checkpoints being + * snapshots. A value of 10 results in every 10th checkpoint being a + * snapshot. Explicit snapshot creation using createCheckpoint can interrupt + * and restart this pattern. + * + * This value is a performance/space tradeoff knob. + */ + uint32_t getSnapshotThreshold() const noexcept; + + /*! + * \brief Sets the snapshot threshold + * \see getSnapshotThreshold + */ + void setSnapshotThreshold(uint32_t thresh) noexcept; + + /*! + * \brief Computes and returns the memory usage by this checkpointer at + * this moment including any framework overhead + * \note This is an approxiation and does not include some of + * minimal dynamic overhead from stl containers. + */ + uint64_t getTotalMemoryUse() const noexcept override; + + /*! 
+ * \brief Computes and returns the memory usage by this checkpointer at + * this moment purely for the checkpoint state being held + */ + uint64_t getContentMemoryUse() const noexcept override; + + /*! + * \brief Deletes a checkpoint by ID. + * \param id ID of checkpoint to delete. Must not be + * Checkpoint::UNIDENTIFIED_CHECKPOINT and must not be equal to the + * ID of the head checkpoint. + * \throw CheckpointError if this manager has no checkpoint with given + * id. Test with hasCheckpoint first. If id == + * Checkpoint::UNIDENTIFIED_CHECKPOINT, always throws. + * Throws if id == getHeadID(). Head cannot be deleted + * + * Internally, this deletion may be effective-only and actual data may + * still exist in an inaccessible form as part of the checkpoint + * tree implementation. + * + * If the current checkpoint is deleted, current will be updated back + * along the current checkpoint's previous checkpoint chain until a non + * deleted checkpoint is found. This will become the new current + * checkpoint + */ + void deleteCheckpoint(chkpt_id_t id) override; + + /*! + * \brief Loads state from a specific checkpoint by ID + * \note Does not delete checkpoints. Checkpoints must be explicitly + * deleted by deleteCheckpoint + * \throw CheckpointError if id does not refer to checkpoint that exists + * or if checkpoint could not be loaded. + * \warning If checkpoint fails during loading for reasons other than an + * invalid ID, the simulation state could be corrupt + * \post current checkpoint is now the checkpoint specified by id + * \post Sets scheduler current tick to the checkpoint's tick using + * Scheduler::restartAt + */ + void loadCheckpoint(chkpt_id_t id) override; + + /*! + * \brief Gets all checkpoints taken at tick t on any timeline. + * \param t Tick number at which checkpoints should be found. + * \return vector of valid checkpoint IDs (never + * checkpoint_type::UNIDENTIFIED_CHECKPOINT) + * \note Makes a new vector of results.
This should not be called in the + * critical path. + */ + std::vector getCheckpointsAt(tick_t t) const override; + + /*! + * \brief Gets all checkpoint IDs available on any timeline sorted by + * tick (or equivalently checkpoint ID). + * \return vector of valid checkpoint IDs (never + * checkpoint_type::UNIDENTIFIED_CHECKPOINT) + * \note Makes a new vector of results. This should not be called in the + * critical path. + */ + std::vector getCheckpoints() const override; + + /*! + * \brief Gets the current number of checkpoints having valid IDs + */ + uint32_t getNumCheckpoints() const noexcept override; + + /*! + * \brief Gets the current number of snapshots with valid IDs + */ + uint32_t getNumSnapshots() const noexcept; + + /*! + * \brief Gets the current number of delta checkpoints with valid IDs + */ + uint32_t getNumDeltas() const noexcept; + + /*! + * \brief Gets the current number of checkpoints (delta or snapshot) + * withOUT valid IDs. + */ + uint32_t getNumDeadCheckpoints() const noexcept; + + /*! + * \brief Debugging utility which gets a deque of checkpoints + * representing a chain starting at the checkpoint head and ending at + * the checkpoint specified by \a id. The results can contain + * Checkpoint::UNIDENTIFIED_CHECKPOINT to represent temporary + * deleted checkpoints in the chain. + * \param id ID of checkpoint that terminates the chain + * \return deque of checkpoint IDs where the front is always the head + * and the back is always the checkpoint described by \a id. If there is + * no checkpoint head, returns an empty result + * \throw CheckpointError if \a id does not refer to a valid + * checkpoint. + * \note Makes a new vector of results. This should not be called in the + * critical path. + */ + std::deque getCheckpointChain(chkpt_id_t id) const override; + + /*! + * \brief Finds the latest checkpoint at or before the given tick + * starting at the \a from checkpoint and working backward.
+ * If no checkpoints before or at tick are found, returns nullptr. + * \param tick Tick to search for + * \param from Checkpoint at which to begin searching for a tick. + * Must be a valid checkpoint known by this checkpointer. + * See hasCheckpoint. + * \return The latest checkpoint with a tick number less than or equal + * to the \a tick argument. Returns nullptr if no checkpoints before \a + * tick were found. It is possible that the checkpoint identified by \a + * from could be returned. + * \warning This is not a high-performance method. Generally, + * a client of this interface knows a particular ID. + * \throw CheckpointError if \a from does not refer to a valid + * checkpoint. + */ + std::optional findLatestCheckpointAtOrBefore(tick_t tick, chkpt_id_t from); + + /*! + * \brief Finds a checkpoint by its ID + * \param id ID of checkpoint to find. Guaranteed not to be flagged as + * deleted + * \return Checkpoint with ID of \a id if found or nullptr if not found + */ + std::optional findCheckpoint(chkpt_id_t id) noexcept; + + /*! + * \brief Tests whether this checkpoint manager has a checkpoint with + * the given id. + * \return True if id refers to a checkpoint held by this checkpointer + * and false if not. If id == Checkpoint::UNIDENTIFIED_CHECKPOINT, + * always returns false + */ + bool hasCheckpoint(chkpt_id_t id) const noexcept override; + + /*! + * \brief Dumps the restore chain for this checkpoint. + * \see getRestoreChain() + * \param o ostream to which chain data will be dumped + * \param id ID of starting checkpoint + */ + void dumpRestoreChain(std::ostream& o, chkpt_id_t id) const; + + /*! + * \brief Returns a stack of checkpoints from this checkpoint as far + * back as possible until no previous link is found. This is a superset + * of getRestoreChain and contains checkpoints that do not actually need + * to be inspected for restoring this checkpoint's data. This may reach + * the head checkpoint if no gaps are encountered.
+ */ + std::stack getHistoryChain(chkpt_id_t id) const; + + /*! + * \brief Returns a stack of checkpoints that must be restored from + * top-to-bottom to fully restore the state associated with this + * checkpoint. + */ + std::stack getRestoreChain(chkpt_id_t id) const; + + /*! + * \brief Returns next checkpoint following *this. May be an empty + * vector if there are no later checkpoints. + */ + std::vector getNextIDs(chkpt_id_t id) const; + + /*! + * \brief Attempts to restore this checkpoint including any previous + * deltas (dependencies). + * + * Uses loadState to restore state from each checkpoint in the + * restore chain. + */ + void load(const std::vector& dats, chkpt_id_t id); + + /*! + * \brief TODO cnyce + */ + uint32_t getDistanceToPrevSnapshot(chkpt_id_t id) const noexcept; + + /*! + * \brief TODO cnyce + */ + bool canDelete(chkpt_id_t id) const noexcept; + + /*! + * \brief Returns a string describing this object + */ + std::string stringize() const override; + + /*! + * \brief Dumps this checkpointer's flat list of checkpoints to an + * ostream with a newline following each checkpoint + * \param o ostream to dump to + */ + void dumpList(std::ostream& o) const override; + + /*! + * \brief Dumps this checkpointer's data to an ostream with a newline + * following each checkpoint + * \param o ostream to dump to + */ + void dumpData(std::ostream& o) const override; + + /*! + * \brief Dumps this checkpointer's data to an + * ostream with annotations between each ArchData and a newline + * following each checkpoint description and each checkpoint data dump + * \param o ostream to dump to + */ + void dumpAnnotatedData(std::ostream& o) const override; + + /*! + * \brief Debugging utility which dumps values in some bytes across a + * chain of checkpoints. 
The intent is to show the values loaded when + * attempting to restore needed to restore the given value in the + * selected checkpoint + * \param o ostream with each value and checkpoint ID will be printed + * \param id ID of checkpoint to "restore" value from + * \param container ArchData in which the data being traced lives + * \param offset Offset into \a container + * \param size Bytes to read at \a offset + * \warning This may change checkpoint data read/write state and should + * only be done between completed checkpoints saves/restores in order to + * not interfere. + */ + void traceValue(std::ostream& o, chkpt_id_t id, const ArchData* container, uint32_t offset, uint32_t size) override; + +private: + + /*! + * \brief Create a head node. + * \pre ArchDatas for tree root are already enumerated + * \pre Tree of getRoot() is already finalized + * \pre Guaranteed to have a null head at this time + * (getHead() == nullptr) + * \post Must create a head checkpoint + * \post Must invoke setHead_ + * \note invoked by createHead + */ + void createHead_() override; + + /*! + * \brief Create a checkpoint + * \pre Guaranteed to have a valid head at this time + * (getHead() != nullptr) + * \post Must create a checkpoint + * \return Must return a checkpoint ID not currently in use + * \note invoked by createHead + */ + chkpt_id_t createCheckpoint_(bool force_snapshot=false) override; + + /*! + * \brief Delete given checkpoint and all contiguous previous + * checkpoints which can be deleted (See checkpoint_type::canDelete). + * This is the only place where checkpoint objects are actually freed + * (aside from destruction) and it ensures that they will not disrupt + * the checkpoint delta chains. All other deletion is simply flagging + * and re-identifying checkpoints + * \param d Checkpoint to attempt to delete first. Function will then + * move through each previous checkpoint until reaching head. 
+ * \post Head checkpoint will never be deleted by this function + * \note Never flags any new checkpoints as deleted + */ + void cleanupChain_(chkpt_id_t id); + + /*! + * \brief Look forward to see if any future checkpoints depend on \a id. + * \param id ID of the checkpoint to inspect and recursively search + * \return true if the current checkpoint or any live checkpoints + * are hit in the search. Search terminates on each branch when a + * snapshot or the end of the branch is reached. The branch to inspect + * (\a id) will not be checked itself since the point is to determine + * which branches down-chain depend on it. + */ + bool recursForwardFindAlive_(chkpt_id_t id) const; + + /*! + * \brief Attempts to find a checkpoint within this checkpointer by ID. + * \param id Checkpoint ID to search for + * \return Pointer to found checkpoint with matching ID. If not found, + * returns nullptr. + * \todo Faster lookup? + */ + std::optional findCheckpoint_(chkpt_id_t id) const noexcept; + + /*! + * \brief Implements Checkpointer::dumpCheckpointNode_ + */ + void dumpCheckpointNode_(const chkpt_id_t id, std::ostream& o) const override; + + /*! + * \brief Returns IDs of the checkpoints immediately following the given checkpoint. + */ + std::vector getNextIDs_(chkpt_id_t id) const override; + + /*! + * \brief Intercept calls to Checkpointer::setHead_() and ensure we do not delete it. + */ + void setHead_(CheckpointBase* head) override; + + /*! + * \brief Intercept calls to Checkpointer::setCurrent_() and ensure we do not delete it. + * Also take this time to "unbless" the previous current node. + */ + void setCurrent_(CheckpointBase* current) override; + + /*! + * \brief Set ID of head checkpoint. + */ + void setHeadID_(chkpt_id_t id); + + /*! + * \brief Set ID of current checkpoint. + */ + void setCurrentID_(chkpt_id_t id); + + /*! + * \brief Add the given checkpoint to the cache and start processing it. + */ + void addToCache_(std::unique_ptr chkpt); + + /*!
+ * \brief Clone the next checkpoint that is ready for processing. + */ + bool cloneNextPipelineHeadCheckpoint_(std::unique_ptr& next); + + //! \brief Checkpointer head ID. Used to prevent the head from being deleted from the cache. + chkpt_id_t head_id_ = checkpoint_type::UNIDENTIFIED_CHECKPOINT; + + //! \brief Checkpointer current ID. Used to prevent the current node from being deleted from the cache. + chkpt_id_t current_id_ = checkpoint_type::UNIDENTIFIED_CHECKPOINT; + + //! \brief Subset (or all of) our checkpoints that we currently are holding in memory. + std::unordered_map> chkpts_cache_; + + //! \brief Ordered running list of checkpoint IDs that come in via calls to createCheckpoint_(). + //! This is used in the pipeline to pick off and start processing checkpoints in the same order + //! in which they were received, while keeping the cache designed to use an unordered_map for + //! random access. + std::queue chkpt_ids_for_pipeline_head_; + + //! \brief Mutex to protect our checkpoints cache. + mutable std::mutex mutex_; + + //! \brief SimDB instance + simdb::DatabaseManager* db_mgr_ = nullptr; + + //! \brief Cloned checkpoints for pipeline. Original checkpoints held in cache. + //using checkpoint_clone = checkpoint_type::DetachedClone; + //simdb::ConcurrentQueue>* pipeline_head_ = nullptr; + + + /*! + * \brief Snapshot generation threshold. Every n checkpoints in a chain + * are taken as snapshots instead of deltas + */ + uint32_t snap_thresh_; + + /*! + * \brief Next checkpoint ID value + */ + chkpt_id_t next_chkpt_id_; + + /*! + * \brief Number of living checkpoints of either snapshot or delta type. + * (where checkpoint isFlaggedDeleted()=false) + */ + uint32_t num_alive_checkpoints_; + + /*! + * \brief Number of living snapshot checkpoints (where checkpoint + * isFlaggedDeleted()=false). Will be <= num_alive_checkpoints_ + * The number of delta checkpoints (not snapshots) can be computed as + * num_alive_checkpoints_ - num_alive_snapshots_. 
+ */ + uint32_t num_alive_snapshots_; + + /*! + * \brief Number of checkpoints which have been flagged as deleted but + * still exist in the checkpointer. + */ + uint32_t num_dead_checkpoints_; +}; + +} // namespace sparta::serialization::checkpoint + +namespace simdb +{ + +/*! + * \brief This AppFactory specialization is provided since we have an app that inherits + * from FastCheckpointer, and thus cannot have the default app subclass ctor signature + * that only takes the DatabaseManager like most other apps. + */ +template <> +class AppFactory : public AppFactoryBase +{ +public: + using AppT = sparta::serialization::checkpoint::DatabaseCheckpointer; + + void setSpartaElems(sparta::TreeNode& root, sparta::Scheduler* sched = nullptr) + { + root_ = &root; + sched_ = sched; + } + + AppT* createApp(DatabaseManager* db_mgr) override + { + if (!root_) { + throw sparta::SpartaException("Must set root (and maybe scheduler) before instantiating apps!"); + } + + // Make the ctor call that the default AppFactory cannot make. + return new AppT(db_mgr, *root_, sched_); + } + + void defineSchema(Schema& schema) const override + { + AppT::defineSchema(schema); + } + +private: + sparta::TreeNode* root_ = nullptr; + sparta::Scheduler* sched_ = nullptr; +}; + +} // namespace simdb diff --git a/sparta/sparta/serialization/checkpoint/DeltaCheckpoint.hpp b/sparta/sparta/serialization/checkpoint/DeltaCheckpoint.hpp index 6188064a40..59c2bae289 100644 --- a/sparta/sparta/serialization/checkpoint/DeltaCheckpoint.hpp +++ b/sparta/sparta/serialization/checkpoint/DeltaCheckpoint.hpp @@ -14,339 +14,11 @@ #include "sparta/serialization/checkpoint/Checkpointer.hpp" #include "sparta/serialization/checkpoint/CheckpointExceptions.hpp" - +#include "sparta/serialization/checkpoint/VectorStorage.hpp" +#include "sparta/serialization/checkpoint/StringStreamStorage.hpp" namespace sparta::serialization::checkpoint { - namespace storage - { - /*! 
- * \brief Vector of buffers storage implementation - */ - class VectorStorage - { - class Segment{ - ArchData::line_idx_type idx_; - std::unique_ptr data_; - uint32_t bytes_; - public: - - /*! - * \brief Copying disabled (avoid memcpy) - */ - Segment(const Segment&) = delete; - - /*! - * \brief Move constructor - */ - Segment(Segment&& rhp) : - idx_(rhp.idx_), - data_(std::move(rhp.data_)), - bytes_(rhp.bytes_) - { - rhp.idx_ = ArchData::INVALID_LINE_IDX; - rhp.bytes_ = 0; - } - - /*! - * \brief Dummy constructor. Represents null entry (end of ArchData) - */ - Segment() : - idx_(ArchData::INVALID_LINE_IDX), - bytes_(0) - {;} - - /*! - * \brief Deleted assignment operator - */ - Segment& operator=(const Segment& rhp) = delete; - - /*! - * \brief Data constructor. Allocates data and copies results over - */ - Segment(ArchData::line_idx_type idx, const char* data, size_t bytes) : - idx_(idx), bytes_(bytes) - { - sparta_assert(idx != ArchData::INVALID_LINE_IDX, - "Attempted to create segment of " << bytes << " bytes with invalid line index"); - data_.reset(new char[bytes]); - ::memcpy(data_.get(), data, bytes); - } - - ArchData::line_idx_type getLineIdx() const { - return idx_; - } - - uint32_t getSize() const { - return sizeof(decltype(*this)) + bytes_; - } - - void copyTo(char* buf, uint32_t size) const { - sparta_assert(size == bytes_, \ - "Attempted to restore checkpoint data for a line where the " - "data was " << bytes_ << " bytes but the loader requested " - << size << " bytes. 
The sizes must match up or something is " - "wrong"); - memcpy(buf, data_.get(), bytes_); - } - - void dump(std::ostream& o) const { - if(idx_ == ArchData::INVALID_LINE_IDX){ - std::cout << "\nEnd of ArchData"; - return; - } - - std::cout << "\nLine: " << std::dec << idx_ << " (" << bytes_ << ") bytes"; - for(uint32_t off = 0; off < bytes_;){ - char chr = data_[off]; - if(off % 32 == 0){ - o << std::endl << std::setw(7) << std::hex << off; - } - if(chr == 0){ - o << ' ' << ".."; - }else{ - o << ' ' << std::setfill('0') << std::setw(2) << std::hex << (0xff & (uint16_t)chr); - } - off++; - } - } - }; - - /*! - * \brief Data segments to restore - */ - std::vector data_; - - /*! - * \brief Next line index to store when writing lines - */ - ArchData::line_idx_type next_idx_ = ArchData::INVALID_LINE_IDX; - - /*! - * \brief Index in data_ of next line to restore in nextRestoreLine - */ - uint32_t next_restore_idx_ = 0; - - /*! - * \brief iterator in data_ of line being read by call to readLineData. - * Is always next_restore_idx_ or one less. - */ - decltype(data_)::const_iterator cur_restore_itr_; - - public: - VectorStorage() { - } - - ~VectorStorage() { - } - - void dump(std::ostream& o) const { - for(auto const &seg : data_){ - seg.dump(o); - } - } - - uint32_t getSize() const { - uint32_t bytes = sizeof(decltype(*this)); - for(Segment const & seg : data_){ - bytes += seg.getSize(); - } - return bytes; - } - - void prepareForLoad() { - next_restore_idx_ = 0; - cur_restore_itr_ = data_.begin(); - } - - void beginLine(ArchData::line_idx_type idx) { - sparta_assert(idx != ArchData::INVALID_LINE_IDX, - "Cannot begin line with INVALID_LINE_IDX index"); - next_idx_ = idx; - } - - void writeLineBytes(const char* data, size_t size) { - sparta_assert(data_.size() == 0 || data_.back().getLineIdx() != next_idx_, - "Cannot store the same line idx twice in a checkpoint. 
Line " - << next_idx_ << " detected twice in a row"); - sparta_assert(next_idx_ != ArchData::INVALID_LINE_IDX, - "Cannot write line bytes with INVALID_LINE_IDX index"); - data_.emplace_back(next_idx_, data, size); - } - - /*! - * \brief Signals end of this checkpoint's data for one ArchData - */ - void endArchData() { - data_.emplace_back(); - } - - /*! - * \brief Is the reading state of this storage good? (i.e. haven't tried - * to read past the end of the data) - */ - bool good() const { - return next_restore_idx_ <= data_.size(); // Not past end of stream - } - - /*! - * \brief Restore next line. Return ArchData::INVALID_LINE_IDX on - * end of data. - */ - ArchData::line_idx_type getNextRestoreLine() { - if(next_restore_idx_ == data_.size()){ - next_restore_idx_++; // Increment to detect errors - return ArchData::INVALID_LINE_IDX; // Done with restore - }else if(next_restore_idx_ > data_.size()){ // Past the end - throw SpartaException("Failed to restore a checkpoint because ") - << "caller tried to keep getting next line even after " - "reaching the end of the restore data"; - } - if(next_restore_idx_ != 0){ - cur_restore_itr_++; - } - next_restore_idx_++; - - const auto next_line_idx = cur_restore_itr_->getLineIdx(); // May be invalid to indicate end of ArchData - return next_line_idx; - }; - - /*! - * \brief Read bytes for the current line - */ - void copyLineBytes(char* buf, uint32_t size) { - sparta_assert(cur_restore_itr_ != data_.end(), - "Attempted to copy line bytes from an invalid line iterator"); - sparta_assert(cur_restore_itr_->getLineIdx() != ArchData::INVALID_LINE_IDX, - "About to return line from checkpoint data segment with INVALID_LINE_IDX index"); - cur_restore_itr_->copyTo(buf, size); - } - - /*! - * \brief Steal line buffer. 
Useful if the checkpoint is being reloaded - * AND simultaneouslty destroyed - * \todo implement this - */ - //void stealLineBytes(char*& buf_ptr, uint32_t size) { - // cur_restore_itr_->stealBuffer(buf_ptr, size); - //} - }; - - /*! - * \brief Stringstream storage implementation - * \warning This is deprecated in favor of VectorStorage for in-memory uses. - * However, this is a starting point for disk-based storage schemes - */ - class StringStreamStorage - { - std::stringstream ss_; - - public: - StringStreamStorage() { - ss_.exceptions(std::ostream::eofbit | std::ostream::badbit | - std::ostream::failbit | std::ostream::goodbit); - } - - void dump(std::ostream& o) const { - auto s = ss_.str(); - auto itr = s.begin(); - for(; itr != s.end(); itr++){ - char chr = *itr; - if(chr == 'L'){ - uint32_t off = 0; - ArchData::line_idx_type ln_idx; - strncpy((char*)&ln_idx, s.substr(itr-s.begin(), sizeof(ln_idx)).c_str(), sizeof(ln_idx)); - std::cout << "\nLine: " << ln_idx << std::endl; - itr += sizeof(ArchData::line_idx_type); - - for(uint16_t i=0; i<64; ++i){ - chr = *itr; - if(off % 32 == 0){ - o << std::setw(7) << std::hex << off; - } - if(chr == 0){ - o << ' ' << ".."; - }else{ - o << ' ' << std::setfill('0') << std::setw(2) << std::hex << (0xff & (uint16_t)chr); - } - off++; - if(off % 32 == 0){ - o << std::endl; - } - ++itr; - } - } - } - } - - uint32_t getSize() const { - return ss_.str().size() + sizeof(decltype(*this)); - } - - void prepareForLoad() { - ss_.seekg(0); // Seek to start with get pointer before consuming - } - - void beginLine(ArchData::line_idx_type idx) { - ss_ << 'L'; // Line start char - - ArchData::line_idx_type idx_repr = reorder(idx); - ss_.write((char*)&idx_repr, sizeof(ArchData::line_idx_type)); - } - - void writeLineBytes(const char* data, size_t size) { - ss_.write(data, size); - } - - /*! 
- * \brief Signals end of this checkpoint's data - */ - void endArchData() { - ss_ << "E"; // Indicates end of this checkpoint data - - sparta_assert(ss_.good(), - "Ostream error while writing checkpoint data"); - } - - /*! - * \brief Is the reading state of this storage good? (i.e. haven't tried - * to read past the end of the data) - */ - bool good() const { - return ss_.good(); - } - - /*! - * \brief Restore next line. Return ArchData::INVALID_LINE_IDX on - * end of data. - */ - ArchData::line_idx_type getNextRestoreLine() { - char ctrl; - ss_ >> ctrl; - sparta_assert(ss_.good(), - "Encountered checkpoint data stream error or eof"); - if(ctrl == 'L'){ - ArchData::line_idx_type ln_idx = 0; - ss_.read((char*)&ln_idx, sizeof(ln_idx)); // Presumed LE encoding - return ln_idx; - }else if(ctrl == 'E'){ - return ArchData::INVALID_LINE_IDX; // Done with restore - }else{ - throw SpartaException("Failed to restore a checkpoint because a '") - << ctrl << "' control character was found where an 'L' or 'E' was found"; - } - }; - - /*! - * \brief Read bytes for the current line - */ - void copyLineBytes(char* buf, uint32_t size) { - ss_.read(buf, size); - } - }; - - } // namespace storage - class FastCheckpointer; /*! 
diff --git a/sparta/sparta/serialization/checkpoint/FastCheckpointer.hpp b/sparta/sparta/serialization/checkpoint/FastCheckpointer.hpp index fc07bf9193..3931fa81a0 100644 --- a/sparta/sparta/serialization/checkpoint/FastCheckpointer.hpp +++ b/sparta/sparta/serialization/checkpoint/FastCheckpointer.hpp @@ -565,6 +565,7 @@ namespace sparta::serialization::checkpoint // Erase element in the map auto itr = chkpts_.find(id); sparta_assert(itr != chkpts_.end()); + itr->second->disconnect(); chkpts_.erase(itr); } diff --git a/sparta/sparta/serialization/checkpoint/StringStreamStorage.hpp b/sparta/sparta/serialization/checkpoint/StringStreamStorage.hpp new file mode 100644 index 0000000000..8b76f9992c --- /dev/null +++ b/sparta/sparta/serialization/checkpoint/StringStreamStorage.hpp @@ -0,0 +1,124 @@ +// -*- C++ -*- + +#pragma once + +#include "sparta/functional/ArchData.hpp" +#include "sparta/utils/SpartaException.hpp" + +namespace sparta::serialization::checkpoint::storage +{ + +/*! + * \brief Stringstream storage implementation + * \warning This is deprecated in favor of VectorStorage for in-memory uses. 
+ * However, this is a starting point for disk-based storage schemes + */ +class StringStreamStorage +{ + std::stringstream ss_; + +public: + StringStreamStorage() { + ss_.exceptions(std::ostream::eofbit | std::ostream::badbit | + std::ostream::failbit | std::ostream::goodbit); + } + + void dump(std::ostream& o) const { + auto s = ss_.str(); + auto itr = s.begin(); + for(; itr != s.end(); itr++){ + char chr = *itr; + if(chr == 'L'){ + uint32_t off = 0; + ArchData::line_idx_type ln_idx; + strncpy((char*)&ln_idx, s.substr(itr-s.begin(), sizeof(ln_idx)).c_str(), sizeof(ln_idx)); + std::cout << "\nLine: " << ln_idx << std::endl; + itr += sizeof(ArchData::line_idx_type); + + for(uint16_t i=0; i<64; ++i){ + chr = *itr; + if(off % 32 == 0){ + o << std::setw(7) << std::hex << off; + } + if(chr == 0){ + o << ' ' << ".."; + }else{ + o << ' ' << std::setfill('0') << std::setw(2) << std::hex << (0xff & (uint16_t)chr); + } + off++; + if(off % 32 == 0){ + o << std::endl; + } + ++itr; + } + } + } + } + + uint32_t getSize() const { + return ss_.str().size() + sizeof(decltype(*this)); + } + + void prepareForLoad() { + ss_.seekg(0); // Seek to start with get pointer before consuming + } + + void beginLine(ArchData::line_idx_type idx) { + ss_ << 'L'; // Line start char + + ArchData::line_idx_type idx_repr = reorder(idx); + ss_.write((char*)&idx_repr, sizeof(ArchData::line_idx_type)); + } + + void writeLineBytes(const char* data, size_t size) { + ss_.write(data, size); + } + + /*! + * \brief Signals end of this checkpoint's data + */ + void endArchData() { + ss_ << "E"; // Indicates end of this checkpoint data + + sparta_assert(ss_.good(), + "Ostream error while writing checkpoint data"); + } + + /*! + * \brief Is the reading state of this storage good? (i.e. haven't tried + * to read past the end of the data) + */ + bool good() const { + return ss_.good(); + } + + /*! + * \brief Restore next line. Return ArchData::INVALID_LINE_IDX on + * end of data. 
+ */ + ArchData::line_idx_type getNextRestoreLine() { + char ctrl; + ss_ >> ctrl; + sparta_assert(ss_.good(), + "Encountered checkpoint data stream error or eof"); + if(ctrl == 'L'){ + ArchData::line_idx_type ln_idx = 0; + ss_.read((char*)&ln_idx, sizeof(ln_idx)); // Presumed LE encoding + return ln_idx; + }else if(ctrl == 'E'){ + return ArchData::INVALID_LINE_IDX; // Done with restore + }else{ + throw SpartaException("Failed to restore a checkpoint because a '") + << ctrl << "' control character was found where an 'L' or 'E' was found"; + } + }; + + /*! + * \brief Read bytes for the current line + */ + void copyLineBytes(char* buf, uint32_t size) { + ss_.read(buf, size); + } +}; + +} // namespace sparta::serialization::checkpoint::storage diff --git a/sparta/sparta/serialization/checkpoint/VectorStorage.hpp b/sparta/sparta/serialization/checkpoint/VectorStorage.hpp new file mode 100644 index 0000000000..94ebab97f2 --- /dev/null +++ b/sparta/sparta/serialization/checkpoint/VectorStorage.hpp @@ -0,0 +1,229 @@ +// -*- C++ -*- + +#pragma once + +#include "sparta/functional/ArchData.hpp" +#include "sparta/utils/SpartaException.hpp" + +namespace sparta::serialization::checkpoint::storage +{ + +/*! + * \brief Vector of buffers storage implementation + */ +class VectorStorage +{ + class Segment{ + ArchData::line_idx_type idx_; + std::vector data_; + uint32_t bytes_; + public: + + /*! + * \brief Copy constructor + */ + Segment(const Segment&) = default; + + /*! + * \brief Move constructor + */ + Segment(Segment&& rhp) : + idx_(rhp.idx_), + data_(std::move(rhp.data_)), + bytes_(rhp.bytes_) + { + rhp.idx_ = ArchData::INVALID_LINE_IDX; + rhp.bytes_ = 0; + } + + /*! + * \brief Dummy constructor. Represents null entry (end of ArchData) + */ + Segment() : + idx_(ArchData::INVALID_LINE_IDX), + bytes_(0) + {;} + + /*! + * \brief Deleted assignment operator + */ + Segment& operator=(const Segment& rhp) = delete; + + /*! + * \brief Data constructor. 
Allocates data and copies results over + */ + Segment(ArchData::line_idx_type idx, const char* data, size_t bytes) : + idx_(idx), bytes_(bytes) + { + sparta_assert(idx != ArchData::INVALID_LINE_IDX, + "Attempted to create segment of " << bytes << " bytes with invalid line index"); + data_.resize(bytes); + ::memcpy(data_.data(), data, bytes); + } + + template + void serialize(Archive& ar, const unsigned int /*version*/) { + ar & idx_; + ar & data_; + ar & bytes_; + } + + ArchData::line_idx_type getLineIdx() const { + return idx_; + } + + uint32_t getSize() const { + return sizeof(decltype(*this)) + bytes_; + } + + void copyTo(char* buf, uint32_t size) const { + sparta_assert(size == bytes_, \ + "Attempted to restore checkpoint data for a line where the " + "data was " << bytes_ << " bytes but the loader requested " + << size << " bytes. The sizes must match up or something is " + "wrong"); + memcpy(buf, data_.data(), bytes_); + } + + void dump(std::ostream& o) const { + if(idx_ == ArchData::INVALID_LINE_IDX){ + std::cout << "\nEnd of ArchData"; + return; + } + + std::cout << "\nLine: " << std::dec << idx_ << " (" << bytes_ << ") bytes"; + for(uint32_t off = 0; off < bytes_;){ + char chr = data_[off]; + if(off % 32 == 0){ + o << std::endl << std::setw(7) << std::hex << off; + } + if(chr == 0){ + o << ' ' << ".."; + }else{ + o << ' ' << std::setfill('0') << std::setw(2) << std::hex << (0xff & (uint16_t)chr); + } + off++; + } + } + }; + + /*! + * \brief Data segments to restore + */ + std::vector data_; + + /*! + * \brief Next line index to store when writing lines + */ + ArchData::line_idx_type next_idx_ = ArchData::INVALID_LINE_IDX; + + /*! + * \brief Index in data_ of next line to restore in nextRestoreLine + */ + uint32_t next_restore_idx_ = 0; + + /*! + * \brief iterator in data_ of line being read by call to readLineData. + * Is always next_restore_idx_ or one less. 
+ */ + decltype(data_)::const_iterator cur_restore_itr_; + +public: + VectorStorage() { + } + + ~VectorStorage() { + } + + VectorStorage(const VectorStorage&) = default; + + template + void serialize(Archive& ar, const unsigned int /*version*/) { + ar & data_; + } + + void dump(std::ostream& o) const { + for(auto const &seg : data_){ + seg.dump(o); + } + } + + uint32_t getSize() const { + uint32_t bytes = sizeof(decltype(*this)); + for(Segment const & seg : data_){ + bytes += seg.getSize(); + } + return bytes; + } + + void prepareForLoad() { + next_restore_idx_ = 0; + cur_restore_itr_ = data_.begin(); + } + + void beginLine(ArchData::line_idx_type idx) { + sparta_assert(idx != ArchData::INVALID_LINE_IDX, + "Cannot begin line with INVALID_LINE_IDX index"); + next_idx_ = idx; + } + + void writeLineBytes(const char* data, size_t size) { + sparta_assert(data_.size() == 0 || data_.back().getLineIdx() != next_idx_, + "Cannot store the same line idx twice in a checkpoint. Line " + << next_idx_ << " detected twice in a row"); + sparta_assert(next_idx_ != ArchData::INVALID_LINE_IDX, + "Cannot write line bytes with INVALID_LINE_IDX index"); + data_.emplace_back(next_idx_, data, size); + } + + /*! + * \brief Signals end of this checkpoint's data for one ArchData + */ + void endArchData() { + data_.emplace_back(); + } + + /*! + * \brief Is the reading state of this storage good? (i.e. haven't tried + * to read past the end of the data) + */ + bool good() const { + return next_restore_idx_ <= data_.size(); // Not past end of stream + } + + /*! + * \brief Restore next line. Return ArchData::INVALID_LINE_IDX on + * end of data. 
+ */ + ArchData::line_idx_type getNextRestoreLine() { + if(next_restore_idx_ == data_.size()){ + next_restore_idx_++; // Increment to detect errors + return ArchData::INVALID_LINE_IDX; // Done with restore + }else if(next_restore_idx_ > data_.size()){ // Past the end + throw SpartaException("Failed to restore a checkpoint because ") + << "caller tried to keep getting next line even after " + "reaching the end of the restore data"; + } + if(next_restore_idx_ != 0){ + cur_restore_itr_++; + } + next_restore_idx_++; + + const auto next_line_idx = cur_restore_itr_->getLineIdx(); // May be invalid to indicate end of ArchData + return next_line_idx; + }; + + /*! + * \brief Read bytes for the current line + */ + void copyLineBytes(char* buf, uint32_t size) { + sparta_assert(cur_restore_itr_ != data_.end(), + "Attempted to copy line bytes from an invalid line iterator"); + sparta_assert(cur_restore_itr_->getLineIdx() != ArchData::INVALID_LINE_IDX, + "About to return line from checkpoint data segment with INVALID_LINE_IDX index"); + cur_restore_itr_->copyTo(buf, size); + } + +}; + +} // namespace sparta::serialization::checkpoint::storage diff --git a/sparta/src/DatabaseCheckpoint.cpp b/sparta/src/DatabaseCheckpoint.cpp new file mode 100644 index 0000000000..0d41d63298 --- /dev/null +++ b/sparta/src/DatabaseCheckpoint.cpp @@ -0,0 +1,225 @@ +// -*- C++ -*- + +#include "sparta/serialization/checkpoint/DatabaseCheckpoint.hpp" +#include "sparta/serialization/checkpoint/DatabaseCheckpointer.hpp" + +namespace sparta::serialization::checkpoint +{ + +using tick_t = typename CheckpointBase::tick_t; +using chkpt_id_t = typename CheckpointBase::chkpt_id_t; +using checkpoint_type = DatabaseCheckpoint; +using checkpoint_uptr = std::unique_ptr; + +DatabaseCheckpoint::DatabaseCheckpoint(TreeNode& root, + const std::vector& dats, + chkpt_id_t id, + tick_t tick, + chkpt_id_t prev_id, + bool is_snapshot, + DatabaseCheckpointer* checkpointer) + : CheckpointBase(id, tick) + , prev_id_(prev_id) 
+ , deleted_id_(UNIDENTIFIED_CHECKPOINT) + , is_snapshot_(is_snapshot) + , checkpointer_(checkpointer) +{ + (void)root; + if (prev_id == UNIDENTIFIED_CHECKPOINT) { + if (is_snapshot == false) { + throw CheckpointError("Cannot create a DatabaseCheckpoint id=") + << id << " at tick=" << tick << " which has no prev_delta and is not a snapshot"; + } + } + + // Store the checkpoint from root + if (is_snapshot) { + storeSnapshot_(dats); + } else { + storeDelta_(dats); + } +} + +DatabaseCheckpoint::DatabaseCheckpoint(chkpt_id_t prev_id, + chkpt_id_t deleted_id, + bool is_snapshot, + const storage::VectorStorage& storage, + DatabaseCheckpointer* checkpointer) + : CheckpointBase(getID(), getTick()) + , prev_id_(prev_id) + , deleted_id_(deleted_id) + , is_snapshot_(is_snapshot) + , data_(storage) + , checkpointer_(checkpointer) +{ +} + +std::string DatabaseCheckpoint::stringize() const +{ + std::stringstream ss; + ss << "'; + return ss.str(); +} + +void DatabaseCheckpoint::dumpData(std::ostream& o) const +{ + data_.dump(o); +} + +void DatabaseCheckpoint::dumpRestoreChain(std::ostream& o) const +{ + checkpointer_->dumpRestoreChain(o, getID()); +} + +uint64_t DatabaseCheckpoint::getTotalMemoryUse() const noexcept +{ + return getContentMemoryUse() \ + + sizeof(decltype(*this)) \ + + (getNextIDs().size() * sizeof(typename std::remove_reference::type*)); +} + +uint64_t DatabaseCheckpoint::getContentMemoryUse() const noexcept +{ + return data_.getSize(); +} + +void DatabaseCheckpoint::traceValue( + std::ostream& o, + const std::vector& dats, + const ArchData* container, + uint32_t offset, + uint32_t size) +{ + // TODO cnyce + (void)o; + (void)dats; + (void)container; + (void)offset; + (void)size; +} + +std::stack DatabaseCheckpoint::getHistoryChain() const +{ + return checkpointer_->getHistoryChain(getID()); +} + +std::stack DatabaseCheckpoint::getRestoreChain() const +{ + return checkpointer_->getRestoreChain(getID()); +} + +chkpt_id_t DatabaseCheckpoint::getPrevID() const +{ + 
return prev_id_; +} + +std::vector DatabaseCheckpoint::getNextIDs() const +{ + return checkpointer_->getNextIDs(getID()); +} + +void DatabaseCheckpoint::load(const std::vector& dats) +{ + checkpointer_->load(dats, getID()); +} + +bool DatabaseCheckpoint::canDelete() const noexcept +{ + return checkpointer_->canDelete(getID()); +} + +void DatabaseCheckpoint::flagDeleted() +{ + sparta_assert(!isFlaggedDeleted(), + "Cannot delete a checkpoint when it is already deleted: " << this); + deleted_id_ = getID(); + setID_(UNIDENTIFIED_CHECKPOINT); +} + +bool DatabaseCheckpoint::isFlaggedDeleted() const noexcept +{ + return getID() == UNIDENTIFIED_CHECKPOINT; +} + +chkpt_id_t DatabaseCheckpoint::getDeletedID() const noexcept +{ + return deleted_id_; +} + +std::string DatabaseCheckpoint::getDeletedRepr() const +{ + std::stringstream ss; + if (isFlaggedDeleted()) { + ss << "*" << getDeletedID(); + } else { + ss << getID(); + } + return ss.str(); +} + +bool DatabaseCheckpoint::isSnapshot() const noexcept +{ + return is_snapshot_; +} + +uint32_t DatabaseCheckpoint::getDistanceToPrevSnapshot() const noexcept +{ + return checkpointer_->getDistanceToPrevSnapshot(getID()); +} + +void DatabaseCheckpoint::loadState(const std::vector& dats) +{ + data_.prepareForLoad(); + sparta_assert(data_.good(), + "Attempted to loadState from a DeltaCheckpoint with a bad data buffer"); + if(isSnapshot()){ + for(ArchData* ad : dats){ + ad->restoreAll(data_); + } + }else{ + for(ArchData* ad : dats){ + ad->restore(data_); + } + } +} + +std::unique_ptr DatabaseCheckpoint::clone() const +{ + auto clone = new DatabaseCheckpoint(prev_id_, deleted_id_, is_snapshot_, data_, checkpointer_); + return std::unique_ptr(clone); +} + +void DatabaseCheckpoint::storeSnapshot_(const std::vector& dats) +{ + sparta_assert(data_.good(), + "Attempted to storeSnapshot_ from a DatabaseCheckpoint with a bad data buffer"); + + // Cannot have stored already + for (ArchData* ad : dats) { + ad->saveAll(data_); + } +} + +void 
DatabaseCheckpoint::storeDelta_(const std::vector& dats) +{ + sparta_assert(data_.good(), + "Attempted to storeDelta_ from a DatabaseCheckpoint with a bad data buffer"); + + // Cannot have stored already + for (ArchData* ad : dats) { + ad->save(data_); + } +} + +} // namespace sparta::serialization::checkpoint diff --git a/sparta/src/DatabaseCheckpointer.cpp b/sparta/src/DatabaseCheckpointer.cpp new file mode 100644 index 0000000000..7e0507afdf --- /dev/null +++ b/sparta/src/DatabaseCheckpointer.cpp @@ -0,0 +1,536 @@ +// -*- C++ -*- + +#include "sparta/serialization/checkpoint/DatabaseCheckpointer.hpp" +#include "simdb/pipeline/Pipeline.hpp" +#include "simdb/pipeline/elements/Function.hpp" +#include "simdb/pipeline/elements/Buffer.hpp" +#include "simdb/utils/Compress.hpp" + +#include +#include +#include +#include +#include +#include + +namespace sparta::serialization::checkpoint +{ + +using tick_t = typename CheckpointBase::tick_t; +using chkpt_id_t = typename CheckpointBase::chkpt_id_t; +using checkpoint_type = DatabaseCheckpoint; +using checkpoint_uptr = std::unique_ptr; +using checkpoint_uptrs = std::vector; + +struct ChkptWindow { + std::vector chkpt_ids; + checkpoint_uptrs chkpts; + + // TODO cnyce: Try to avoid use of unique_ptr. Everything is already movable + // and has default constructors. 
+ template + void serialize(Archive& ar, const unsigned int /*version*/) { + ar & chkpt_ids; + + if (chkpts.empty()) { + // We are loading checkpoint window from disk + chkpts.reserve(chkpt_ids.size()); + for (size_t i = 0; i < chkpt_ids.size(); ++i) { + chkpts.emplace_back(new DatabaseCheckpoint); + ar & *chkpts.back(); + } + } + + else { + // We are saving a checkpoint window to disk + for (auto& chkpt : chkpts) { + ar & *chkpt; + } + } + } +}; + +struct ChkptWindowBytes { + std::vector chkpt_ids; + std::vector chkpt_bytes; +}; + +using EvictedChkptIDs = std::vector; + +void DatabaseCheckpointer::defineSchema(simdb::Schema& schema) +{ + using dt = simdb::SqlDataType; + + auto& window_bytes = schema.addTable("ChkptWindowBytes"); + window_bytes.addColumn("WindowBytes", dt::blob_t); + + auto& window_ids = schema.addTable("ChkptWindowIDs"); + window_ids.addColumn("ChkptWindowBytesID", dt::int32_t); + window_ids.addColumn("ChkptID", dt::int32_t); + window_ids.createIndexOn("ChkptID"); + window_ids.disableAutoIncPrimaryKey(); +} + +std::unique_ptr DatabaseCheckpointer::createPipeline( + simdb::pipeline::AsyncDatabaseAccessor* db_accessor) +{ + auto pipeline = std::make_unique(db_mgr_, NAME); + + // Task 1: Clone the next checkpoint from the cache to send down pipeline + auto feed_pipeline = simdb::pipeline::createTask>( + [this](simdb::ConcurrentQueue& out, bool /*simulation_terminating*/) mutable + { + checkpoint_uptr next_chkpt; + if (cloneNextPipelineHeadCheckpoint_(next_chkpt)) { + out.emplace(std::move(next_chkpt)); + return true; + } + return false; + } + ); + + // Task 2: Buffer snapshots and their deltas into checkpoint windows + const auto window_len = getSnapshotThreshold(); + const auto flush_partial = true; + auto create_window = simdb::pipeline::createTask>(window_len, flush_partial); + + // Task 3: Add the IDs of all checkpoints in this window + auto add_chkpt_ids = simdb::pipeline::createTask>( + [](checkpoint_uptrs&& chkpts, + simdb::ConcurrentQueue& 
windows, + bool /*simulation_terminating*/) + { + ChkptWindow window; + window.chkpts = std::move(chkpts); + for (auto& chkpt : window.chkpts) { + window.chkpt_ids.push_back(chkpt->getID()); + } + windows.emplace(std::move(window)); + } + ); + + // Task 4: Serialize a checkpoint window into a char buffer + auto window_to_bytes = simdb::pipeline::createTask>( + [](ChkptWindow&& window, + simdb::ConcurrentQueue& window_bytes, + bool /*simulation_terminating*/) + { + ChkptWindowBytes bytes; + boost::iostreams::back_insert_device> inserter(bytes.chkpt_bytes); + boost::iostreams::stream>> os(inserter); + boost::archive::binary_oarchive oa(os); + oa << window; + os.flush(); + + for (const auto& chkpt : window.chkpts) { + bytes.chkpt_ids.push_back(chkpt->getID()); + } + + window_bytes.emplace(std::move(bytes)); + } + ); + + // Task 5: Perform zlib compression on the checkpoint window bytes + auto zlib_bytes = simdb::pipeline::createTask>( + [](ChkptWindowBytes&& bytes_in, + simdb::ConcurrentQueue& bytes_out, + bool /*simulation_terminating*/) + { + ChkptWindowBytes compressed; + compressed.chkpt_ids = std::move(bytes_in.chkpt_ids); + simdb::compressData(bytes_in.chkpt_bytes, compressed.chkpt_bytes); + bytes_out.emplace(std::move(compressed)); + } + ); + + // Task 6: Write to the database + auto write_to_db = db_accessor->createAsyncWriter( + [](ChkptWindowBytes&& bytes_in, + simdb::ConcurrentQueue& evicted_ids, + simdb::pipeline::AppPreparedINSERTs* tables, + bool /*simulation_terminating*/) + { + auto bytes_inserter = tables->getPreparedINSERT("ChkptWindowBytes"); + bytes_inserter->setColumnValue(0, bytes_in.chkpt_bytes); + auto bytes_id = bytes_inserter->createRecord(); + + auto chkpt_ids_inserter = tables->getPreparedINSERT("ChkptWindowIDs"); + chkpt_ids_inserter->setColumnValue(0, bytes_id); + for (auto id : bytes_in.chkpt_ids) { + chkpt_ids_inserter->setColumnValue(1, id); + chkpt_ids_inserter->createRecord(); + } + + 
evicted_ids.emplace(std::move(bytes_in.chkpt_ids)); + } + ); + + // Task 7: Perform cache eviction after a window of checkpoints has been written to SimDB + auto evict_from_cache = simdb::pipeline::createTask>( + [this](EvictedChkptIDs&& evicted_ids, bool /*simulation_terminating*/) mutable + { + for (auto id : evicted_ids) { + sparta_assert(id != head_id_); + sparta_assert(id != current_id_); + + // TODO cnyce: We are allocating and deallocating a LOT of checkpoints. + // See if we can reuse a pool of them. Could also try to just add a pool + // to the VectorStorage::Segment class. + std::lock_guard lock(mutex_); + chkpts_cache_.erase(id); + } + } + ); + + *feed_pipeline >> *create_window >> *add_chkpt_ids >> *window_to_bytes >> *zlib_bytes >> *write_to_db >> *evict_from_cache; + + pipeline->createTaskGroup("CheckpointPipeline") + ->addTask(std::move(feed_pipeline)) + ->addTask(std::move(create_window)) + ->addTask(std::move(window_to_bytes)) + ->addTask(std::move(zlib_bytes)) + ->addTask(std::move(evict_from_cache)); + + return pipeline; +} + +uint32_t DatabaseCheckpointer::getSnapshotThreshold() const noexcept +{ + return 0; +} + +void DatabaseCheckpointer::setSnapshotThreshold(uint32_t thresh) noexcept +{ + (void)thresh; +} + +uint64_t DatabaseCheckpointer::getTotalMemoryUse() const noexcept +{ + return 0; +} + +uint64_t DatabaseCheckpointer::getContentMemoryUse() const noexcept +{ + return 0; +} + +void DatabaseCheckpointer::deleteCheckpoint(chkpt_id_t id) +{ + (void)id; +} + +void DatabaseCheckpointer::loadCheckpoint(chkpt_id_t id) +{ + (void)id; +} + +std::vector DatabaseCheckpointer::getCheckpointsAt(tick_t t) const +{ + (void)t; + return {}; +} + +std::vector DatabaseCheckpointer::getCheckpoints() const +{ + return {}; +} + +uint32_t DatabaseCheckpointer::getNumCheckpoints() const noexcept +{ + return 0; +} + +uint32_t DatabaseCheckpointer::getNumSnapshots() const noexcept +{ + return 0; +} + +uint32_t DatabaseCheckpointer::getNumDeltas() const noexcept +{ 
+ return 0; +} + +uint32_t DatabaseCheckpointer::getNumDeadCheckpoints() const noexcept +{ + return 0; +} + +std::deque DatabaseCheckpointer::getCheckpointChain(chkpt_id_t id) const +{ + return {}; +} + +std::optional DatabaseCheckpointer::findLatestCheckpointAtOrBefore(tick_t tick, chkpt_id_t from) +{ + (void)tick; + (void)from; + return std::optional(); +} + +std::optional DatabaseCheckpointer::findCheckpoint(chkpt_id_t id) noexcept +{ + (void)id; + return std::optional(); +} + +bool DatabaseCheckpointer::hasCheckpoint(chkpt_id_t id) const noexcept +{ + (void)id; + return false; +} + +void DatabaseCheckpointer::dumpRestoreChain(std::ostream& o, chkpt_id_t id) const +{ + (void)o; + (void)id; +} + +std::stack DatabaseCheckpointer::getHistoryChain(chkpt_id_t id) const +{ + (void)id; + return {}; +} + +std::stack DatabaseCheckpointer::getRestoreChain(chkpt_id_t id) const +{ + (void)id; + return {}; +} + +std::vector DatabaseCheckpointer::getNextIDs(chkpt_id_t id) const +{ + (void)id; + return {}; +} + +void DatabaseCheckpointer::load(const std::vector& dats, chkpt_id_t id) +{ + (void)dats; + (void)id; +} + +uint32_t DatabaseCheckpointer::getDistanceToPrevSnapshot(chkpt_id_t id) const noexcept +{ + (void)id; + return 0; +} + +std::string DatabaseCheckpointer::stringize() const +{ + return ""; +} + +void DatabaseCheckpointer::dumpList(std::ostream& o) const +{ + (void)o; +} + +void DatabaseCheckpointer::dumpData(std::ostream& o) const +{ + (void)o; +} + +void DatabaseCheckpointer::dumpAnnotatedData(std::ostream& o) const +{ + (void)o; +} + +void DatabaseCheckpointer::traceValue( + std::ostream& o, + chkpt_id_t id, + const ArchData* container, + uint32_t offset, + uint32_t size) +{ + (void)o; + (void)id; + (void)container; + (void)offset; + (void)size; +} + +void DatabaseCheckpointer::createHead_() +{ + tick_t tick = 0; + if (sched_) { + tick = sched_->getCurrentTick(); + } + + if (getHead()) { + throw CheckpointError("Cannot create head at ") + << tick << " because a 
head already exists in this checkpointer"; + } + if (getRoot().isFinalized() == false) { + CheckpointError exc("Cannot create a checkpoint until the tree is finalized. Attempting to checkpoint from node "); + exc << getRoot().getLocation() << " at tick "; + if(sched_){ + exc << tick; + }else{ + exc << ""; + } + throw exc; + } + + std::unique_ptr chkpt(new checkpoint_type( + getRoot(), getArchDatas(), next_chkpt_id_++, tick, + checkpoint_type::UNIDENTIFIED_CHECKPOINT, true, this)); + + setHead_(chkpt.get()); + num_alive_checkpoints_++; + num_alive_snapshots_++; + setCurrent_(chkpt.get()); + addToCache_(std::move(chkpt)); +} + +chkpt_id_t DatabaseCheckpointer::createCheckpoint_(bool force_snapshot) +{ + bool is_snapshot; + checkpoint_type* prev; + + if (next_chkpt_id_ == checkpoint_type::UNIDENTIFIED_CHECKPOINT) { + throw CheckpointError("Exhausted all ") + << checkpoint_type::UNIDENTIFIED_CHECKPOINT << " possible checkpoint IDs. " + << "This is likely a gross misuse of checkpointing"; + } + + // Caller guarantees a head + sparta_assert(getHead() != nullptr); + + tick_t tick; + if (sched_) { + tick = sched_->getCurrentTick(); + } else { + tick = 0; + } + + if (sched_ && (tick < getHead()->getTick())) { + throw CheckpointError("Cannot create a new checkpoint at tick ") + << tick << " because this tick number is smaller than the tick number of the head checkpoint at: " + << getHead()->getTick() << ". The head checkpoint cannot be reset once created, so it should be done " + << "at the start of simulation before running. 
The simulator front-end should do this so this must " + << "likely be fixed in the simulator."; + } + + if (nullptr == getCurrent_()) { + // Creating a delta from the head + prev = static_cast(getHead_()); + is_snapshot = false; + } else { + if (sched_ && (tick < getCurrent_()->getTick())) { + throw CheckpointError("Current tick number from sparta scheduler (") + << tick << " ) is less than the current checkpoint's tick number (" + << getCurrent_()->getTick() << " To create a checkpoint with an earlier tick number, an " + << "older checkpoint having a tick number <= the tick number specified here must first be " + << "loaded"; + } + + // Find latest checkpoint <= tick + prev = static_cast(getCurrent_()); + is_snapshot = prev->getDistanceToPrevSnapshot() >= getSnapshotThreshold(); + } + + std::unique_ptr chkpt(new checkpoint_type( + getRoot(), getArchDatas(), next_chkpt_id_++, tick, + prev->getID(), force_snapshot || is_snapshot, this)); + + auto current = chkpt.get(); + setCurrent_(current); + addToCache_(std::move(chkpt)); + num_alive_checkpoints_++; + num_alive_snapshots_ += (current->isSnapshot() == true) ? 1 : 0; + + if (current->isSnapshot()) { + // Clean up starting with this snapshot and moving back. 
+ // May have an opportunity to free older deltas right now + // (instead of upon next deletion) + cleanupChain_(current->getID()); + } + + return current->getID(); +} + +void DatabaseCheckpointer::cleanupChain_(chkpt_id_t id) +{ + (void)id; +} + +bool DatabaseCheckpointer::recursForwardFindAlive_(chkpt_id_t id) const +{ + (void)id; + return false; +} + +std::optional DatabaseCheckpointer::findCheckpoint_(chkpt_id_t id) const noexcept +{ + (void)id; + return std::optional(); +} + +void DatabaseCheckpointer::dumpCheckpointNode_(const chkpt_id_t id, std::ostream& o) const +{ + (void)id; + (void)o; +} + +std::vector DatabaseCheckpointer::getNextIDs_(chkpt_id_t id) const +{ + (void)id; + return {}; +} + +void DatabaseCheckpointer::setHead_(CheckpointBase* head) +{ + setHeadID_(head->getID()); + Checkpointer::setHead_(head); +} + +void DatabaseCheckpointer::setCurrent_(CheckpointBase* current) +{ + setCurrentID_(current->getID()); + Checkpointer::setCurrent_(current); +} + +void DatabaseCheckpointer::setHeadID_(chkpt_id_t id) +{ + std::lock_guard lock(mutex_); + sparta_assert(id != checkpoint_type::UNIDENTIFIED_CHECKPOINT); + sparta_assert(head_id_ == checkpoint_type::UNIDENTIFIED_CHECKPOINT); + head_id_ = id; +} + +void DatabaseCheckpointer::setCurrentID_(chkpt_id_t id) +{ + std::lock_guard lock(mutex_); + sparta_assert(id != checkpoint_type::UNIDENTIFIED_CHECKPOINT); + current_id_ = id; +} + +void DatabaseCheckpointer::addToCache_(std::unique_ptr chkpt) +{ + std::lock_guard lock(mutex_); + auto id = chkpt->getID(); + chkpt_ids_for_pipeline_head_.push(id); + chkpts_cache_[id] = std::move(chkpt); +} + +bool DatabaseCheckpointer::cloneNextPipelineHeadCheckpoint_(std::unique_ptr& next) +{ + std::lock_guard lock(mutex_); + if (chkpt_ids_for_pipeline_head_.empty()) { + return false; + } + + auto next_id = chkpt_ids_for_pipeline_head_.front(); + chkpt_ids_for_pipeline_head_.pop(); + + auto it = chkpts_cache_.find(next_id); + sparta_assert(it != chkpts_cache_.end()); + + 
auto& next_chkpt = it->second; + next = next_chkpt->clone(); + return true; +} + +REGISTER_SIMDB_APPLICATION(DatabaseCheckpointer); + +} // namespace sparta::serialization::checkpoint diff --git a/sparta/test/FastCheckpoint/CMakeLists.txt b/sparta/test/FastCheckpoint/CMakeLists.txt index 19c186e058..5a94bdd494 100644 --- a/sparta/test/FastCheckpoint/CMakeLists.txt +++ b/sparta/test/FastCheckpoint/CMakeLists.txt @@ -6,4 +6,4 @@ sparta_test(FastCheckpoint_test FastCheckpoint_test_RUN) add_subdirectory(FILEStream) add_subdirectory(PersistentFastCheckpoint) - +add_subdirectory(DatabaseCheckpoint) diff --git a/sparta/test/FastCheckpoint/DatabaseCheckpoint/CMakeLists.txt b/sparta/test/FastCheckpoint/DatabaseCheckpoint/CMakeLists.txt new file mode 100644 index 0000000000..2d219a250f --- /dev/null +++ b/sparta/test/FastCheckpoint/DatabaseCheckpoint/CMakeLists.txt @@ -0,0 +1,5 @@ +project(DatabaseCheckpoint_test) + +sparta_add_test_executable(DatabaseCheckpoint_test DatabaseCheckpoint_test.cpp) + +sparta_test(DatabaseCheckpoint_test DatabaseCheckpoint_test_RUN) diff --git a/sparta/test/FastCheckpoint/DatabaseCheckpoint/DatabaseCheckpoint_test.cpp b/sparta/test/FastCheckpoint/DatabaseCheckpoint/DatabaseCheckpoint_test.cpp new file mode 100644 index 0000000000..edb9a672b3 --- /dev/null +++ b/sparta/test/FastCheckpoint/DatabaseCheckpoint/DatabaseCheckpoint_test.cpp @@ -0,0 +1,181 @@ +#include +#include +#include +#include +#include +#include + +#include "sparta/sparta.hpp" +#include "sparta/simulation/TreeNode.hpp" +#include "sparta/log/Tap.hpp" +#include "sparta/log/Destination.hpp" +#include "sparta/functional/Register.hpp" +#include "sparta/functional/RegisterSet.hpp" +#include "sparta/memory/MemoryObject.hpp" +#include "sparta/serialization/checkpoint/DatabaseCheckpointer.hpp" +#include "sparta/utils/SpartaTester.hpp" + +#include "simdb/apps/AppManager.hpp" +#include "simdb/sqlite/DatabaseManager.hpp" +#include "simdb/pipeline/Pipeline.hpp" + +/*! 
+ * \file DatabaseCheckpoint_test.cpp + * \brief Test for SimDB-backed Checkpoints + * + * This is modified from FastCheckpoint_test.cpp. + * + * Register is built on DataView and RegisterSet is built on ArchData. + * The DataView test performs extensive testing so some test-cases related + * to register sizes and layouts may be omitted from this test. + */ + +TEST_INIT + +using sparta::Register; +using sparta::RegisterSet; +using sparta::RootTreeNode; +using sparta::memory::MemoryObject; +using sparta::memory::BlockingMemoryObjectIFNode; +using sparta::serialization::checkpoint::DatabaseCheckpointer; + +static const uint16_t HINT_NONE=0; + +//! Some register and field definition tables +Register::Definition reg_defs[] = { + { 0, "reg0", Register::GROUP_NUM_NONE, "", Register::GROUP_IDX_NONE, "reg desc", 1, + {}, {}, nullptr, Register::INVALID_ID, 0, nullptr, HINT_NONE, 0 }, + { 1, "reg1", Register::GROUP_NUM_NONE, "", Register::GROUP_IDX_NONE, "reg desc", 2, + {}, {}, nullptr, Register::INVALID_ID, 0, nullptr, HINT_NONE, 0 }, + { 2, "reg2", Register::GROUP_NUM_NONE, "", Register::GROUP_IDX_NONE, "reg desc", 4, + {}, {}, nullptr, Register::INVALID_ID, 0, nullptr, HINT_NONE, 0 }, + { 3, "reg3", Register::GROUP_NUM_NONE, "", Register::GROUP_IDX_NONE, "reg desc", 8, + {}, {}, nullptr, Register::INVALID_ID, 0, nullptr, HINT_NONE, 0 }, + { 4, "reg4", Register::GROUP_NUM_NONE, "", Register::GROUP_IDX_NONE, "reg desc", 16, + {}, {}, nullptr, Register::INVALID_ID, 0, nullptr, HINT_NONE, 0 }, + Register::DEFINITION_END +}; + +//! Dummy device +class DummyDevice : public sparta::TreeNode +{ +public: + DummyDevice(sparta::TreeNode* parent) : + sparta::TreeNode(parent, "dummy", "", sparta::TreeNode::GROUP_IDX_NONE, "dummy node for register test") + {} +}; + +//! 
General test for saving and loading checkpoints to/from SimDB +void generalTest() +{ + sparta::Scheduler sched; + RootTreeNode clocks("clocks"); + sparta::Clock clk(&clocks, "clock", &sched); + + // Create a tree with some register sets and memory + RootTreeNode root; + + DummyDevice dummy(&root); + std::unique_ptr rset(RegisterSet::create(&dummy, reg_defs)); + + DummyDevice dummy2(&dummy); + std::unique_ptr rset2(RegisterSet::create(&dummy2, reg_defs)); + + auto r1 = rset->getRegister("reg2"); + auto r2 = rset2->getRegister("reg2"); + assert(r1 != r2); + + simdb::DatabaseManager db_mgr("test.db", true); + simdb::AppManager app_mgr(&db_mgr); + + // Setup... + app_mgr.getAppFactory()->setSpartaElems(root, &sched); + app_mgr.enableApp(DatabaseCheckpointer::NAME); + app_mgr.createEnabledApps(); + app_mgr.postInit(0, nullptr); + app_mgr.openPipelines(); + + auto& dbcp = *app_mgr.getApp(); + dbcp.setSnapshotThreshold(100); + + root.enterConfiguring(); + root.enterFinalized(); + sched.finalize(); + EXPECT_EQUAL(sched.getCurrentTick(), 0); // Unfinalized sched at tick 0 + + // CHECKPOINT: HEAD + DatabaseCheckpointer::chkpt_id_t head_id; + EXPECT_NOTHROW(dbcp.createHead()); + head_id = dbcp.getHeadID(); + EXPECT_EQUAL(head_id, 0); + + // Checkpoints 1 through 10000. Save a few of the register values + // with their checkpoint IDs so we can verify the correct registers + // after rolling back to previous checkpoints. 
+ std::vector chkpt_ids; + + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution<> distrib(1, 10); + + for (uint32_t i = 1; i <= 10000; ++i) { + r1->write(i * 5ul); + r2->write(i % 5ul); + sched.run(1, true, false); + EXPECT_EQUAL(i, sched.getCurrentTick()); + EXPECT_EQUAL(i, dbcp.getCurrentTick()); + + DatabaseCheckpointer::chkpt_id_t id; + EXPECT_NOTHROW(id = dbcp.createCheckpoint()); + EXPECT_EQUAL(id, i); + + if (distrib(gen) == 5) { + chkpt_ids.push_back(id); + } + } + + // Shuffle up the checkpoint IDs and wait a bit before we start + // loading checkpoints and verifying the registers. + std::shuffle(chkpt_ids.begin(), chkpt_ids.end(), gen); + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + + for (auto id : chkpt_ids) { + EXPECT_NOTHROW(dbcp.loadCheckpoint(id)); + + auto chkpt = dbcp.findCheckpoint(id); + EXPECT_TRUE(chkpt.has_value()); + + if (chkpt.has_value()) { + uint32_t expected_r1 = id * 5ul; + EXPECT_EQUAL(r1->read(), expected_r1); + + uint32_t expected_r2 = id % 5ul; + EXPECT_EQUAL(r2->read(), expected_r2); + + auto expected_tick = dbcp.getCurrentTick(); + EXPECT_EQUAL(sched.getCurrentTick(), expected_tick); + EXPECT_EQUAL(chkpt->getTick(), expected_tick); + } + } + + // Finish... 
+ app_mgr.postSimLoopTeardown(); +} + +int main() +{ + std::unique_ptr warn_cerr(new sparta::log::Tap( + sparta::TreeNode::getVirtualGlobalNode(), + sparta::log::categories::WARN, + std::cerr)); + + std::unique_ptr warn_file(new sparta::log::Tap( + sparta::TreeNode::getVirtualGlobalNode(), + sparta::log::categories::WARN, + "warnings.log")); + + generalTest(); + + REPORT_ERROR; + return ERROR_CODE; +} From bc571e837f50bfa2e55c6237458150ada4e14b44 Mon Sep 17 00:00:00 2001 From: Colby Nyce Date: Tue, 12 Aug 2025 16:32:08 -0500 Subject: [PATCH 03/30] Database-backed checkpointer --- .../serialization/checkpoint/Checkpoint.hpp | 3 +- .../checkpoint/DatabaseCheckpoint.hpp | 52 ++--- .../checkpoint/DatabaseCheckpointAccessor.hpp | 183 ++++++++++++++++++ .../checkpoint/DatabaseCheckpointAccessor.tpp | 138 +++++++++++++ .../checkpoint/DatabaseCheckpointBase.hpp | 115 +++++++++++ .../checkpoint/DatabaseCheckpointer.hpp | 29 +-- .../checkpoint/FastCheckpointer.hpp | 1 - sparta/src/DatabaseCheckpoint.cpp | 47 ++--- sparta/src/DatabaseCheckpointer.cpp | 119 +++++++++--- .../DatabaseCheckpoint_test.cpp | 18 +- 10 files changed, 586 insertions(+), 119 deletions(-) create mode 100644 sparta/sparta/serialization/checkpoint/DatabaseCheckpointAccessor.hpp create mode 100644 sparta/sparta/serialization/checkpoint/DatabaseCheckpointAccessor.tpp create mode 100644 sparta/sparta/serialization/checkpoint/DatabaseCheckpointBase.hpp diff --git a/sparta/sparta/serialization/checkpoint/Checkpoint.hpp b/sparta/sparta/serialization/checkpoint/Checkpoint.hpp index f5753b9b68..86e3a1b86c 100644 --- a/sparta/sparta/serialization/checkpoint/Checkpoint.hpp +++ b/sparta/sparta/serialization/checkpoint/Checkpoint.hpp @@ -51,12 +51,11 @@ namespace sparta::serialization::checkpoint public: - /*! 
* \brief Removes this checkpoint from the chain and patches chain between prev * and each item in the nexts list */ - virtual void disconnect() { + virtual ~Checkpoint() { if(getPrev()){ getPrev()->removeNext(this); } diff --git a/sparta/sparta/serialization/checkpoint/DatabaseCheckpoint.hpp b/sparta/sparta/serialization/checkpoint/DatabaseCheckpoint.hpp index de284b571e..d91abe832d 100644 --- a/sparta/sparta/serialization/checkpoint/DatabaseCheckpoint.hpp +++ b/sparta/sparta/serialization/checkpoint/DatabaseCheckpoint.hpp @@ -2,11 +2,7 @@ #pragma once -#include "sparta/serialization/checkpoint/CheckpointBase.hpp" -#include "sparta/serialization/checkpoint/CheckpointExceptions.hpp" -#include "sparta/serialization/checkpoint/VectorStorage.hpp" - -#include +#include "sparta/serialization/checkpoint/DatabaseCheckpointBase.hpp" namespace sparta::serialization::checkpoint { @@ -16,7 +12,7 @@ namespace sparta::serialization::checkpoint * \brief Checkpoint class optimized for use with database-backed * checkpointers. */ - class DatabaseCheckpoint : public CheckpointBase + class DatabaseCheckpoint : public DatabaseCheckpointBase { public: @@ -46,14 +42,15 @@ namespace sparta::serialization::checkpoint const std::vector& dats, chkpt_id_t id, tick_t tick, - chkpt_id_t prev_id, + DatabaseCheckpoint* prev, bool is_snapshot, DatabaseCheckpointer* checkpointer); //! 
\brief This constructor is called during checkpoing cloning DatabaseCheckpoint(chkpt_id_t prev_id, - chkpt_id_t deleted_id_, - bool is_snapshot_, + const std::vector& next_ids, + chkpt_id_t deleted_id, + bool is_snapshot, const storage::VectorStorage& storage, DatabaseCheckpointer* checkpointer); @@ -68,6 +65,7 @@ namespace sparta::serialization::checkpoint void serialize(Archive& ar, const unsigned int version) { CheckpointBase::serialize(ar, version); ar & prev_id_; + ar & next_ids_; ar & deleted_id_; ar & is_snapshot_; ar & data_; @@ -85,13 +83,6 @@ namespace sparta::serialization::checkpoint */ void dumpData(std::ostream& o) const override; - /*! - * \brief Dumps the restore chain for this checkpoint. - * \see getRestoreChain() - * \param o ostream to which chain data will be dumped - */ - void dumpRestoreChain(std::ostream& o) const; - /*! * \brief Returns memory usage by this checkpoint */ @@ -102,12 +93,6 @@ namespace sparta::serialization::checkpoint */ uint64_t getContentMemoryUse() const noexcept override; - /*! - * \brief Implement trace of a value across the restore chain as described in Checkpointer::traceValue - */ - void traceValue(std::ostream& o, const std::vector& dats, - const ArchData* container, uint32_t offset, uint32_t size); - /*! * \brief Returns a stack of checkpoints from this checkpoint as far * back as possible until no previous link is found. This is a superset @@ -115,14 +100,14 @@ namespace sparta::serialization::checkpoint * to be inspected for restoring this checkpoint's data. This may reach * the head checkpoint if no gaps are encountered. */ - std::stack getHistoryChain() const; + std::stack getHistoryChain() const override; /*! * \brief Returns a stack of checkpoints that must be restored from * top-to-bottom to fully restore the state associated with this * checkpoint. */ - std::stack getRestoreChain() const; + std::stack getRestoreChain() const override; /*! * \brief Get the ID of our previous checkpoint. 
Returns UNIDENTIFIED_CHECKPOINT @@ -154,7 +139,7 @@ namespace sparta::serialization::checkpoint * \warning This is a recursive search of a checkpoint tree which has potentially many * branches and could have high time cost */ - bool canDelete() const noexcept; + bool canDelete() const noexcept override; /*! * \brief Allows this checkpoint to be deleted if it is no longer a @@ -167,7 +152,7 @@ namespace sparta::serialization::checkpoint * \see canDelete * \see isFlaggedDeleted */ - void flagDeleted(); + void flagDeleted() override; /*! * \brief Indicates whether this checkpoint has been flagged deleted. @@ -176,14 +161,14 @@ namespace sparta::serialization::checkpoint * \note If false, Checkpoint ID will also be UNIDENTIFIED_CHECKPOINT * \see flagDeleted() */ - bool isFlaggedDeleted() const noexcept; + bool isFlaggedDeleted() const noexcept override; /*! * \brief Return the ID had by this checkpoint before it was deleted * If this checkpoint has not been flagged for deletion, this will be * UNIDENTIFIED_CHECKPOINT */ - chkpt_id_t getDeletedID() const noexcept; + chkpt_id_t getDeletedID() const noexcept override; /*! * \brief Gets the representation of this deleted checkpoint as part of @@ -196,7 +181,7 @@ namespace sparta::serialization::checkpoint /*! * \brief Is this checkpoint a snapshot (contains ALL simulator state) */ - bool isSnapshot() const noexcept; + bool isSnapshot() const noexcept override; /*! * \brief Determines how many checkpoints away the closest, earlier @@ -208,14 +193,14 @@ namespace sparta::serialization::checkpoint * \note This is a noexcept function, which means that the exception if * no snapshot is encountered is uncatchable. This is intentional. */ - uint32_t getDistanceToPrevSnapshot() const noexcept; + uint32_t getDistanceToPrevSnapshot() const noexcept override; /*! * \brief Loads delta state of this checkpoint to root. * Does not look at any other checkpoints checkpoints. 
* \see load */ - void loadState(const std::vector& dats); + void loadState(const std::vector& dats) override; /*! * \brief Create a deep copy of this checkpoint. @@ -245,6 +230,11 @@ namespace sparta::serialization::checkpoint */ chkpt_id_t prev_id_; + /*! + * \brief IDs of the next checkpoints. + */ + std::vector next_ids_; + /*! * \brief ID of the checkpoint before it was deleted. This is invalid * until deletion. Prevents misuse of checkpoint ID or any confusion diff --git a/sparta/sparta/serialization/checkpoint/DatabaseCheckpointAccessor.hpp b/sparta/sparta/serialization/checkpoint/DatabaseCheckpointAccessor.hpp new file mode 100644 index 0000000000..0a4b587d82 --- /dev/null +++ b/sparta/sparta/serialization/checkpoint/DatabaseCheckpointAccessor.hpp @@ -0,0 +1,183 @@ +// -*- C++ -*- + +#pragma once + +#include "sparta/serialization/checkpoint/DatabaseCheckpointBase.hpp" + +namespace sparta::serialization::checkpoint +{ + +class DatabaseCheckpoint; +class DatabaseCheckpointer; + +/*! + * \brief This class wraps a DatabaseCheckpoint and recreates it from disk + * if the checkpoint no longer exists in the checkpointer in memory. + */ +template +class DatabaseCheckpointAccessor : public DatabaseCheckpointBase +{ +public: + using db_checkpointer = std::conditional_t; + using db_checkpoint = std::conditional_t; + + //! Constructor + DatabaseCheckpointAccessor(db_checkpointer* checkpointer, chkpt_id_t id); + + //! Moves allowed + DatabaseCheckpointAccessor(DatabaseCheckpointAccessor&&) = default; + + //! Copies disallowed + DatabaseCheckpointAccessor(const DatabaseCheckpointAccessor&) = delete; + + //! Move assignment disallowed + DatabaseCheckpointAccessor& operator=(DatabaseCheckpointAccessor&&) = delete; + + //! Copy assignment disallowed + DatabaseCheckpointAccessor& operator=(const DatabaseCheckpointAccessor&) = delete; + + //! For parity with all the other in-memory checkpoint types. + DatabaseCheckpointAccessor* operator->() { return this; } + + //! 
For parity with all the other in-memory checkpoint types. + const DatabaseCheckpointAccessor* operator->() const { return this; } + + //! Destructor + ~DatabaseCheckpointAccessor(); + + /*! + * \brief Returns a string describing this object + */ + std::string stringize() const override; + + /*! + * \brief Writes all checkpoint raw data to an ostream + * \param o ostream to which raw data will be written + * \note No newlines or other extra characters will be appended + */ + void dumpData(std::ostream& o) const override; + + /*! + * \brief Returns memory usage by this checkpoint including any + * framework data structures + */ + uint64_t getTotalMemoryUse() const noexcept override; + + /*! + * \brief Returns memory usage by this checkpoint solely for the + * checkpointed content. + */ + uint64_t getContentMemoryUse() const noexcept override; + + /*! + * \brief Attempts to restore this checkpoint state to the simulation + * state (ArchData) objects given to this Checkpoint at construction + */ + void load(const std::vector& dats) override; + + /*! + * \brief Get the ID of our previous checkpoint. Returns UNIDENTIFIED_CHECKPOINT + * if we have no previous checkpoint, as is the case with the head checkpoint + * and snapshots. + */ + chkpt_id_t getPrevID() const override; + + /*! + * \brief Returns next checkpoint following *this. May be an empty + * vector if there are no later checkpoints. + */ + std::vector getNextIDs() const override; + + /*! + * \brief Gets the representation of this deleted checkpoint as part of + * a checkpoint chain (if that checkpointer supports deletion) + */ + std::string getDeletedRepr() const override; + + /*! + * \brief Returns a stack of checkpoints from this checkpoint as far + * back as possible until no previous link is found. This is a superset + * of getRestoreChain and contains checkpoints that do not actually need + * to be inspected for restoring this checkpoint's data. 
This may reach + * the head checkpoint if no gaps are encountered. + */ + std::stack getHistoryChain() const override; + + /*! + * \brief Returns a stack of checkpoints that must be restored from + * top-to-bottom to fully restore the state associated with this + * checkpoint. + */ + std::stack getRestoreChain() const override; + + /*! + * \brief Can this checkpoint be deleted + * Cannot be deleted if: + * \li This checkpoint has any ancestors which are not deletable and not snapshots + * \li This checkpoint was not flagged for deletion with flagDeleted + * \warning This is a recursive search of a checkpoint tree which has potentially many + * branches and could have high time cost + */ + bool canDelete() const noexcept override; + + /*! + * \brief Allows this checkpoint to be deleted if it is no longer a + * previous delta of some other delta (i.e. getNexts() returns an + * empty vector). Sets the checkpoint ID to invalid. Calling multiple + * times has no effect + * \pre Must not already be flagged deleted + * \post isFlaggedDeleted() will return true + * \post getDeletedID() will return the current ID (if any) + * \see canDelete + * \see isFlaggedDeleted + */ + void flagDeleted() override; + + /*! + * \brief Indicates whether this checkpoint has been flagged deleted. + * \note Does not imply that the checkpoint can safely be deleted; + * only that it was flagged for deletion. + * \note If false, Checkpoint ID will also be UNIDENTIFIED_CHECKPOINT + * \see flagDeleted() + */ + bool isFlaggedDeleted() const noexcept override; + + /*! + * \brief Return the ID had by this checkpoint before it was deleted + * If this checkpoint has not been flagged for deletion, this will be + * UNIDENTIFIED_CHECKPOINT + */ + chkpt_id_t getDeletedID() const noexcept override; + + /*! + * \brief Is this checkpoint a snapshot (contains ALL simulator state) + */ + bool isSnapshot() const noexcept override; + + /*! 
+ * \brief Determines how many checkpoints away the closest, earlier + * snapshot is. + * \return distance to closest snapshot. If this node is a snapshot, + * returns 0; if immediate getPrev() is a snapshot, returns 1; and + * so on. + * + * \note This is a noexcept function, which means that the exception if + * no snapshot is encountered is uncatchable. This is intentional. + */ + uint32_t getDistanceToPrevSnapshot() const noexcept override; + + /*! + * \brief Loads delta state of this checkpoint to root. + * Does not look at any other checkpoints checkpoints. + * \see load + */ + void loadState(const std::vector& dats) override; + +private: + db_checkpointer* checkpointer_; + chkpt_id_t id_; +}; + +} // namespace sparta::serialization::checkpoint + +#include "sparta/serialization/checkpoint/DatabaseCheckpointAccessor.tpp" diff --git a/sparta/sparta/serialization/checkpoint/DatabaseCheckpointAccessor.tpp b/sparta/sparta/serialization/checkpoint/DatabaseCheckpointAccessor.tpp new file mode 100644 index 0000000000..0821afa535 --- /dev/null +++ b/sparta/sparta/serialization/checkpoint/DatabaseCheckpointAccessor.tpp @@ -0,0 +1,138 @@ +namespace sparta::serialization::checkpoint +{ + +using chkpt_id_t = typename CheckpointBase::chkpt_id_t; + +template +DatabaseCheckpointAccessor::DatabaseCheckpointAccessor(db_checkpointer* checkpointer, chkpt_id_t id) +{ + //TODO cnyce + (void)checkpointer; + (void)id; +} + +template +DatabaseCheckpointAccessor::~DatabaseCheckpointAccessor() +{ + //TODO cnyce +} + +template +std::string DatabaseCheckpointAccessor::stringize() const +{ + //TODO cnyce + return ""; +} + +template +void DatabaseCheckpointAccessor::dumpData(std::ostream& o) const +{ + //TODO cnyce + (void)o; +} + +template +uint64_t DatabaseCheckpointAccessor::getTotalMemoryUse() const noexcept +{ + //TODO cnyce + return 0; +} + +template +uint64_t DatabaseCheckpointAccessor::getContentMemoryUse() const noexcept +{ + //TODO cnyce + return 0; +} + +template +void 
DatabaseCheckpointAccessor::load(const std::vector& dats) +{ + //TODO cnyce + (void)dats; +} + +template +chkpt_id_t DatabaseCheckpointAccessor::getPrevID() const +{ + //TODO cnyce + return 0; +} + +template +std::vector DatabaseCheckpointAccessor::getNextIDs() const +{ + //TODO cnyce + return {}; +} + +template +std::string DatabaseCheckpointAccessor::getDeletedRepr() const +{ + //TODO cnyce + return ""; +} + +template +std::stack DatabaseCheckpointAccessor::getHistoryChain() const +{ + //TODO cnyce + return {}; +} + +template +std::stack DatabaseCheckpointAccessor::getRestoreChain() const +{ + //TODO cnyce + return {}; +} + +template +bool DatabaseCheckpointAccessor::canDelete() const noexcept +{ + //TODO cnyce + return false; +} + +template +void DatabaseCheckpointAccessor::flagDeleted() +{ + //TODO cnyce +} + +template +bool DatabaseCheckpointAccessor::isFlaggedDeleted() const noexcept +{ + //TODO cnyce + return false; +} + +template +chkpt_id_t DatabaseCheckpointAccessor::getDeletedID() const noexcept +{ + //TODO cnyce + return 0; +} + +template +bool DatabaseCheckpointAccessor::isSnapshot() const noexcept +{ + //TODO cnyce + return false; +} + +template +uint32_t DatabaseCheckpointAccessor::getDistanceToPrevSnapshot() const noexcept +{ + //TODO cnyce + return 0; +} + +template +void DatabaseCheckpointAccessor::loadState(const std::vector& dats) +{ + //TODO cnyce + (void)dats; +} + +} // namespace sparta::serialization::checkpoint diff --git a/sparta/sparta/serialization/checkpoint/DatabaseCheckpointBase.hpp b/sparta/sparta/serialization/checkpoint/DatabaseCheckpointBase.hpp new file mode 100644 index 0000000000..67fd9cd1c0 --- /dev/null +++ b/sparta/sparta/serialization/checkpoint/DatabaseCheckpointBase.hpp @@ -0,0 +1,115 @@ +// -*- C++ -*- + +#pragma once + +#include "sparta/serialization/checkpoint/CheckpointBase.hpp" +#include "sparta/serialization/checkpoint/CheckpointExceptions.hpp" +#include "sparta/serialization/checkpoint/VectorStorage.hpp" + 
+#include + +namespace sparta::serialization::checkpoint +{ + class DatabaseCheckpointer; + + /*! + * \brief Checkpoint class optimized for use with database-backed + * checkpointers. + */ + class DatabaseCheckpointBase : public CheckpointBase + { + public: + /*! + * \brief Forwarding constructor + */ + template + DatabaseCheckpointBase(Args&&... args) + : CheckpointBase(std::forward(args)...) + {} + + /*! + * \brief Destructor + */ + virtual ~DatabaseCheckpointBase() = default; + + /*! + * \brief Returns a stack of checkpoints from this checkpoint as far + * back as possible until no previous link is found. This is a superset + * of getRestoreChain and contains checkpoints that do not actually need + * to be inspected for restoring this checkpoint's data. This may reach + * the head checkpoint if no gaps are encountered. + */ + virtual std::stack getHistoryChain() const = 0; + + /*! + * \brief Returns a stack of checkpoints that must be restored from + * top-to-bottom to fully restore the state associated with this + * checkpoint. + */ + virtual std::stack getRestoreChain() const = 0; + + /*! + * \brief Can this checkpoint be deleted + * Cannot be deleted if: + * \li This checkpoint has any ancestors which are not deletable and not snapshots + * \li This checkpoint was not flagged for deletion with flagDeleted + * \warning This is a recursive search of a checkpoint tree which has potentially many + * branches and could have high time cost + */ + virtual bool canDelete() const noexcept = 0; + + /*! + * \brief Allows this checkpoint to be deleted if it is no longer a + * previous delta of some other delta (i.e. getNexts() returns an + * empty vector). Sets the checkpoint ID to invalid. Calling multiple + * times has no effect + * \pre Must not already be flagged deleted + * \post isFlaggedDeleted() will return true + * \post getDeletedID() will return the current ID (if any) + * \see canDelete + * \see isFlaggedDeleted + */ + virtual void flagDeleted() = 0; + + /*! 
+ * \brief Indicates whether this checkpoint has been flagged deleted. + * \note Does not imply that the checkpoint can safely be deleted; + * only that it was flagged for deletion. + * \note If false, Checkpoint ID will also be UNIDENTIFIED_CHECKPOINT + * \see flagDeleted() + */ + virtual bool isFlaggedDeleted() const noexcept = 0; + + /*! + * \brief Return the ID had by this checkpoint before it was deleted + * If this checkpoint has not been flagged for deletion, this will be + * UNIDENTIFIED_CHECKPOINT + */ + virtual chkpt_id_t getDeletedID() const noexcept = 0; + + /*! + * \brief Is this checkpoint a snapshot (contains ALL simulator state) + */ + virtual bool isSnapshot() const noexcept = 0; + + /*! + * \brief Determines how many checkpoints away the closest, earlier + * snapshot is. + * \return distance to closest snapshot. If this node is a snapshot, + * returns 0; if immediate getPrev() is a snapshot, returns 1; and + * so on. + * + * \note This is a noexcept function, which means that the exception if + * no snapshot is encountered is uncatchable. This is intentional. + */ + virtual uint32_t getDistanceToPrevSnapshot() const noexcept = 0; + + /*! + * \brief Loads delta state of this checkpoint to root. + * Does not look at any other checkpoints checkpoints. 
+ * \see load + */ + virtual void loadState(const std::vector& dats) = 0; + }; + +} // namespace sparta::serialization::checkpoint diff --git a/sparta/sparta/serialization/checkpoint/DatabaseCheckpointer.hpp b/sparta/sparta/serialization/checkpoint/DatabaseCheckpointer.hpp index 5c8388f82a..9a0a2ab927 100644 --- a/sparta/sparta/serialization/checkpoint/DatabaseCheckpointer.hpp +++ b/sparta/sparta/serialization/checkpoint/DatabaseCheckpointer.hpp @@ -4,10 +4,9 @@ #include "sparta/serialization/checkpoint/Checkpointer.hpp" #include "sparta/serialization/checkpoint/DatabaseCheckpoint.hpp" -#include "simdb/apps/AppRegistration.hpp" +#include "sparta/serialization/checkpoint/DatabaseCheckpointAccessor.hpp" #include "simdb/apps/App.hpp" -#include "simdb/utils/ConcurrentQueue.hpp" -#include +#include "simdb/pipeline/Pipeline.hpp" //! Default threshold for creating snapshots #ifndef DEFAULT_SNAPSHOT_THRESH @@ -17,6 +16,8 @@ namespace sparta::serialization::checkpoint { +class DatabaseCheckpointer; + /*! * \brief Implementation of the FastCheckpointer which only holds * a "window" of checkpoints in memory at any given time, and sends @@ -210,7 +211,7 @@ class DatabaseCheckpointer : public simdb::App, public Checkpointer * \throw CheckpointError if \a from does not refer to a valid * checkpoint. */ - std::optional findLatestCheckpointAtOrBefore(tick_t tick, chkpt_id_t from); + DatabaseCheckpointAccessor findLatestCheckpointAtOrBefore(tick_t tick, chkpt_id_t from); /*! * \brief Finds a checkpoint by its ID @@ -218,7 +219,7 @@ class DatabaseCheckpointer : public simdb::App, public Checkpointer * deleted * \return Checkpoint with ID of \a id if found or nullptr if not found */ - std::optional findCheckpoint(chkpt_id_t id) noexcept; + DatabaseCheckpointAccessor findCheckpoint(chkpt_id_t id) noexcept; /*! * \brief Tests whether this checkpoint manager has a checkpoint with @@ -377,7 +378,12 @@ class DatabaseCheckpointer : public simdb::App, public Checkpointer * returns nullptr. 
* \todo Faster lookup? */ - std::optional findCheckpoint_(chkpt_id_t id) const noexcept; + DatabaseCheckpointAccessor findCheckpoint_(chkpt_id_t id) noexcept; + + /*! + * \brief Const version of findCheckpoint_() + */ + DatabaseCheckpointAccessor findCheckpoint_(chkpt_id_t id) const noexcept; /*! * \brief Implements Checkpointer::dumpCheckpointNode_ @@ -413,12 +419,12 @@ class DatabaseCheckpointer : public simdb::App, public Checkpointer /*! * \brief Add the given checkpoint to the cache and start processing it. */ - void addToCache_(std::unique_ptr chkpt); + void addToCache_(std::shared_ptr chkpt); /*! * \brief Clone the next checkpoint that is ready for processing. */ - bool cloneNextPipelineHeadCheckpoint_(std::unique_ptr& next); + bool cloneNextPipelineHeadCheckpoint_(std::shared_ptr& next); //! \brief Checkpointer head ID. Used to prevent the head from being deleted from the cache. chkpt_id_t head_id_ = checkpoint_type::UNIDENTIFIED_CHECKPOINT; @@ -427,7 +433,7 @@ class DatabaseCheckpointer : public simdb::App, public Checkpointer chkpt_id_t current_id_ = checkpoint_type::UNIDENTIFIED_CHECKPOINT; //! \brief Subset (or all of) our checkpoints that we currently are holding in memory. - std::unordered_map> chkpts_cache_; + std::unordered_map> chkpts_cache_; //! \brief Ordered running list of checkpoint IDs that come in via calls to createCheckpoint_(). //! This is used in the pipeline to pick off and start processing checkpoints in the same order @@ -441,11 +447,6 @@ class DatabaseCheckpointer : public simdb::App, public Checkpointer //! \brief SimDB instance simdb::DatabaseManager* db_mgr_ = nullptr; - //! \brief Cloned checkpoints for pipeline. Original checkpoints held in cache. - //using checkpoint_clone = checkpoint_type::DetachedClone; - //simdb::ConcurrentQueue>* pipeline_head_ = nullptr; - - /*! * \brief Snapshot generation threshold. 
Every n checkpoints in a chain * are taken as snapshots instead of deltas diff --git a/sparta/sparta/serialization/checkpoint/FastCheckpointer.hpp b/sparta/sparta/serialization/checkpoint/FastCheckpointer.hpp index 3931fa81a0..fc07bf9193 100644 --- a/sparta/sparta/serialization/checkpoint/FastCheckpointer.hpp +++ b/sparta/sparta/serialization/checkpoint/FastCheckpointer.hpp @@ -565,7 +565,6 @@ namespace sparta::serialization::checkpoint // Erase element in the map auto itr = chkpts_.find(id); sparta_assert(itr != chkpts_.end()); - itr->second->disconnect(); chkpts_.erase(itr); } diff --git a/sparta/src/DatabaseCheckpoint.cpp b/sparta/src/DatabaseCheckpoint.cpp index 0d41d63298..9d760f2705 100644 --- a/sparta/src/DatabaseCheckpoint.cpp +++ b/sparta/src/DatabaseCheckpoint.cpp @@ -9,29 +9,33 @@ namespace sparta::serialization::checkpoint using tick_t = typename CheckpointBase::tick_t; using chkpt_id_t = typename CheckpointBase::chkpt_id_t; using checkpoint_type = DatabaseCheckpoint; -using checkpoint_uptr = std::unique_ptr; +using checkpoint_ptr = std::shared_ptr; DatabaseCheckpoint::DatabaseCheckpoint(TreeNode& root, const std::vector& dats, chkpt_id_t id, tick_t tick, - chkpt_id_t prev_id, + DatabaseCheckpoint* prev, bool is_snapshot, DatabaseCheckpointer* checkpointer) - : CheckpointBase(id, tick) - , prev_id_(prev_id) + : DatabaseCheckpointBase(id, tick) + , prev_id_(prev ? 
prev->getID() : UNIDENTIFIED_CHECKPOINT) , deleted_id_(UNIDENTIFIED_CHECKPOINT) , is_snapshot_(is_snapshot) , checkpointer_(checkpointer) { (void)root; - if (prev_id == UNIDENTIFIED_CHECKPOINT) { + if (prev_id_ == UNIDENTIFIED_CHECKPOINT) { if (is_snapshot == false) { throw CheckpointError("Cannot create a DatabaseCheckpoint id=") << id << " at tick=" << tick << " which has no prev_delta and is not a snapshot"; } } + if (prev) { + prev->next_ids_.push_back(getID()); + } + // Store the checkpoint from root if (is_snapshot) { storeSnapshot_(dats); @@ -41,12 +45,14 @@ DatabaseCheckpoint::DatabaseCheckpoint(TreeNode& root, } DatabaseCheckpoint::DatabaseCheckpoint(chkpt_id_t prev_id, + const std::vector& next_ids, chkpt_id_t deleted_id, bool is_snapshot, const storage::VectorStorage& storage, DatabaseCheckpointer* checkpointer) - : CheckpointBase(getID(), getTick()) + : DatabaseCheckpointBase(getID(), getTick()) , prev_id_(prev_id) + , next_ids_(next_ids) , deleted_id_(deleted_id) , is_snapshot_(is_snapshot) , data_(storage) @@ -77,11 +83,6 @@ void DatabaseCheckpoint::dumpData(std::ostream& o) const data_.dump(o); } -void DatabaseCheckpoint::dumpRestoreChain(std::ostream& o) const -{ - checkpointer_->dumpRestoreChain(o, getID()); -} - uint64_t DatabaseCheckpoint::getTotalMemoryUse() const noexcept { return getContentMemoryUse() \ @@ -94,21 +95,6 @@ uint64_t DatabaseCheckpoint::getContentMemoryUse() const noexcept return data_.getSize(); } -void DatabaseCheckpoint::traceValue( - std::ostream& o, - const std::vector& dats, - const ArchData* container, - uint32_t offset, - uint32_t size) -{ - // TODO cnyce - (void)o; - (void)dats; - (void)container; - (void)offset; - (void)size; -} - std::stack DatabaseCheckpoint::getHistoryChain() const { return checkpointer_->getHistoryChain(getID()); @@ -126,12 +112,13 @@ chkpt_id_t DatabaseCheckpoint::getPrevID() const std::vector DatabaseCheckpoint::getNextIDs() const { - return checkpointer_->getNextIDs(getID()); + return next_ids_; 
} void DatabaseCheckpoint::load(const std::vector& dats) { - checkpointer_->load(dats, getID()); + //TODO cnyce + (void)dats; } bool DatabaseCheckpoint::canDelete() const noexcept @@ -196,7 +183,7 @@ void DatabaseCheckpoint::loadState(const std::vector& dats) std::unique_ptr DatabaseCheckpoint::clone() const { - auto clone = new DatabaseCheckpoint(prev_id_, deleted_id_, is_snapshot_, data_, checkpointer_); + auto clone = new DatabaseCheckpoint(prev_id_, next_ids_, deleted_id_, is_snapshot_, data_, checkpointer_); return std::unique_ptr(clone); } @@ -205,7 +192,6 @@ void DatabaseCheckpoint::storeSnapshot_(const std::vector& dats) sparta_assert(data_.good(), "Attempted to storeSnapshot_ from a DatabaseCheckpoint with a bad data buffer"); - // Cannot have stored already for (ArchData* ad : dats) { ad->saveAll(data_); } @@ -216,7 +202,6 @@ void DatabaseCheckpoint::storeDelta_(const std::vector& dats) sparta_assert(data_.good(), "Attempted to storeDelta_ from a DatabaseCheckpoint with a bad data buffer"); - // Cannot have stored already for (ArchData* ad : dats) { ad->save(data_); } diff --git a/sparta/src/DatabaseCheckpointer.cpp b/sparta/src/DatabaseCheckpointer.cpp index 7e0507afdf..37d82ec1bd 100644 --- a/sparta/src/DatabaseCheckpointer.cpp +++ b/sparta/src/DatabaseCheckpointer.cpp @@ -1,6 +1,9 @@ // -*- C++ -*- #include "sparta/serialization/checkpoint/DatabaseCheckpointer.hpp" +#include "simdb/apps/AppRegistration.hpp" +#include "simdb/schema/SchemaDef.hpp" +#include "simdb/pipeline/AsyncDatabaseAccessor.hpp" #include "simdb/pipeline/Pipeline.hpp" #include "simdb/pipeline/elements/Function.hpp" #include "simdb/pipeline/elements/Buffer.hpp" @@ -19,12 +22,12 @@ namespace sparta::serialization::checkpoint using tick_t = typename CheckpointBase::tick_t; using chkpt_id_t = typename CheckpointBase::chkpt_id_t; using checkpoint_type = DatabaseCheckpoint; -using checkpoint_uptr = std::unique_ptr; -using checkpoint_uptrs = std::vector; +using checkpoint_ptr = 
std::shared_ptr; +using checkpoint_ptrs = std::vector; struct ChkptWindow { std::vector chkpt_ids; - checkpoint_uptrs chkpts; + checkpoint_ptrs chkpts; // TODO cnyce: Try to avoid use of unique_ptr. Everything is already movable // and has default constructors. @@ -77,10 +80,10 @@ std::unique_ptr DatabaseCheckpointer::createPipeline( auto pipeline = std::make_unique(db_mgr_, NAME); // Task 1: Clone the next checkpoint from the cache to send down pipeline - auto feed_pipeline = simdb::pipeline::createTask>( - [this](simdb::ConcurrentQueue& out, bool /*simulation_terminating*/) mutable + auto feed_pipeline = simdb::pipeline::createTask>( + [this](simdb::ConcurrentQueue& out, bool /*simulation_terminating*/) mutable { - checkpoint_uptr next_chkpt; + checkpoint_ptr next_chkpt; if (cloneNextPipelineHeadCheckpoint_(next_chkpt)) { out.emplace(std::move(next_chkpt)); return true; @@ -92,11 +95,11 @@ std::unique_ptr DatabaseCheckpointer::createPipeline( // Task 2: Buffer snapshots and their deltas into checkpoint windows const auto window_len = getSnapshotThreshold(); const auto flush_partial = true; - auto create_window = simdb::pipeline::createTask>(window_len, flush_partial); + auto create_window = simdb::pipeline::createTask>(window_len, flush_partial); // Task 3: Add the IDs of all checkpoints in this window - auto add_chkpt_ids = simdb::pipeline::createTask>( - [](checkpoint_uptrs&& chkpts, + auto add_chkpt_ids = simdb::pipeline::createTask>( + [](checkpoint_ptrs&& chkpts, simdb::ConcurrentQueue& windows, bool /*simulation_terminating*/) { @@ -196,142 +199,171 @@ std::unique_ptr DatabaseCheckpointer::createPipeline( uint32_t DatabaseCheckpointer::getSnapshotThreshold() const noexcept { - return 0; + return snap_thresh_; } void DatabaseCheckpointer::setSnapshotThreshold(uint32_t thresh) noexcept { - (void)thresh; + snap_thresh_ = thresh; } uint64_t DatabaseCheckpointer::getTotalMemoryUse() const noexcept { + //TODO cnyce return 0; } uint64_t 
DatabaseCheckpointer::getContentMemoryUse() const noexcept { + //TODO cnyce return 0; } void DatabaseCheckpointer::deleteCheckpoint(chkpt_id_t id) { + //TODO cnyce (void)id; } void DatabaseCheckpointer::loadCheckpoint(chkpt_id_t id) { + //TODO cnyce (void)id; } std::vector DatabaseCheckpointer::getCheckpointsAt(tick_t t) const { + //TODO cnyce (void)t; return {}; } std::vector DatabaseCheckpointer::getCheckpoints() const { + //TODO cnyce return {}; } uint32_t DatabaseCheckpointer::getNumCheckpoints() const noexcept { + //TODO cnyce return 0; } uint32_t DatabaseCheckpointer::getNumSnapshots() const noexcept { + //TODO cnyce return 0; } uint32_t DatabaseCheckpointer::getNumDeltas() const noexcept { + //TODO cnyce return 0; } uint32_t DatabaseCheckpointer::getNumDeadCheckpoints() const noexcept { + //TODO cnyce return 0; } std::deque DatabaseCheckpointer::getCheckpointChain(chkpt_id_t id) const { + //TODO cnyce return {}; } -std::optional DatabaseCheckpointer::findLatestCheckpointAtOrBefore(tick_t tick, chkpt_id_t from) +DatabaseCheckpointAccessor DatabaseCheckpointer::findLatestCheckpointAtOrBefore(tick_t tick, chkpt_id_t from) { + //TODO cnyce (void)tick; (void)from; - return std::optional(); + return DatabaseCheckpointAccessor(this, CheckpointBase::UNIDENTIFIED_CHECKPOINT); } -std::optional DatabaseCheckpointer::findCheckpoint(chkpt_id_t id) noexcept +DatabaseCheckpointAccessor DatabaseCheckpointer::findCheckpoint(chkpt_id_t id) noexcept { - (void)id; - return std::optional(); + return DatabaseCheckpointAccessor(this, id); } bool DatabaseCheckpointer::hasCheckpoint(chkpt_id_t id) const noexcept { + //TODO cnyce (void)id; return false; } void DatabaseCheckpointer::dumpRestoreChain(std::ostream& o, chkpt_id_t id) const { + //TODO cnyce (void)o; (void)id; } std::stack DatabaseCheckpointer::getHistoryChain(chkpt_id_t id) const { + //TODO cnyce (void)id; return {}; } std::stack DatabaseCheckpointer::getRestoreChain(chkpt_id_t id) const { + //TODO cnyce (void)id; return 
{}; } std::vector DatabaseCheckpointer::getNextIDs(chkpt_id_t id) const { + //TODO cnyce (void)id; return {}; } void DatabaseCheckpointer::load(const std::vector& dats, chkpt_id_t id) { + //TODO cnyce (void)dats; (void)id; } uint32_t DatabaseCheckpointer::getDistanceToPrevSnapshot(chkpt_id_t id) const noexcept { + //TODO cnyce (void)id; return 0; } +bool DatabaseCheckpointer::canDelete(chkpt_id_t id) const noexcept +{ + //TODO cnyce + (void)id; + return true; +} + std::string DatabaseCheckpointer::stringize() const { + //TODO cnyce return ""; } void DatabaseCheckpointer::dumpList(std::ostream& o) const { + //TODO cnyce (void)o; } void DatabaseCheckpointer::dumpData(std::ostream& o) const { + //TODO cnyce (void)o; } void DatabaseCheckpointer::dumpAnnotatedData(std::ostream& o) const { + //TODO cnyce (void)o; } @@ -342,6 +374,7 @@ void DatabaseCheckpointer::traceValue( uint32_t offset, uint32_t size) { + //TODO cnyce (void)o; (void)id; (void)container; @@ -371,9 +404,9 @@ void DatabaseCheckpointer::createHead_() throw exc; } - std::unique_ptr chkpt(new checkpoint_type( + std::shared_ptr chkpt(new checkpoint_type( getRoot(), getArchDatas(), next_chkpt_id_++, tick, - checkpoint_type::UNIDENTIFIED_CHECKPOINT, true, this)); + nullptr, true, this)); setHead_(chkpt.get()); num_alive_checkpoints_++; @@ -429,9 +462,9 @@ chkpt_id_t DatabaseCheckpointer::createCheckpoint_(bool force_snapshot) is_snapshot = prev->getDistanceToPrevSnapshot() >= getSnapshotThreshold(); } - std::unique_ptr chkpt(new checkpoint_type( + std::shared_ptr chkpt(new checkpoint_type( getRoot(), getArchDatas(), next_chkpt_id_++, tick, - prev->getID(), force_snapshot || is_snapshot, this)); + prev, force_snapshot || is_snapshot, this)); auto current = chkpt.get(); setCurrent_(current); @@ -451,30 +484,55 @@ chkpt_id_t DatabaseCheckpointer::createCheckpoint_(bool force_snapshot) void DatabaseCheckpointer::cleanupChain_(chkpt_id_t id) { + // TODO cnyce (void)id; } bool 
DatabaseCheckpointer::recursForwardFindAlive_(chkpt_id_t id) const { + // TODO cnyce (void)id; return false; } -std::optional DatabaseCheckpointer::findCheckpoint_(chkpt_id_t id) const noexcept +DatabaseCheckpointAccessor DatabaseCheckpointer::findCheckpoint_(chkpt_id_t id) noexcept { - (void)id; - return std::optional(); + return DatabaseCheckpointAccessor(this, id); +} + +DatabaseCheckpointAccessor DatabaseCheckpointer::findCheckpoint_(chkpt_id_t id) const noexcept +{ + return DatabaseCheckpointAccessor(this, id); } void DatabaseCheckpointer::dumpCheckpointNode_(const chkpt_id_t id, std::ostream& o) const { - (void)id; - (void)o; + static std::string SNAPSHOT_NOTICE = "(s)"; + auto cp = findCheckpoint_(id); + + // Draw data for this checkpoint + if(cp->isFlaggedDeleted()){ + o << cp->getDeletedRepr(); + }else{ + o << cp->getID(); + } + // Show that this is a snapshot + if(cp->isSnapshot()){ + o << ' ' << SNAPSHOT_NOTICE; + } } std::vector DatabaseCheckpointer::getNextIDs_(chkpt_id_t id) const { - (void)id; + { + std::lock_guard lock(mutex_); + auto it = chkpts_cache_.find(id); + if (it != chkpts_cache_.end()) { + return it->second->getNextIDs(); + } + } + + // TODO cnyce: go to database return {}; } @@ -505,15 +563,18 @@ void DatabaseCheckpointer::setCurrentID_(chkpt_id_t id) current_id_ = id; } -void DatabaseCheckpointer::addToCache_(std::unique_ptr chkpt) +void DatabaseCheckpointer::addToCache_(std::shared_ptr chkpt) { std::lock_guard lock(mutex_); auto id = chkpt->getID(); chkpt_ids_for_pipeline_head_.push(id); - chkpts_cache_[id] = std::move(chkpt); + + auto& cp = chkpts_cache_[id]; + sparta_assert(!cp); + cp = std::move(chkpt); } -bool DatabaseCheckpointer::cloneNextPipelineHeadCheckpoint_(std::unique_ptr& next) +bool DatabaseCheckpointer::cloneNextPipelineHeadCheckpoint_(std::shared_ptr& next) { std::lock_guard lock(mutex_); if (chkpt_ids_for_pipeline_head_.empty()) { diff --git a/sparta/test/FastCheckpoint/DatabaseCheckpoint/DatabaseCheckpoint_test.cpp 
b/sparta/test/FastCheckpoint/DatabaseCheckpoint/DatabaseCheckpoint_test.cpp index edb9a672b3..9982b82b28 100644 --- a/sparta/test/FastCheckpoint/DatabaseCheckpoint/DatabaseCheckpoint_test.cpp +++ b/sparta/test/FastCheckpoint/DatabaseCheckpoint/DatabaseCheckpoint_test.cpp @@ -143,19 +143,15 @@ void generalTest() EXPECT_NOTHROW(dbcp.loadCheckpoint(id)); auto chkpt = dbcp.findCheckpoint(id); - EXPECT_TRUE(chkpt.has_value()); + uint32_t expected_r1 = id * 5ul; + EXPECT_EQUAL(r1->read(), expected_r1); - if (chkpt.has_value()) { - uint32_t expected_r1 = id * 5ul; - EXPECT_EQUAL(r1->read(), expected_r1); + uint32_t expected_r2 = id % 5ul; + EXPECT_EQUAL(r2->read(), expected_r2); - uint32_t expected_r2 = id % 5ul; - EXPECT_EQUAL(r2->read(), expected_r2); - - auto expected_tick = dbcp.getCurrentTick(); - EXPECT_EQUAL(sched.getCurrentTick(), expected_tick); - EXPECT_EQUAL(chkpt->getTick(), expected_tick); - } + auto expected_tick = dbcp.getCurrentTick(); + EXPECT_EQUAL(sched.getCurrentTick(), expected_tick); + EXPECT_EQUAL(chkpt->getTick(), expected_tick); } // Finish... 
From ed2e9b8382013d3936066fb59a08e2d7f34b6067 Mon Sep 17 00:00:00 2001 From: Colby Nyce Date: Wed, 20 Aug 2025 12:32:16 -0500 Subject: [PATCH 04/30] Database-backed checkpointer --- sparta/CMakeLists.txt | 1 + .../checkpoint/DatabaseCheckpointAccessor.tpp | 12 ++ .../checkpoint/DatabaseCheckpointQuery.hpp | 51 +++++ .../checkpoint/DatabaseCheckpointer.hpp | 40 ++-- sparta/src/DatabaseCheckpoint.cpp | 3 +- sparta/src/DatabaseCheckpointQuery.cpp | 57 ++++++ sparta/src/DatabaseCheckpointer.cpp | 190 ++++++++++++++---- 7 files changed, 306 insertions(+), 48 deletions(-) create mode 100644 sparta/sparta/serialization/checkpoint/DatabaseCheckpointQuery.hpp create mode 100644 sparta/src/DatabaseCheckpointQuery.cpp diff --git a/sparta/CMakeLists.txt b/sparta/CMakeLists.txt index e604953fc8..211bdb4acd 100644 --- a/sparta/CMakeLists.txt +++ b/sparta/CMakeLists.txt @@ -42,6 +42,7 @@ list (APPEND SourceCppFiles src/DAG.cpp src/DatabaseCheckpoint.cpp src/DatabaseCheckpointer.cpp + src/DatabaseCheckpointQuery.cpp src/Destination.cpp src/EdgeFactory.cpp src/EventNode.cpp diff --git a/sparta/sparta/serialization/checkpoint/DatabaseCheckpointAccessor.tpp b/sparta/sparta/serialization/checkpoint/DatabaseCheckpointAccessor.tpp index 0821afa535..9b45972136 100644 --- a/sparta/sparta/serialization/checkpoint/DatabaseCheckpointAccessor.tpp +++ b/sparta/sparta/serialization/checkpoint/DatabaseCheckpointAccessor.tpp @@ -48,6 +48,10 @@ uint64_t DatabaseCheckpointAccessor::getContentMemoryUse() const noexce template void DatabaseCheckpointAccessor::load(const std::vector& dats) { + if constexpr (IsConst) { + throw SpartaException("Cannot call load() on a const DatabaseCheckpointAccessor"); + } + //TODO cnyce (void)dats; } @@ -97,6 +101,10 @@ bool DatabaseCheckpointAccessor::canDelete() const noexcept template void DatabaseCheckpointAccessor::flagDeleted() { + if constexpr (IsConst) { + throw SpartaException("Cannot call flagDeleted() on a const DatabaseCheckpointAccessor"); + } + //TODO 
cnyce } @@ -131,6 +139,10 @@ uint32_t DatabaseCheckpointAccessor::getDistanceToPrevSnapshot() const template void DatabaseCheckpointAccessor::loadState(const std::vector& dats) { + if constexpr (IsConst) { + throw SpartaException("Cannot call loadState() on a const DatabaseCheckpointAccessor"); + } + //TODO cnyce (void)dats; } diff --git a/sparta/sparta/serialization/checkpoint/DatabaseCheckpointQuery.hpp b/sparta/sparta/serialization/checkpoint/DatabaseCheckpointQuery.hpp new file mode 100644 index 0000000000..8bd4ea0028 --- /dev/null +++ b/sparta/sparta/serialization/checkpoint/DatabaseCheckpointQuery.hpp @@ -0,0 +1,51 @@ +// -*- C++ -*- + +#pragma once + +#include +#include + +namespace simdb +{ + class DatabaseManager; +} + +namespace sparta::serialization::checkpoint +{ + +/*! + * \brief SQLite query object to "extend" the checkpoint search space from just the + * cache to include the database. Combinations of in-memory checkpoints, recreated + * checkpoints, and database schema/query optimizations are used for performance. + */ +class DatabaseCheckpointQuery +{ +public: + //! \brief Construct with a SimDB instance + DatabaseCheckpointQuery(simdb::DatabaseManager* db_mgr) + : db_mgr_(db_mgr) + {} + + using chkpt_id_t = uint64_t; + using tick_t = uint64_t; + + bool hasCheckpoint(chkpt_id_t id) const noexcept; + + chkpt_id_t getPrevID(chkpt_id_t id) const; + + std::vector getNextIDs(chkpt_id_t id) const; + + std::vector getCheckpointsAt(tick_t t) const; + + std::vector getCheckpoints() const; + + uint32_t getDistanceToPrevSnapshot(chkpt_id_t id) const noexcept; + + bool canDelete(chkpt_id_t id) const noexcept; + +private: + //! 
\brief SimDB instance + simdb::DatabaseManager* db_mgr_ = nullptr; +}; + +} // namespace sparta::serialization::checkpoint diff --git a/sparta/sparta/serialization/checkpoint/DatabaseCheckpointer.hpp b/sparta/sparta/serialization/checkpoint/DatabaseCheckpointer.hpp index 9a0a2ab927..2670af6289 100644 --- a/sparta/sparta/serialization/checkpoint/DatabaseCheckpointer.hpp +++ b/sparta/sparta/serialization/checkpoint/DatabaseCheckpointer.hpp @@ -17,6 +17,7 @@ namespace sparta::serialization::checkpoint { class DatabaseCheckpointer; +class DatabaseCheckpointQuery; /*! * \brief Implementation of the FastCheckpointer which only holds @@ -46,15 +47,7 @@ class DatabaseCheckpointer : public simdb::App, public Checkpointer * \param sched Scheduler to read and restart on checkpoint restore (if * not nullptr) */ - DatabaseCheckpointer(simdb::DatabaseManager* db_mgr, TreeNode& root, Scheduler* sched=nullptr) : - Checkpointer(root, sched), - db_mgr_(db_mgr), - snap_thresh_(DEFAULT_SNAPSHOT_THRESH), - next_chkpt_id_(checkpoint_type::MIN_CHECKPOINT), - num_alive_checkpoints_(0), - num_alive_snapshots_(0), - num_dead_checkpoints_(0) - { } + DatabaseCheckpointer(simdb::DatabaseManager* db_mgr, TreeNode& root, Scheduler* sched=nullptr); /*! * \brief Define the SimDB schema for this checkpointer. @@ -219,7 +212,7 @@ class DatabaseCheckpointer : public simdb::App, public Checkpointer * deleted * \return Checkpoint with ID of \a id if found or nullptr if not found */ - DatabaseCheckpointAccessor findCheckpoint(chkpt_id_t id) noexcept; + DatabaseCheckpointAccessor findCheckpoint(chkpt_id_t id); /*! * \brief Tests whether this checkpoint manager has a checkpoint with @@ -270,12 +263,30 @@ class DatabaseCheckpointer : public simdb::App, public Checkpointer void load(const std::vector& dats, chkpt_id_t id); /*! - * \brief TODO cnyce + * \brief Determines how many checkpoints away the closest, earlier + * snapshot is. + * \return distance to closest snapshot. 
If this node is a snapshot, + * returns 0; if immediate getPrev() is a snapshot, returns 1; and + * so on. + * + * \note This is a noexcept function, which means that the exception if + * no snapshot is encountered is uncatchable. This is intentional. */ uint32_t getDistanceToPrevSnapshot(chkpt_id_t id) const noexcept; /*! - * \brief TODO cnyce + * \brief Check if the given checkpoint is a snapshot (not a delta). + * \return Returns false if not a snapshot or the id is not a checkpoint. + */ + bool isSnapshot(chkpt_id_t id) const noexcept; + + /*! + * \brief Can this checkpoint be deleted + * Cannot be deleted if: + * \li This checkpoint has any ancestors which are not deletable and not snapshots + * \li This checkpoint was not flagged for deletion with flagDeleted + * \warning This is a recursive search of a checkpoint tree which has potentially many + * branches and could have high time cost */ bool canDelete(chkpt_id_t id) const noexcept; @@ -441,6 +452,11 @@ class DatabaseCheckpointer : public simdb::App, public Checkpointer //! random access. std::queue chkpt_ids_for_pipeline_head_; + //! \brief SQLite query object to "extend" the checkpoint search space from just the + //! cache to include the database. Combinations of in-memory checkpoints, recreated + //! checkpoints, and database schema/query optimizations are used for performance. + std::shared_ptr chkpt_query_; + //! \brief Mutex to protect our checkpoints cache. 
mutable std::mutex mutex_; diff --git a/sparta/src/DatabaseCheckpoint.cpp b/sparta/src/DatabaseCheckpoint.cpp index 9d760f2705..f0af149285 100644 --- a/sparta/src/DatabaseCheckpoint.cpp +++ b/sparta/src/DatabaseCheckpoint.cpp @@ -117,8 +117,7 @@ std::vector DatabaseCheckpoint::getNextIDs() const void DatabaseCheckpoint::load(const std::vector& dats) { - //TODO cnyce - (void)dats; + checkpointer_->load(dats, getID()); } bool DatabaseCheckpoint::canDelete() const noexcept diff --git a/sparta/src/DatabaseCheckpointQuery.cpp b/sparta/src/DatabaseCheckpointQuery.cpp new file mode 100644 index 0000000000..948dd2c270 --- /dev/null +++ b/sparta/src/DatabaseCheckpointQuery.cpp @@ -0,0 +1,57 @@ +#include "sparta/serialization/checkpoint/DatabaseCheckpointQuery.hpp" + +namespace sparta::serialization::checkpoint +{ + +using chkpt_id_t = typename DatabaseCheckpointQuery::chkpt_id_t; +using tick_t = typename DatabaseCheckpointQuery::tick_t; + +bool DatabaseCheckpointQuery::hasCheckpoint(chkpt_id_t id) const noexcept +{ + //TODO cnyce + (void)id; + return false; +} + +chkpt_id_t DatabaseCheckpointQuery::getPrevID(chkpt_id_t id) const +{ + //TODO cnyce + (void)id; + return 0; +} + +std::vector DatabaseCheckpointQuery::getNextIDs(chkpt_id_t id) const +{ + //TODO cnyce + (void)id; + return {}; +} + +std::vector DatabaseCheckpointQuery::getCheckpointsAt(tick_t t) const +{ + //TODO cnyce + (void)t; + return {}; +} + +std::vector DatabaseCheckpointQuery::getCheckpoints() const +{ + //TODO cnyce + return {}; +} + +uint32_t DatabaseCheckpointQuery::getDistanceToPrevSnapshot(chkpt_id_t id) const noexcept +{ + //TODO cnyce + (void)id; + return 0; +} + +bool DatabaseCheckpointQuery::canDelete(chkpt_id_t id) const noexcept +{ + //TODO cnyce + (void)id; + return false; +} + +} // namespace sparta::serialization::checkpoint diff --git a/sparta/src/DatabaseCheckpointer.cpp b/sparta/src/DatabaseCheckpointer.cpp index 37d82ec1bd..0e7746add9 100644 --- a/sparta/src/DatabaseCheckpointer.cpp +++ 
b/sparta/src/DatabaseCheckpointer.cpp @@ -1,6 +1,7 @@ // -*- C++ -*- #include "sparta/serialization/checkpoint/DatabaseCheckpointer.hpp" +#include "sparta/serialization/checkpoint/DatabaseCheckpointQuery.hpp" #include "simdb/apps/AppRegistration.hpp" #include "simdb/schema/SchemaDef.hpp" #include "simdb/pipeline/AsyncDatabaseAccessor.hpp" @@ -60,6 +61,17 @@ struct ChkptWindowBytes { using EvictedChkptIDs = std::vector; +DatabaseCheckpointer::DatabaseCheckpointer(simdb::DatabaseManager* db_mgr, TreeNode& root, Scheduler* sched) : + Checkpointer(root, sched), + chkpt_query_(std::make_shared(db_mgr)), + db_mgr_(db_mgr), + snap_thresh_(DEFAULT_SNAPSHOT_THRESH), + next_chkpt_id_(checkpoint_type::MIN_CHECKPOINT), + num_alive_checkpoints_(0), + num_alive_snapshots_(0), + num_dead_checkpoints_(0) +{ } + void DatabaseCheckpointer::defineSchema(simdb::Schema& schema) { using dt = simdb::SqlDataType; @@ -209,14 +221,22 @@ void DatabaseCheckpointer::setSnapshotThreshold(uint32_t thresh) noexcept uint64_t DatabaseCheckpointer::getTotalMemoryUse() const noexcept { - //TODO cnyce - return 0; + std::lock_guard lock(mutex_); + uint64_t mem = 0; + for (const auto& [id, chkpt] : chkpts_cache_) { + mem += chkpt->getTotalMemoryUse(); + } + return mem; } uint64_t DatabaseCheckpointer::getContentMemoryUse() const noexcept { - //TODO cnyce - return 0; + std::lock_guard lock(mutex_); + uint64_t mem = 0; + for (const auto& [id, chkpt] : chkpts_cache_) { + mem += chkpt->getTotalMemoryUse(); + } + return mem; } void DatabaseCheckpointer::deleteCheckpoint(chkpt_id_t id) @@ -233,65 +253,116 @@ void DatabaseCheckpointer::loadCheckpoint(chkpt_id_t id) std::vector DatabaseCheckpointer::getCheckpointsAt(tick_t t) const { - //TODO cnyce - (void)t; - return {}; + std::lock_guard lock(mutex_); + + std::vector results; + for (const auto& [id, chkpt] : chkpts_cache_) { + if (chkpt->getTick() == t && !chkpt->isFlaggedDeleted()) { + results.push_back(id); + } + } + + for (auto id : 
chkpt_query_->getCheckpointsAt(t)) { + results.push_back(id); + } + + return results; } std::vector DatabaseCheckpointer::getCheckpoints() const { - //TODO cnyce - return {}; + std::lock_guard lock(mutex_); + + std::vector results; + for (const auto& [id, chkpt] : chkpts_cache_) { + if (!chkpt->isFlaggedDeleted()) { + results.push_back(id); + } + } + + for (auto id : chkpt_query_->getCheckpoints()) { + results.push_back(id); + } + + return results; } uint32_t DatabaseCheckpointer::getNumCheckpoints() const noexcept { - //TODO cnyce - return 0; + return num_alive_checkpoints_; } uint32_t DatabaseCheckpointer::getNumSnapshots() const noexcept { - //TODO cnyce - return 0; + return num_alive_snapshots_; } uint32_t DatabaseCheckpointer::getNumDeltas() const noexcept { - //TODO cnyce - return 0; + return getNumCheckpoints() - getNumSnapshots(); } uint32_t DatabaseCheckpointer::getNumDeadCheckpoints() const noexcept { - //TODO cnyce - return 0; + return num_dead_checkpoints_; } std::deque DatabaseCheckpointer::getCheckpointChain(chkpt_id_t id) const { - //TODO cnyce - return {}; + std::lock_guard lock(mutex_); + + std::deque chain; + if (!getHead()) { + return chain; + } + + if (!hasCheckpoint(id)) { + throw CheckpointError("There is no checkpoint with ID ") << id; + } + + auto it = chkpts_cache_.find(id); + while (it != chkpts_cache_.end()) { + chain.push_back(id); + id = it->second->getPrevID(); + it = chkpts_cache_.find(id); + } + + while (id != checkpoint_type::UNIDENTIFIED_CHECKPOINT) { + chain.push_back(id); + id = chkpt_query_->getPrevID(id); + } + + return chain; } DatabaseCheckpointAccessor DatabaseCheckpointer::findLatestCheckpointAtOrBefore(tick_t tick, chkpt_id_t from) { + if (!hasCheckpoint(from)) { + throw SpartaException("Invalid checkpoint ID"); + } + //TODO cnyce (void)tick; - (void)from; return DatabaseCheckpointAccessor(this, CheckpointBase::UNIDENTIFIED_CHECKPOINT); } -DatabaseCheckpointAccessor DatabaseCheckpointer::findCheckpoint(chkpt_id_t id) 
noexcept +DatabaseCheckpointAccessor DatabaseCheckpointer::findCheckpoint(chkpt_id_t id) { + if (!hasCheckpoint(id)) { + throw SpartaException("Invalid checkpoint ID"); + } + return DatabaseCheckpointAccessor(this, id); } bool DatabaseCheckpointer::hasCheckpoint(chkpt_id_t id) const noexcept { - //TODO cnyce - (void)id; - return false; + std::lock_guard lock(mutex_); + if (chkpts_cache_.find(id) != chkpts_cache_.end()) { + return true; + } + + return chkpt_query_->hasCheckpoint(id); } void DatabaseCheckpointer::dumpRestoreChain(std::ostream& o, chkpt_id_t id) const @@ -303,9 +374,22 @@ void DatabaseCheckpointer::dumpRestoreChain(std::ostream& o, chkpt_id_t id) cons std::stack DatabaseCheckpointer::getHistoryChain(chkpt_id_t id) const { - //TODO cnyce - (void)id; - return {}; + std::lock_guard lock(mutex_); + + std::stack chain; + auto it = chkpts_cache_.find(id); + while (it != chkpts_cache_.end()) { + chain.push(id); + id = it->second->getPrevID(); + it = chkpts_cache_.find(id); + } + + while (id != checkpoint_type::UNIDENTIFIED_CHECKPOINT) { + chain.push(id); + id = chkpt_query_->getPrevID(id); + } + + return chain; } std::stack DatabaseCheckpointer::getRestoreChain(chkpt_id_t id) const @@ -317,9 +401,14 @@ std::stack DatabaseCheckpointer::getRestoreChain(chkpt_id_t id) cons std::vector DatabaseCheckpointer::getNextIDs(chkpt_id_t id) const { - //TODO cnyce - (void)id; - return {}; + std::lock_guard lock(mutex_); + + auto it = chkpts_cache_.find(id); + if (it != chkpts_cache_.end()) { + return it->second->getNextIDs(); + } + + return chkpt_query_->getNextIDs(id); } void DatabaseCheckpointer::load(const std::vector& dats, chkpt_id_t id) @@ -330,17 +419,50 @@ void DatabaseCheckpointer::load(const std::vector& dats, chkpt_id_t i } uint32_t DatabaseCheckpointer::getDistanceToPrevSnapshot(chkpt_id_t id) const noexcept +{ + std::lock_guard lock(mutex_); + + uint32_t dist = 0; + auto it = chkpts_cache_.find(id); + while (it != chkpts_cache_.end()) { + if 
(it->second->isSnapshot()) { + return dist; + } + id = it->second->getPrevID(); + it = chkpts_cache_.find(id); + ++dist; + } + + return chkpt_query_->getDistanceToPrevSnapshot(id); +} + +bool DatabaseCheckpointer::isSnapshot(chkpt_id_t id) const noexcept { //TODO cnyce (void)id; - return 0; + return false; } bool DatabaseCheckpointer::canDelete(chkpt_id_t id) const noexcept { - //TODO cnyce - (void)id; - return true; + std::lock_guard lock(mutex_); + + auto it = chkpts_cache_.find(id); + if (it == chkpts_cache_.end()) { + return chkpt_query_->canDelete(id); + } + + if (!it->second->isFlaggedDeleted()) { + return false; + } + + for (auto next_id : getNextIDs(id)) { + if (!canDelete(next_id) && !isSnapshot(next_id)) { + return false; + } + } + + return false; } std::string DatabaseCheckpointer::stringize() const From 8d404a1d1d31be968421ac3b0fcb76152a547477 Mon Sep 17 00:00:00 2001 From: Colby Nyce Date: Wed, 20 Aug 2025 13:20:56 -0500 Subject: [PATCH 05/30] Database-backed checkpointer --- .../checkpoint/DatabaseCheckpoint.hpp | 24 +-- .../checkpoint/DatabaseCheckpointAccessor.hpp | 183 ------------------ .../checkpoint/DatabaseCheckpointAccessor.tpp | 150 -------------- .../checkpoint/DatabaseCheckpointBase.hpp | 115 ----------- .../checkpoint/DatabaseCheckpointQuery.hpp | 50 +++-- .../checkpoint/DatabaseCheckpointer.hpp | 9 +- sparta/src/DatabaseCheckpoint.cpp | 4 +- sparta/src/DatabaseCheckpointQuery.cpp | 104 +++++++++- sparta/src/DatabaseCheckpointer.cpp | 23 ++- 9 files changed, 166 insertions(+), 496 deletions(-) delete mode 100644 sparta/sparta/serialization/checkpoint/DatabaseCheckpointAccessor.hpp delete mode 100644 sparta/sparta/serialization/checkpoint/DatabaseCheckpointAccessor.tpp delete mode 100644 sparta/sparta/serialization/checkpoint/DatabaseCheckpointBase.hpp diff --git a/sparta/sparta/serialization/checkpoint/DatabaseCheckpoint.hpp b/sparta/sparta/serialization/checkpoint/DatabaseCheckpoint.hpp index d91abe832d..5a9317c152 100644 --- 
a/sparta/sparta/serialization/checkpoint/DatabaseCheckpoint.hpp +++ b/sparta/sparta/serialization/checkpoint/DatabaseCheckpoint.hpp @@ -2,7 +2,9 @@ #pragma once -#include "sparta/serialization/checkpoint/DatabaseCheckpointBase.hpp" +#include "sparta/serialization/checkpoint/CheckpointBase.hpp" +#include "sparta/serialization/checkpoint/CheckpointExceptions.hpp" +#include "sparta/serialization/checkpoint/VectorStorage.hpp" namespace sparta::serialization::checkpoint { @@ -12,7 +14,7 @@ namespace sparta::serialization::checkpoint * \brief Checkpoint class optimized for use with database-backed * checkpointers. */ - class DatabaseCheckpoint : public DatabaseCheckpointBase + class DatabaseCheckpoint : public CheckpointBase { public: @@ -100,14 +102,14 @@ namespace sparta::serialization::checkpoint * to be inspected for restoring this checkpoint's data. This may reach * the head checkpoint if no gaps are encountered. */ - std::stack getHistoryChain() const override; + std::stack getHistoryChain() const; /*! * \brief Returns a stack of checkpoints that must be restored from * top-to-bottom to fully restore the state associated with this * checkpoint. */ - std::stack getRestoreChain() const override; + std::stack getRestoreChain() const; /*! * \brief Get the ID of our previous checkpoint. Returns UNIDENTIFIED_CHECKPOINT @@ -139,7 +141,7 @@ namespace sparta::serialization::checkpoint * \warning This is a recursive search of a checkpoint tree which has potentially many * branches and could have high time cost */ - bool canDelete() const noexcept override; + bool canDelete() const noexcept; /*! * \brief Allows this checkpoint to be deleted if it is no longer a @@ -152,7 +154,7 @@ namespace sparta::serialization::checkpoint * \see canDelete * \see isFlaggedDeleted */ - void flagDeleted() override; + void flagDeleted(); /*! * \brief Indicates whether this checkpoint has been flagged deleted. 
@@ -161,14 +163,14 @@ namespace sparta::serialization::checkpoint * \note If false, Checkpoint ID will also be UNIDENTIFIED_CHECKPOINT * \see flagDeleted() */ - bool isFlaggedDeleted() const noexcept override; + bool isFlaggedDeleted() const noexcept; /*! * \brief Return the ID had by this checkpoint before it was deleted * If this checkpoint has not been flagged for deletion, this will be * UNIDENTIFIED_CHECKPOINT */ - chkpt_id_t getDeletedID() const noexcept override; + chkpt_id_t getDeletedID() const noexcept; /*! * \brief Gets the representation of this deleted checkpoint as part of @@ -181,7 +183,7 @@ namespace sparta::serialization::checkpoint /*! * \brief Is this checkpoint a snapshot (contains ALL simulator state) */ - bool isSnapshot() const noexcept override; + bool isSnapshot() const noexcept; /*! * \brief Determines how many checkpoints away the closest, earlier @@ -193,14 +195,14 @@ namespace sparta::serialization::checkpoint * \note This is a noexcept function, which means that the exception if * no snapshot is encountered is uncatchable. This is intentional. */ - uint32_t getDistanceToPrevSnapshot() const noexcept override; + uint32_t getDistanceToPrevSnapshot() const noexcept; /*! * \brief Loads delta state of this checkpoint to root. * Does not look at any other checkpoints checkpoints. * \see load */ - void loadState(const std::vector& dats) override; + void loadState(const std::vector& dats); /*! * \brief Create a deep copy of this checkpoint. 
diff --git a/sparta/sparta/serialization/checkpoint/DatabaseCheckpointAccessor.hpp b/sparta/sparta/serialization/checkpoint/DatabaseCheckpointAccessor.hpp deleted file mode 100644 index 0a4b587d82..0000000000 --- a/sparta/sparta/serialization/checkpoint/DatabaseCheckpointAccessor.hpp +++ /dev/null @@ -1,183 +0,0 @@ -// -*- C++ -*- - -#pragma once - -#include "sparta/serialization/checkpoint/DatabaseCheckpointBase.hpp" - -namespace sparta::serialization::checkpoint -{ - -class DatabaseCheckpoint; -class DatabaseCheckpointer; - -/*! - * \brief This class wraps a DatabaseCheckpoint and recreates it from disk - * if the checkpoint no longer exists in the checkpointer in memory. - */ -template -class DatabaseCheckpointAccessor : public DatabaseCheckpointBase -{ -public: - using db_checkpointer = std::conditional_t; - using db_checkpoint = std::conditional_t; - - //! Constructor - DatabaseCheckpointAccessor(db_checkpointer* checkpointer, chkpt_id_t id); - - //! Moves allowed - DatabaseCheckpointAccessor(DatabaseCheckpointAccessor&&) = default; - - //! Copies disallowed - DatabaseCheckpointAccessor(const DatabaseCheckpointAccessor&) = delete; - - //! Move assignment disallowed - DatabaseCheckpointAccessor& operator=(DatabaseCheckpointAccessor&&) = delete; - - //! Copy assignment disallowed - DatabaseCheckpointAccessor& operator=(const DatabaseCheckpointAccessor&) = delete; - - //! For parity with all the other in-memory checkpoint types. - DatabaseCheckpointAccessor* operator->() { return this; } - - //! For parity with all the other in-memory checkpoint types. - const DatabaseCheckpointAccessor* operator->() const { return this; } - - //! Destructor - ~DatabaseCheckpointAccessor(); - - /*! - * \brief Returns a string describing this object - */ - std::string stringize() const override; - - /*! 
- * \brief Writes all checkpoint raw data to an ostream - * \param o ostream to which raw data will be written - * \note No newlines or other extra characters will be appended - */ - void dumpData(std::ostream& o) const override; - - /*! - * \brief Returns memory usage by this checkpoint including any - * framework data structures - */ - uint64_t getTotalMemoryUse() const noexcept override; - - /*! - * \brief Returns memory usage by this checkpoint solely for the - * checkpointed content. - */ - uint64_t getContentMemoryUse() const noexcept override; - - /*! - * \brief Attempts to restore this checkpoint state to the simulation - * state (ArchData) objects given to this Checkpoint at construction - */ - void load(const std::vector& dats) override; - - /*! - * \brief Get the ID of our previous checkpoint. Returns UNIDENTIFIED_CHECKPOINT - * if we have no previous checkpoint, as is the case with the head checkpoint - * and snapshots. - */ - chkpt_id_t getPrevID() const override; - - /*! - * \brief Returns next checkpoint following *this. May be an empty - * vector if there are no later checkpoints. - */ - std::vector getNextIDs() const override; - - /*! - * \brief Gets the representation of this deleted checkpoint as part of - * a checkpoint chain (if that checkpointer supports deletion) - */ - std::string getDeletedRepr() const override; - - /*! - * \brief Returns a stack of checkpoints from this checkpoint as far - * back as possible until no previous link is found. This is a superset - * of getRestoreChain and contains checkpoints that do not actually need - * to be inspected for restoring this checkpoint's data. This may reach - * the head checkpoint if no gaps are encountered. - */ - std::stack getHistoryChain() const override; - - /*! - * \brief Returns a stack of checkpoints that must be restored from - * top-to-bottom to fully restore the state associated with this - * checkpoint. - */ - std::stack getRestoreChain() const override; - - /*! 
- * \brief Can this checkpoint be deleted - * Cannot be deleted if: - * \li This checkpoint has any ancestors which are not deletable and not snapshots - * \li This checkpoint was not flagged for deletion with flagDeleted - * \warning This is a recursive search of a checkpoint tree which has potentially many - * branches and could have high time cost - */ - bool canDelete() const noexcept override; - - /*! - * \brief Allows this checkpoint to be deleted if it is no longer a - * previous delta of some other delta (i.e. getNexts() returns an - * empty vector). Sets the checkpoint ID to invalid. Calling multiple - * times has no effect - * \pre Must not already be flagged deleted - * \post isFlaggedDeleted() will return true - * \post getDeletedID() will return the current ID (if any) - * \see canDelete - * \see isFlaggedDeleted - */ - void flagDeleted() override; - - /*! - * \brief Indicates whether this checkpoint has been flagged deleted. - * \note Does not imply that the checkpoint can safely be deleted; - * only that it was flagged for deletion. - * \note If false, Checkpoint ID will also be UNIDENTIFIED_CHECKPOINT - * \see flagDeleted() - */ - bool isFlaggedDeleted() const noexcept override; - - /*! - * \brief Return the ID had by this checkpoint before it was deleted - * If this checkpoint has not been flagged for deletion, this will be - * UNIDENTIFIED_CHECKPOINT - */ - chkpt_id_t getDeletedID() const noexcept override; - - /*! - * \brief Is this checkpoint a snapshot (contains ALL simulator state) - */ - bool isSnapshot() const noexcept override; - - /*! - * \brief Determines how many checkpoints away the closest, earlier - * snapshot is. - * \return distance to closest snapshot. If this node is a snapshot, - * returns 0; if immediate getPrev() is a snapshot, returns 1; and - * so on. - * - * \note This is a noexcept function, which means that the exception if - * no snapshot is encountered is uncatchable. This is intentional. 
- */ - uint32_t getDistanceToPrevSnapshot() const noexcept override; - - /*! - * \brief Loads delta state of this checkpoint to root. - * Does not look at any other checkpoints checkpoints. - * \see load - */ - void loadState(const std::vector& dats) override; - -private: - db_checkpointer* checkpointer_; - chkpt_id_t id_; -}; - -} // namespace sparta::serialization::checkpoint - -#include "sparta/serialization/checkpoint/DatabaseCheckpointAccessor.tpp" diff --git a/sparta/sparta/serialization/checkpoint/DatabaseCheckpointAccessor.tpp b/sparta/sparta/serialization/checkpoint/DatabaseCheckpointAccessor.tpp deleted file mode 100644 index 9b45972136..0000000000 --- a/sparta/sparta/serialization/checkpoint/DatabaseCheckpointAccessor.tpp +++ /dev/null @@ -1,150 +0,0 @@ -namespace sparta::serialization::checkpoint -{ - -using chkpt_id_t = typename CheckpointBase::chkpt_id_t; - -template -DatabaseCheckpointAccessor::DatabaseCheckpointAccessor(db_checkpointer* checkpointer, chkpt_id_t id) -{ - //TODO cnyce - (void)checkpointer; - (void)id; -} - -template -DatabaseCheckpointAccessor::~DatabaseCheckpointAccessor() -{ - //TODO cnyce -} - -template -std::string DatabaseCheckpointAccessor::stringize() const -{ - //TODO cnyce - return ""; -} - -template -void DatabaseCheckpointAccessor::dumpData(std::ostream& o) const -{ - //TODO cnyce - (void)o; -} - -template -uint64_t DatabaseCheckpointAccessor::getTotalMemoryUse() const noexcept -{ - //TODO cnyce - return 0; -} - -template -uint64_t DatabaseCheckpointAccessor::getContentMemoryUse() const noexcept -{ - //TODO cnyce - return 0; -} - -template -void DatabaseCheckpointAccessor::load(const std::vector& dats) -{ - if constexpr (IsConst) { - throw SpartaException("Cannot call load() on a const DatabaseCheckpointAccessor"); - } - - //TODO cnyce - (void)dats; -} - -template -chkpt_id_t DatabaseCheckpointAccessor::getPrevID() const -{ - //TODO cnyce - return 0; -} - -template -std::vector DatabaseCheckpointAccessor::getNextIDs() const 
-{ - //TODO cnyce - return {}; -} - -template -std::string DatabaseCheckpointAccessor::getDeletedRepr() const -{ - //TODO cnyce - return ""; -} - -template -std::stack DatabaseCheckpointAccessor::getHistoryChain() const -{ - //TODO cnyce - return {}; -} - -template -std::stack DatabaseCheckpointAccessor::getRestoreChain() const -{ - //TODO cnyce - return {}; -} - -template -bool DatabaseCheckpointAccessor::canDelete() const noexcept -{ - //TODO cnyce - return false; -} - -template -void DatabaseCheckpointAccessor::flagDeleted() -{ - if constexpr (IsConst) { - throw SpartaException("Cannot call flagDeleted() on a const DatabaseCheckpointAccessor"); - } - - //TODO cnyce -} - -template -bool DatabaseCheckpointAccessor::isFlaggedDeleted() const noexcept -{ - //TODO cnyce - return false; -} - -template -chkpt_id_t DatabaseCheckpointAccessor::getDeletedID() const noexcept -{ - //TODO cnyce - return 0; -} - -template -bool DatabaseCheckpointAccessor::isSnapshot() const noexcept -{ - //TODO cnyce - return false; -} - -template -uint32_t DatabaseCheckpointAccessor::getDistanceToPrevSnapshot() const noexcept -{ - //TODO cnyce - return 0; -} - -template -void DatabaseCheckpointAccessor::loadState(const std::vector& dats) -{ - if constexpr (IsConst) { - throw SpartaException("Cannot call loadState() on a const DatabaseCheckpointAccessor"); - } - - //TODO cnyce - (void)dats; -} - -} // namespace sparta::serialization::checkpoint diff --git a/sparta/sparta/serialization/checkpoint/DatabaseCheckpointBase.hpp b/sparta/sparta/serialization/checkpoint/DatabaseCheckpointBase.hpp deleted file mode 100644 index 67fd9cd1c0..0000000000 --- a/sparta/sparta/serialization/checkpoint/DatabaseCheckpointBase.hpp +++ /dev/null @@ -1,115 +0,0 @@ -// -*- C++ -*- - -#pragma once - -#include "sparta/serialization/checkpoint/CheckpointBase.hpp" -#include "sparta/serialization/checkpoint/CheckpointExceptions.hpp" -#include "sparta/serialization/checkpoint/VectorStorage.hpp" - -#include - -namespace 
sparta::serialization::checkpoint -{ - class DatabaseCheckpointer; - - /*! - * \brief Checkpoint class optimized for use with database-backed - * checkpointers. - */ - class DatabaseCheckpointBase : public CheckpointBase - { - public: - /*! - * \brief Forwarding constructor - */ - template - DatabaseCheckpointBase(Args&&... args) - : CheckpointBase(std::forward(args)...) - {} - - /*! - * \brief Destructor - */ - virtual ~DatabaseCheckpointBase() = default; - - /*! - * \brief Returns a stack of checkpoints from this checkpoint as far - * back as possible until no previous link is found. This is a superset - * of getRestoreChain and contains checkpoints that do not actually need - * to be inspected for restoring this checkpoint's data. This may reach - * the head checkpoint if no gaps are encountered. - */ - virtual std::stack getHistoryChain() const = 0; - - /*! - * \brief Returns a stack of checkpoints that must be restored from - * top-to-bottom to fully restore the state associated with this - * checkpoint. - */ - virtual std::stack getRestoreChain() const = 0; - - /*! - * \brief Can this checkpoint be deleted - * Cannot be deleted if: - * \li This checkpoint has any ancestors which are not deletable and not snapshots - * \li This checkpoint was not flagged for deletion with flagDeleted - * \warning This is a recursive search of a checkpoint tree which has potentially many - * branches and could have high time cost - */ - virtual bool canDelete() const noexcept = 0; - - /*! - * \brief Allows this checkpoint to be deleted if it is no longer a - * previous delta of some other delta (i.e. getNexts() returns an - * empty vector). Sets the checkpoint ID to invalid. Calling multiple - * times has no effect - * \pre Must not already be flagged deleted - * \post isFlaggedDeleted() will return true - * \post getDeletedID() will return the current ID (if any) - * \see canDelete - * \see isFlaggedDeleted - */ - virtual void flagDeleted() = 0; - - /*! 
- * \brief Indicates whether this checkpoint has been flagged deleted. - * \note Does not imply that the checkpoint can safely be deleted; - * only that it was flagged for deletion. - * \note If false, Checkpoint ID will also be UNIDENTIFIED_CHECKPOINT - * \see flagDeleted() - */ - virtual bool isFlaggedDeleted() const noexcept = 0; - - /*! - * \brief Return the ID had by this checkpoint before it was deleted - * If this checkpoint has not been flagged for deletion, this will be - * UNIDENTIFIED_CHECKPOINT - */ - virtual chkpt_id_t getDeletedID() const noexcept = 0; - - /*! - * \brief Is this checkpoint a snapshot (contains ALL simulator state) - */ - virtual bool isSnapshot() const noexcept = 0; - - /*! - * \brief Determines how many checkpoints away the closest, earlier - * snapshot is. - * \return distance to closest snapshot. If this node is a snapshot, - * returns 0; if immediate getPrev() is a snapshot, returns 1; and - * so on. - * - * \note This is a noexcept function, which means that the exception if - * no snapshot is encountered is uncatchable. This is intentional. - */ - virtual uint32_t getDistanceToPrevSnapshot() const noexcept = 0; - - /*! - * \brief Loads delta state of this checkpoint to root. - * Does not look at any other checkpoints checkpoints. - * \see load - */ - virtual void loadState(const std::vector& dats) = 0; - }; - -} // namespace sparta::serialization::checkpoint diff --git a/sparta/sparta/serialization/checkpoint/DatabaseCheckpointQuery.hpp b/sparta/sparta/serialization/checkpoint/DatabaseCheckpointQuery.hpp index 8bd4ea0028..b803580f02 100644 --- a/sparta/sparta/serialization/checkpoint/DatabaseCheckpointQuery.hpp +++ b/sparta/sparta/serialization/checkpoint/DatabaseCheckpointQuery.hpp @@ -2,8 +2,7 @@ #pragma once -#include -#include +#include "sparta/serialization/checkpoint/Checkpointer.hpp" namespace simdb { @@ -18,32 +17,57 @@ namespace sparta::serialization::checkpoint * cache to include the database. 
Combinations of in-memory checkpoints, recreated * checkpoints, and database schema/query optimizations are used for performance. */ -class DatabaseCheckpointQuery +class DatabaseCheckpointQuery : public Checkpointer { public: - //! \brief Construct with a SimDB instance - DatabaseCheckpointQuery(simdb::DatabaseManager* db_mgr) - : db_mgr_(db_mgr) + DatabaseCheckpointQuery(simdb::DatabaseManager* db_mgr, TreeNode& root, sparta::Scheduler* sched=nullptr) + : Checkpointer(root, sched) + , db_mgr_(db_mgr) {} - using chkpt_id_t = uint64_t; - using tick_t = uint64_t; + uint64_t getTotalMemoryUse() const noexcept override; - bool hasCheckpoint(chkpt_id_t id) const noexcept; + uint64_t getContentMemoryUse() const noexcept override; - chkpt_id_t getPrevID(chkpt_id_t id) const; + bool hasCheckpoint(chkpt_id_t id) const noexcept override; - std::vector getNextIDs(chkpt_id_t id) const; + void deleteCheckpoint(chkpt_id_t id) override; + + void loadCheckpoint(chkpt_id_t id) override; + + std::vector getCheckpointsAt(tick_t t) const override; + + std::vector getCheckpoints() const override; + + uint32_t getNumCheckpoints() const noexcept override; + + std::deque getCheckpointChain(chkpt_id_t id) const override; + + void dumpList(std::ostream& o) const override; + + void dumpData(std::ostream& o) const override; - std::vector getCheckpointsAt(tick_t t) const; + void dumpAnnotatedData(std::ostream& o) const override; - std::vector getCheckpoints() const; + void traceValue(std::ostream& o, chkpt_id_t id, const ArchData* container, uint32_t offset, uint32_t size) override; + + chkpt_id_t getPrevID(chkpt_id_t id) const; + + std::vector getNextIDs(chkpt_id_t id) const; uint32_t getDistanceToPrevSnapshot(chkpt_id_t id) const noexcept; bool canDelete(chkpt_id_t id) const noexcept; private: + void createHead_() override; + + chkpt_id_t createCheckpoint_(bool force_snapshot=false) override; + + void dumpCheckpointNode_(const chkpt_id_t id, std::ostream& o) const override; + + std::vector 
getNextIDs_(chkpt_id_t id) const override; + //! \brief SimDB instance simdb::DatabaseManager* db_mgr_ = nullptr; }; diff --git a/sparta/sparta/serialization/checkpoint/DatabaseCheckpointer.hpp b/sparta/sparta/serialization/checkpoint/DatabaseCheckpointer.hpp index 2670af6289..e70677147b 100644 --- a/sparta/sparta/serialization/checkpoint/DatabaseCheckpointer.hpp +++ b/sparta/sparta/serialization/checkpoint/DatabaseCheckpointer.hpp @@ -4,7 +4,6 @@ #include "sparta/serialization/checkpoint/Checkpointer.hpp" #include "sparta/serialization/checkpoint/DatabaseCheckpoint.hpp" -#include "sparta/serialization/checkpoint/DatabaseCheckpointAccessor.hpp" #include "simdb/apps/App.hpp" #include "simdb/pipeline/Pipeline.hpp" @@ -204,7 +203,7 @@ class DatabaseCheckpointer : public simdb::App, public Checkpointer * \throw CheckpointError if \a from does not refer to a valid * checkpoint. */ - DatabaseCheckpointAccessor findLatestCheckpointAtOrBefore(tick_t tick, chkpt_id_t from); + std::unique_ptr findLatestCheckpointAtOrBefore(tick_t tick, chkpt_id_t from); /*! * \brief Finds a checkpoint by its ID @@ -212,7 +211,7 @@ class DatabaseCheckpointer : public simdb::App, public Checkpointer * deleted * \return Checkpoint with ID of \a id if found or nullptr if not found */ - DatabaseCheckpointAccessor findCheckpoint(chkpt_id_t id); + std::unique_ptr findCheckpoint(chkpt_id_t id); /*! * \brief Tests whether this checkpoint manager has a checkpoint with @@ -389,12 +388,12 @@ class DatabaseCheckpointer : public simdb::App, public Checkpointer * returns nullptr. * \todo Faster lookup? */ - DatabaseCheckpointAccessor findCheckpoint_(chkpt_id_t id) noexcept; + std::unique_ptr findCheckpoint_(chkpt_id_t id) noexcept; /*! * \brief Const version of findCheckpoint_() */ - DatabaseCheckpointAccessor findCheckpoint_(chkpt_id_t id) const noexcept; + std::unique_ptr findCheckpoint_(chkpt_id_t id) const noexcept; /*! 
* \brief Implements Checkpointer::dumpCheckpointNode_ diff --git a/sparta/src/DatabaseCheckpoint.cpp b/sparta/src/DatabaseCheckpoint.cpp index f0af149285..83fbf222ee 100644 --- a/sparta/src/DatabaseCheckpoint.cpp +++ b/sparta/src/DatabaseCheckpoint.cpp @@ -18,7 +18,7 @@ DatabaseCheckpoint::DatabaseCheckpoint(TreeNode& root, DatabaseCheckpoint* prev, bool is_snapshot, DatabaseCheckpointer* checkpointer) - : DatabaseCheckpointBase(id, tick) + : CheckpointBase(id, tick) , prev_id_(prev ? prev->getID() : UNIDENTIFIED_CHECKPOINT) , deleted_id_(UNIDENTIFIED_CHECKPOINT) , is_snapshot_(is_snapshot) @@ -50,7 +50,7 @@ DatabaseCheckpoint::DatabaseCheckpoint(chkpt_id_t prev_id, bool is_snapshot, const storage::VectorStorage& storage, DatabaseCheckpointer* checkpointer) - : DatabaseCheckpointBase(getID(), getTick()) + : CheckpointBase(getID(), getTick()) , prev_id_(prev_id) , next_ids_(next_ids) , deleted_id_(deleted_id) diff --git a/sparta/src/DatabaseCheckpointQuery.cpp b/sparta/src/DatabaseCheckpointQuery.cpp index 948dd2c270..ba2de7d4d6 100644 --- a/sparta/src/DatabaseCheckpointQuery.cpp +++ b/sparta/src/DatabaseCheckpointQuery.cpp @@ -6,25 +6,28 @@ namespace sparta::serialization::checkpoint using chkpt_id_t = typename DatabaseCheckpointQuery::chkpt_id_t; using tick_t = typename DatabaseCheckpointQuery::tick_t; -bool DatabaseCheckpointQuery::hasCheckpoint(chkpt_id_t id) const noexcept +uint64_t DatabaseCheckpointQuery::getTotalMemoryUse() const noexcept { //TODO cnyce - (void)id; - return false; + return 0; } -chkpt_id_t DatabaseCheckpointQuery::getPrevID(chkpt_id_t id) const +uint64_t DatabaseCheckpointQuery::getContentMemoryUse() const noexcept { //TODO cnyce - (void)id; return 0; } -std::vector DatabaseCheckpointQuery::getNextIDs(chkpt_id_t id) const +void DatabaseCheckpointQuery::deleteCheckpoint(chkpt_id_t id) +{ + //TODO cnyce + (void)id; +} + +void DatabaseCheckpointQuery::loadCheckpoint(chkpt_id_t id) { //TODO cnyce (void)id; - return {}; } std::vector 
DatabaseCheckpointQuery::getCheckpointsAt(tick_t t) const @@ -40,6 +43,68 @@ std::vector DatabaseCheckpointQuery::getCheckpoints() const return {}; } +uint32_t DatabaseCheckpointQuery::getNumCheckpoints() const noexcept +{ + //TODO cnyce + return 0; +} + +std::deque DatabaseCheckpointQuery::getCheckpointChain(chkpt_id_t id) const +{ + //TODO cnyce + (void)id; + return {}; +} + +bool DatabaseCheckpointQuery::hasCheckpoint(chkpt_id_t id) const noexcept +{ + //TODO cnyce + (void)id; + return false; +} + +void DatabaseCheckpointQuery::dumpList(std::ostream& o) const +{ + //TODO cnyce + (void)o; +} + +void DatabaseCheckpointQuery::dumpData(std::ostream& o) const +{ + //TODO cnyce + (void)o; +} + +void DatabaseCheckpointQuery::dumpAnnotatedData(std::ostream& o) const +{ + //TODO cnyce + (void)o; +} + +void DatabaseCheckpointQuery::traceValue(std::ostream& o, chkpt_id_t id, const ArchData* container, uint32_t offset, uint32_t size) +{ + //TODO cnyce + (void)o; + (void)id; + (void)container; + (void)offset; + (void)size; +} + +chkpt_id_t DatabaseCheckpointQuery::getPrevID(chkpt_id_t id) const +{ + //TODO cnyce + (void)id; + return 0; +} + +std::vector DatabaseCheckpointQuery::getNextIDs(chkpt_id_t id) const +{ + //TODO cnyce + (void)id; + return {}; +} + uint32_t DatabaseCheckpointQuery::getDistanceToPrevSnapshot(chkpt_id_t id) const noexcept { //TODO cnyce @@ -54,4 +119,29 @@ bool DatabaseCheckpointQuery::canDelete(chkpt_id_t id) const noexcept return false; } +void DatabaseCheckpointQuery::createHead_() +{ +} + +chkpt_id_t DatabaseCheckpointQuery::createCheckpoint_(bool force_snapshot) +{ + //TODO cnyce + (void)force_snapshot; + return 0; +} + +void DatabaseCheckpointQuery::dumpCheckpointNode_(const chkpt_id_t id, std::ostream& o) const +{ + //TODO cnyce + (void)id; + (void)o; +} + +std::vector DatabaseCheckpointQuery::getNextIDs_(chkpt_id_t id) const +{ + //TODO cnyce + (void)id; + return {}; +} + } // namespace sparta::serialization::checkpoint diff --git 
a/sparta/src/DatabaseCheckpointer.cpp b/sparta/src/DatabaseCheckpointer.cpp index 0e7746add9..85f1f02710 100644 --- a/sparta/src/DatabaseCheckpointer.cpp +++ b/sparta/src/DatabaseCheckpointer.cpp @@ -63,7 +63,7 @@ using EvictedChkptIDs = std::vector; DatabaseCheckpointer::DatabaseCheckpointer(simdb::DatabaseManager* db_mgr, TreeNode& root, Scheduler* sched) : Checkpointer(root, sched), - chkpt_query_(std::make_shared(db_mgr)), + chkpt_query_(std::make_shared(db_mgr, root, sched)), db_mgr_(db_mgr), snap_thresh_(DEFAULT_SNAPSHOT_THRESH), next_chkpt_id_(checkpoint_type::MIN_CHECKPOINT), @@ -335,7 +335,7 @@ std::deque DatabaseCheckpointer::getCheckpointChain(chkpt_id_t id) c return chain; } -DatabaseCheckpointAccessor DatabaseCheckpointer::findLatestCheckpointAtOrBefore(tick_t tick, chkpt_id_t from) +std::unique_ptr DatabaseCheckpointer::findLatestCheckpointAtOrBefore(tick_t tick, chkpt_id_t from) { if (!hasCheckpoint(from)) { throw SpartaException("Invalid checkpoint ID"); @@ -343,16 +343,15 @@ DatabaseCheckpointAccessor DatabaseCheckpointer::findLatestCheckpointAtOr //TODO cnyce (void)tick; - return DatabaseCheckpointAccessor(this, CheckpointBase::UNIDENTIFIED_CHECKPOINT); + return nullptr; } -DatabaseCheckpointAccessor DatabaseCheckpointer::findCheckpoint(chkpt_id_t id) +std::unique_ptr DatabaseCheckpointer::findCheckpoint(chkpt_id_t id) { if (!hasCheckpoint(id)) { throw SpartaException("Invalid checkpoint ID"); } - - return DatabaseCheckpointAccessor(this, id); + return nullptr; } bool DatabaseCheckpointer::hasCheckpoint(chkpt_id_t id) const noexcept @@ -617,14 +616,18 @@ bool DatabaseCheckpointer::recursForwardFindAlive_(chkpt_id_t id) const return false; } -DatabaseCheckpointAccessor DatabaseCheckpointer::findCheckpoint_(chkpt_id_t id) noexcept +std::unique_ptr DatabaseCheckpointer::findCheckpoint_(chkpt_id_t id) noexcept { - return DatabaseCheckpointAccessor(this, id); + //TODO cnyce + (void)id; + return nullptr; } -DatabaseCheckpointAccessor 
DatabaseCheckpointer::findCheckpoint_(chkpt_id_t id) const noexcept +std::unique_ptr DatabaseCheckpointer::findCheckpoint_(chkpt_id_t id) const noexcept { - return DatabaseCheckpointAccessor(this, id); + //TODO cnyce + (void)id; + return nullptr; } void DatabaseCheckpointer::dumpCheckpointNode_(const chkpt_id_t id, std::ostream& o) const From 81002a6878cec6cc6de5c5c1c4e592ecc17454bc Mon Sep 17 00:00:00 2001 From: Colby Nyce Date: Wed, 20 Aug 2025 18:16:34 -0500 Subject: [PATCH 06/30] Database-backed checkpointer --- .../checkpoint/DatabaseCheckpointQuery.hpp | 10 +- .../checkpoint/DatabaseCheckpointer.hpp | 33 +- .../checkpoint/DeltaCheckpoint.hpp | 2 +- .../checkpoint/FastCheckpointer.hpp | 2 +- sparta/src/DatabaseCheckpoint.cpp | 10 +- sparta/src/DatabaseCheckpointQuery.cpp | 15 +- sparta/src/DatabaseCheckpointer.cpp | 388 +++++++++++++----- 7 files changed, 325 insertions(+), 135 deletions(-) diff --git a/sparta/sparta/serialization/checkpoint/DatabaseCheckpointQuery.hpp b/sparta/sparta/serialization/checkpoint/DatabaseCheckpointQuery.hpp index b803580f02..253f4f5491 100644 --- a/sparta/sparta/serialization/checkpoint/DatabaseCheckpointQuery.hpp +++ b/sparta/sparta/serialization/checkpoint/DatabaseCheckpointQuery.hpp @@ -12,6 +12,8 @@ namespace simdb namespace sparta::serialization::checkpoint { +class DatabaseCheckpoint; + /*! * \brief SQLite query object to "extend" the checkpoint search space from just the * cache to include the database. 
Combinations of in-memory checkpoints, recreated @@ -29,8 +31,6 @@ class DatabaseCheckpointQuery : public Checkpointer uint64_t getContentMemoryUse() const noexcept override; - bool hasCheckpoint(chkpt_id_t id) const noexcept override; - void deleteCheckpoint(chkpt_id_t id) override; void loadCheckpoint(chkpt_id_t id) override; @@ -43,6 +43,8 @@ class DatabaseCheckpointQuery : public Checkpointer std::deque getCheckpointChain(chkpt_id_t id) const override; + bool hasCheckpoint(chkpt_id_t id) const noexcept override; + void dumpList(std::ostream& o) const override; void dumpData(std::ostream& o) const override; @@ -51,14 +53,14 @@ class DatabaseCheckpointQuery : public Checkpointer void traceValue(std::ostream& o, chkpt_id_t id, const ArchData* container, uint32_t offset, uint32_t size) override; + std::unique_ptr findCheckpoint(chkpt_id_t id); + chkpt_id_t getPrevID(chkpt_id_t id) const; std::vector getNextIDs(chkpt_id_t id) const; uint32_t getDistanceToPrevSnapshot(chkpt_id_t id) const noexcept; - bool canDelete(chkpt_id_t id) const noexcept; - private: void createHead_() override; diff --git a/sparta/sparta/serialization/checkpoint/DatabaseCheckpointer.hpp b/sparta/sparta/serialization/checkpoint/DatabaseCheckpointer.hpp index e70677147b..2b6c8b52d3 100644 --- a/sparta/sparta/serialization/checkpoint/DatabaseCheckpointer.hpp +++ b/sparta/sparta/serialization/checkpoint/DatabaseCheckpointer.hpp @@ -211,7 +211,7 @@ class DatabaseCheckpointer : public simdb::App, public Checkpointer * deleted * \return Checkpoint with ID of \a id if found or nullptr if not found */ - std::unique_ptr findCheckpoint(chkpt_id_t id); + std::unique_ptr findCheckpoint(chkpt_id_t id, bool must_exist=true) const; /*! * \brief Tests whether this checkpoint manager has a checkpoint with @@ -252,15 +252,6 @@ class DatabaseCheckpointer : public simdb::App, public Checkpointer */ std::vector getNextIDs(chkpt_id_t id) const; - /*! 
- * \brief Attempts to restore this checkpoint including any previous - * deltas (dependencies). - * - * Uses loadState to restore state from each checkpoint in the - * restore chain. - */ - void load(const std::vector& dats, chkpt_id_t id); - /*! * \brief Determines how many checkpoints away the closest, earlier * snapshot is. @@ -370,6 +361,12 @@ class DatabaseCheckpointer : public simdb::App, public Checkpointer */ void cleanupChain_(chkpt_id_t id); + /*! + * \brief Remove the given checkpoint from the cache and/or DB. The + * adjacent checkpoints, if any, will be reconnected appropriately. + */ + void disconnectChainLink_(chkpt_id_t id); + /*! * \brief Look forward to see if any future checkpoints depend on \a d. * \param d checkpoint to inspect and recursively search @@ -381,20 +378,6 @@ class DatabaseCheckpointer : public simdb::App, public Checkpointer */ bool recursForwardFindAlive_(chkpt_id_t id) const; - /*! - * \brief Attempts to find a checkpoint within this checkpointer by ID. - * \param id Checkpoint ID to search for - * \return Pointer to found checkpoint with matchind ID. If not found, - * returns nullptr. - * \todo Faster lookup? - */ - std::unique_ptr findCheckpoint_(chkpt_id_t id) noexcept; - - /*! - * \brief Const version of findCheckpoint_() - */ - std::unique_ptr findCheckpoint_(chkpt_id_t id) const noexcept; - /*! * \brief Implements Checkpointer::dumpCheckpointNode_ */ @@ -457,7 +440,7 @@ class DatabaseCheckpointer : public simdb::App, public Checkpointer std::shared_ptr chkpt_query_; //! \brief Mutex to protect our checkpoints cache. - mutable std::mutex mutex_; + mutable std::recursive_mutex mutex_; //! 
\brief SimDB instance simdb::DatabaseManager* db_mgr_ = nullptr; diff --git a/sparta/sparta/serialization/checkpoint/DeltaCheckpoint.hpp b/sparta/sparta/serialization/checkpoint/DeltaCheckpoint.hpp index 59c2bae289..fa95d9d722 100644 --- a/sparta/sparta/serialization/checkpoint/DeltaCheckpoint.hpp +++ b/sparta/sparta/serialization/checkpoint/DeltaCheckpoint.hpp @@ -181,7 +181,7 @@ namespace sparta::serialization::checkpoint o << '('; } if(cp->getID() == UNIDENTIFIED_CHECKPOINT){ - o << "*" << getDeletedID() << ""; + o << "*" << getDeletedID(); }else{ o << cp->getID(); } diff --git a/sparta/sparta/serialization/checkpoint/FastCheckpointer.hpp b/sparta/sparta/serialization/checkpoint/FastCheckpointer.hpp index fc07bf9193..be1bd00bf6 100644 --- a/sparta/sparta/serialization/checkpoint/FastCheckpointer.hpp +++ b/sparta/sparta/serialization/checkpoint/FastCheckpointer.hpp @@ -528,7 +528,7 @@ namespace sparta::serialization::checkpoint // This snapshot is needed later. Move to previous delta and work from there d = static_cast(d->getPrev()); }else{ - return; // This delta is needed. Therefore all preceeding deltas are needed + return; // This delta is needed. 
Therefore all preceding deltas are needed } } diff --git a/sparta/src/DatabaseCheckpoint.cpp b/sparta/src/DatabaseCheckpoint.cpp index 83fbf222ee..b2a1cbb350 100644 --- a/sparta/src/DatabaseCheckpoint.cpp +++ b/sparta/src/DatabaseCheckpoint.cpp @@ -117,7 +117,15 @@ std::vector DatabaseCheckpoint::getNextIDs() const void DatabaseCheckpoint::load(const std::vector& dats) { - checkpointer_->load(dats, getID()); + // Build stack up to last snapshot + std::stack chkpt_ids = getRestoreChain(); + + // Load in proper order + while (!chkpt_ids.empty()) { + auto id = chkpt_ids.top(); + chkpt_ids.pop(); + checkpointer_->findCheckpoint(id)->loadState(dats); + } } bool DatabaseCheckpoint::canDelete() const noexcept diff --git a/sparta/src/DatabaseCheckpointQuery.cpp b/sparta/src/DatabaseCheckpointQuery.cpp index ba2de7d4d6..714c8dd184 100644 --- a/sparta/src/DatabaseCheckpointQuery.cpp +++ b/sparta/src/DatabaseCheckpointQuery.cpp @@ -1,4 +1,5 @@ #include "sparta/serialization/checkpoint/DatabaseCheckpointQuery.hpp" +#include "sparta/serialization/checkpoint/DatabaseCheckpoint.hpp" namespace sparta::serialization::checkpoint { @@ -91,6 +92,13 @@ void DatabaseCheckpointQuery::traceValue(std::ostream& o, chkpt_id_t id, const A (void)size; } +std::unique_ptr DatabaseCheckpointQuery::findCheckpoint(chkpt_id_t id) +{ + //TODO cnyce + (void)id; + return nullptr; +} + chkpt_id_t DatabaseCheckpointQuery::getPrevID(chkpt_id_t id) const { //TODO cnyce @@ -112,13 +120,6 @@ uint32_t DatabaseCheckpointQuery::getDistanceToPrevSnapshot(chkpt_id_t id) const return 0; } -bool DatabaseCheckpointQuery::canDelete(chkpt_id_t id) const noexcept -{ - //TODO cnyce - (void)id; - return false; -} - void DatabaseCheckpointQuery::createHead_() { } diff --git a/sparta/src/DatabaseCheckpointer.cpp b/sparta/src/DatabaseCheckpointer.cpp index 85f1f02710..8cb42e5238 100644 --- a/sparta/src/DatabaseCheckpointer.cpp +++ b/sparta/src/DatabaseCheckpointer.cpp @@ -70,7 +70,8 @@
DatabaseCheckpointer::DatabaseCheckpointer(simdb::DatabaseManager* db_mgr, TreeN num_alive_checkpoints_(0), num_alive_snapshots_(0), num_dead_checkpoints_(0) -{ } +{ +} void DatabaseCheckpointer::defineSchema(simdb::Schema& schema) { @@ -191,7 +192,7 @@ std::unique_ptr DatabaseCheckpointer::createPipeline( // TODO cnyce: We are allocating and deallocating a LOT of checkpoints. // See if we can reuse a pool of them. Could also try to just add a pool // to the VectorStorage::Segment class. - std::lock_guard lock(mutex_); + std::lock_guard lock(mutex_); chkpts_cache_.erase(id); } } @@ -221,39 +222,85 @@ void DatabaseCheckpointer::setSnapshotThreshold(uint32_t thresh) noexcept uint64_t DatabaseCheckpointer::getTotalMemoryUse() const noexcept { - std::lock_guard lock(mutex_); + std::lock_guard lock(mutex_); uint64_t mem = 0; for (const auto& [id, chkpt] : chkpts_cache_) { mem += chkpt->getTotalMemoryUse(); } + mem += chkpt_query_->getTotalMemoryUse(); return mem; } uint64_t DatabaseCheckpointer::getContentMemoryUse() const noexcept { - std::lock_guard lock(mutex_); + std::lock_guard lock(mutex_); uint64_t mem = 0; for (const auto& [id, chkpt] : chkpts_cache_) { mem += chkpt->getTotalMemoryUse(); } + mem += chkpt_query_->getContentMemoryUse(); return mem; } void DatabaseCheckpointer::deleteCheckpoint(chkpt_id_t id) { - //TODO cnyce - (void)id; + if (!hasCheckpoint(id)) { + throw CheckpointError("Could not delete checkpoint ID=") + << id << " because no checkpoint by this ID was found"; + } + + std::lock_guard lock(mutex_); + + if (auto it = chkpts_cache_.find(id); it != chkpts_cache_.end()) { + checkpoint_type* chkpt = it->second.get(); + + // Allow deletion and change ID to UNIDENTIFIED_CHECKPOINT. + // This is still part of a chain though until there are no + // dependencies on it. 
+ if (!chkpt->isFlaggedDeleted()) { + num_dead_checkpoints_++; + if (chkpt->isSnapshot()) { + num_alive_snapshots_--; + } + num_alive_checkpoints_--; + chkpt->flagDeleted(); + } + + // Delete this and all contiguous previous checkpoint which were + // flagged deleted if possible. Stop if current_ is encountered + cleanupChain_(chkpt->getID()); + } + + chkpt_query_->deleteCheckpoint(id); } void DatabaseCheckpointer::loadCheckpoint(chkpt_id_t id) { - //TODO cnyce - (void)id; + auto chkpt = findCheckpoint(id); + chkpt->load(getArchDatas()); + + // Move current to another checkpoint. Anything between head and the + // old current_ is fair game for removal if allowed + checkpoint_type* rmv = static_cast(getCurrent_()); + setCurrent_(chkpt.get()); + addToCache_(std::move(chkpt)); + + // Restore scheduler tick number + if (sched_) { + sched_->restartAt(getCurrentTick()); + } + + // Remove all checkpoints which can be. Stop if the new current_ is + // encountered again. + // Note that is is OK if current_ was moved to a later position in + // the chain. No important checkpoints will be removed. The + // important thing is never to remove current_. 
+ cleanupChain_(rmv->getID()); } std::vector DatabaseCheckpointer::getCheckpointsAt(tick_t t) const { - std::lock_guard lock(mutex_); + std::lock_guard lock(mutex_); std::vector results; for (const auto& [id, chkpt] : chkpts_cache_) { @@ -271,7 +318,7 @@ std::vector DatabaseCheckpointer::getCheckpointsAt(tick_t t) const std::vector DatabaseCheckpointer::getCheckpoints() const { - std::lock_guard lock(mutex_); + std::lock_guard lock(mutex_); std::vector results; for (const auto& [id, chkpt] : chkpts_cache_) { @@ -309,7 +356,7 @@ uint32_t DatabaseCheckpointer::getNumDeadCheckpoints() const noexcept std::deque DatabaseCheckpointer::getCheckpointChain(chkpt_id_t id) const { - std::lock_guard lock(mutex_); + std::lock_guard lock(mutex_); std::deque chain; if (!getHead()) { @@ -338,25 +385,42 @@ std::deque DatabaseCheckpointer::getCheckpointChain(chkpt_id_t id) c std::unique_ptr DatabaseCheckpointer::findLatestCheckpointAtOrBefore(tick_t tick, chkpt_id_t from) { if (!hasCheckpoint(from)) { - throw SpartaException("Invalid checkpoint ID"); + throw CheckpointError("There is no checkpoint with ID ") << from; } - //TODO cnyce - (void)tick; - return nullptr; + std::lock_guard lock(mutex_); + + auto id = from; + do { + auto chkpt = findCheckpoint(id); + if (chkpt->getTick() <= tick) { + break; + } + id = chkpt->getPrevID(); + } while (id != checkpoint_type::UNIDENTIFIED_CHECKPOINT); + + return findCheckpoint(id); } -std::unique_ptr DatabaseCheckpointer::findCheckpoint(chkpt_id_t id) +std::unique_ptr DatabaseCheckpointer::findCheckpoint(chkpt_id_t id, bool must_exist) const { if (!hasCheckpoint(id)) { - throw SpartaException("Invalid checkpoint ID"); + throw CheckpointError("There is no checkpoint with ID ") << id; } - return nullptr; + + std::lock_guard lock(mutex_); + + if (auto it = chkpts_cache_.find(id); it != chkpts_cache_.end()) { + return it->second->clone(); + } + + return chkpt_query_->findCheckpoint(id); } bool DatabaseCheckpointer::hasCheckpoint(chkpt_id_t id) const 
noexcept { - std::lock_guard lock(mutex_); + std::lock_guard lock(mutex_); + if (chkpts_cache_.find(id) != chkpts_cache_.end()) { return true; } @@ -366,14 +430,32 @@ bool DatabaseCheckpointer::hasCheckpoint(chkpt_id_t id) const noexcept void DatabaseCheckpointer::dumpRestoreChain(std::ostream& o, chkpt_id_t id) const { - //TODO cnyce - (void)o; - (void)id; + auto rc = getRestoreChain(id); + while (true) { + const auto chkpt = findCheckpoint(rc.top()); + rc.pop(); + if (chkpt->isSnapshot()) { + o << '('; + } + if (chkpt->getID() == checkpoint_type::UNIDENTIFIED_CHECKPOINT) { + o << "*" << chkpt->getDeletedID(); + } else { + o << chkpt->getID(); + } + if (chkpt->isSnapshot()) { + o << ')'; + } + if (rc.empty()) { + break; + } else { + o << " --> "; + } + } } std::stack DatabaseCheckpointer::getHistoryChain(chkpt_id_t id) const { - std::lock_guard lock(mutex_); + std::lock_guard lock(mutex_); std::stack chain; auto it = chkpts_cache_.find(id); @@ -393,33 +475,27 @@ std::stack DatabaseCheckpointer::getHistoryChain(chkpt_id_t id) cons std::stack DatabaseCheckpointer::getRestoreChain(chkpt_id_t id) const { - //TODO cnyce - (void)id; - return {}; -} - -std::vector DatabaseCheckpointer::getNextIDs(chkpt_id_t id) const -{ - std::lock_guard lock(mutex_); - - auto it = chkpts_cache_.find(id); - if (it != chkpts_cache_.end()) { - return it->second->getNextIDs(); + // Build stack up to last snapshot + std::stack chkpts; + while (true) { + chkpts.push(id); + auto chkpt = findCheckpoint(id); + if (chkpt->isSnapshot()) { + break; + } + id = chkpt->getPrevID(); } - - return chkpt_query_->getNextIDs(id); + return chkpts; } -void DatabaseCheckpointer::load(const std::vector& dats, chkpt_id_t id) +std::vector DatabaseCheckpointer::getNextIDs(chkpt_id_t id) const { - //TODO cnyce - (void)dats; - (void)id; + return findCheckpoint(id)->getNextIDs(); } uint32_t DatabaseCheckpointer::getDistanceToPrevSnapshot(chkpt_id_t id) const noexcept { - std::lock_guard lock(mutex_); + 
std::lock_guard lock(mutex_); uint32_t dist = 0; auto it = chkpts_cache_.find(id); @@ -432,30 +508,42 @@ uint32_t DatabaseCheckpointer::getDistanceToPrevSnapshot(chkpt_id_t id) const no ++dist; } + // Note that we only evict entire checkpoint "windows" from the cache, + // which means the cache never has "partial" windows like: + // + // Snapshot threshold: 10 (window length) + // (1 snapshot, 9 deltas) + // + // Cache: DB: + // 3,4,5,6,7,8,9,10 1,2 <-- never going to happen + // + // Cache: DB: + // 21-30 1-20 <-- always like this ("full" windows only) + // + // This means we either can answer the API question entirely using the + // cache or entirely using the DB. That is why the line of code below + // is not something like: + // + // return dist + chkpt_query_->getDistanceToPrevSnapshot(id); + return chkpt_query_->getDistanceToPrevSnapshot(id); } bool DatabaseCheckpointer::isSnapshot(chkpt_id_t id) const noexcept { - //TODO cnyce - (void)id; - return false; + return findCheckpoint(id)->isSnapshot(); } bool DatabaseCheckpointer::canDelete(chkpt_id_t id) const noexcept { - std::lock_guard lock(mutex_); - - auto it = chkpts_cache_.find(id); - if (it == chkpts_cache_.end()) { - return chkpt_query_->canDelete(id); - } + std::lock_guard lock(mutex_); - if (!it->second->isFlaggedDeleted()) { + auto chkpt = findCheckpoint(id); + if (!chkpt->isFlaggedDeleted()) { return false; } - for (auto next_id : getNextIDs(id)) { + for (auto next_id : chkpt->getNextIDs()) { if (!canDelete(next_id) && !isSnapshot(next_id)) { return false; } @@ -466,26 +554,45 @@ bool DatabaseCheckpointer::canDelete(chkpt_id_t id) const noexcept std::string DatabaseCheckpointer::stringize() const { - //TODO cnyce - return ""; + std::stringstream ss; + ss << "'; + return ss.str(); } void DatabaseCheckpointer::dumpList(std::ostream& o) const { - //TODO cnyce - (void)o; + std::lock_guard lock(mutex_); + + for (const auto& [id, chkpt] : chkpts_cache_) { + o << chkpt->stringize() << std::endl; + } + + 
chkpt_query_->dumpList(o); } void DatabaseCheckpointer::dumpData(std::ostream& o) const { - //TODO cnyce - (void)o; + std::lock_guard lock(mutex_); + + for (const auto& [id, chkpt] : chkpts_cache_) { + chkpt->dumpData(o); + o << std::endl; + } + + chkpt_query_->dumpData(o); } void DatabaseCheckpointer::dumpAnnotatedData(std::ostream& o) const { - //TODO cnyce - (void)o; + std::lock_guard lock(mutex_); + + for (const auto& [id, chkpt] : chkpts_cache_) { + o << chkpt->stringize() << std::endl; + chkpt->dumpData(o); + o << std::endl; + } + + chkpt_query_->dumpAnnotatedData(o); } void DatabaseCheckpointer::traceValue( @@ -495,12 +602,13 @@ void DatabaseCheckpointer::traceValue( uint32_t offset, uint32_t size) { - //TODO cnyce (void)o; (void)id; (void)container; (void)offset; (void)size; + + sparta_assert(false, "Not implemented"); } void DatabaseCheckpointer::createHead_() @@ -517,7 +625,7 @@ void DatabaseCheckpointer::createHead_() if (getRoot().isFinalized() == false) { CheckpointError exc("Cannot create a checkpoint until the tree is finalized. Attempting to checkpoint from node "); exc << getRoot().getLocation() << " at tick "; - if(sched_){ + if (sched_) { exc << tick; }else{ exc << ""; @@ -530,10 +638,11 @@ void DatabaseCheckpointer::createHead_() nullptr, true, this)); setHead_(chkpt.get()); - num_alive_checkpoints_++; - num_alive_snapshots_++; setCurrent_(chkpt.get()); addToCache_(std::move(chkpt)); + + num_alive_checkpoints_++; + num_alive_snapshots_++; } chkpt_id_t DatabaseCheckpointer::createCheckpoint_(bool force_snapshot) @@ -605,60 +714,147 @@ chkpt_id_t DatabaseCheckpointer::createCheckpoint_(bool force_snapshot) void DatabaseCheckpointer::cleanupChain_(chkpt_id_t id) { - // TODO cnyce - (void)id; -} + std::lock_guard lock(mutex_); + + // In order to truly delete any checkpoints, we must traverse back + // to the previous snapshot (or the head) and forward to the another + // snapshot or the end of the chain. 
+ // ONLY if both of those points can be reached without encountering + // a living checkpoint or the current checkpoint (forward + // only) can the whole chain (including the leading shapshot) be + // deleted. + + if (id == getHeadID()) { + // Cannot delete head of checkpoint tree + return; + } + + // Walk forward to another snapshot or current + const bool needed_later = (getCurrentID() == id) || recursForwardFindAlive_(id); + if (needed_later) { + // Cannot delete because a later living checkpoint (or current) depends on this. + auto chkpt = findCheckpoint(id); + if (chkpt->isSnapshot()) { + // This snapshot is needed later. Move to previous delta and work from there. + id = chkpt->getPrevID(); + } else { + return; // This delta is needed. Therefore all preceeding deltas are needed. + } + } -bool DatabaseCheckpointer::recursForwardFindAlive_(chkpt_id_t id) const -{ - // TODO cnyce - (void)id; - return false; + // Delete backward until current, head, or a non-flagged-deleted checkpoint is hit. + // It is possible to fracture the checkpoint tree by deleting a segment + // between two snapshots, so prev can end up with nothing leading up to it + while (true) { + if (id == checkpoint_type::UNIDENTIFIED_CHECKPOINT) { + break; + } + + if (id == getHeadID()) { + break; + } + + auto chkpt = findCheckpoint(id); + if (!chkpt->isFlaggedDeleted()) { + break; + } + + // If the checkpoint to delete is the current checkpoint, then + // We cannot just set current to the previous checkpoint because + // we may have run forward and storing a checkpoint in the + // future would depend on the checkpoint we are about to delete. + // This could be fixed by requiring the next checkpoint to be a + // spapshot. Instead, point to the flagged-deleted checkpoint + // and do not delete + if (getCurrentID() == id) { + return; + } + + auto prev = findCheckpoint(chkpt->getPrevID(), false); + + // If nothing later in the chain (tree) depends on d's data, it can be deleted. 
+ // This also patches the checkpoint tree around the deleted checkpoint + //! \todo canDelete is recursive at worst and might benefit from optimization + if (chkpt->canDelete()) { + // Get checkpoint id regardless of whether alive or dead + chkpt_id_t id = chkpt->getID(); + if (chkpt->isFlaggedDeleted()) { + id = chkpt->getDeletedID(); + } + + num_dead_checkpoints_--; + + // Erase element in the cache/DB + disconnectChainLink_(id); + } + + // Continue until head is reached + if (prev) { + id = prev->getID(); + } else { + break; + } + } } -std::unique_ptr DatabaseCheckpointer::findCheckpoint_(chkpt_id_t id) noexcept +void DatabaseCheckpointer::disconnectChainLink_(chkpt_id_t id) { //TODO cnyce (void)id; - return nullptr; } -std::unique_ptr DatabaseCheckpointer::findCheckpoint_(chkpt_id_t id) const noexcept +bool DatabaseCheckpointer::recursForwardFindAlive_(chkpt_id_t id) const { - //TODO cnyce - (void)id; - return nullptr; + const auto next_ids = getNextIDs(id); + + for (const auto next_id : next_ids) { + auto chkpt = findCheckpoint(next_id); + // Only check descendants for snapshot-ness + if (chkpt->isSnapshot()) { + // Found a live snapshot that ends this branch. chkpt is not needed + // after this + return false; + } + if (next_id == getCurrentID()) { + // Found current in this search chain + return true; + } + if (chkpt->isFlaggedDeleted() == false) { + // Encountered a checkpoint later in the chain that still + // depends on this. + return true; + } + + // Continue the search recursively + if (recursForwardFindAlive_(next_id)) { + return true; + } + } + + // Found nothing alive. 
+ return false; } void DatabaseCheckpointer::dumpCheckpointNode_(const chkpt_id_t id, std::ostream& o) const { static std::string SNAPSHOT_NOTICE = "(s)"; - auto cp = findCheckpoint_(id); + auto cp = findCheckpoint(id); // Draw data for this checkpoint - if(cp->isFlaggedDeleted()){ + if (cp->isFlaggedDeleted()) { o << cp->getDeletedRepr(); }else{ o << cp->getID(); } // Show that this is a snapshot - if(cp->isSnapshot()){ + if (cp->isSnapshot()) { o << ' ' << SNAPSHOT_NOTICE; } } std::vector DatabaseCheckpointer::getNextIDs_(chkpt_id_t id) const { - { - std::lock_guard lock(mutex_); - auto it = chkpts_cache_.find(id); - if (it != chkpts_cache_.end()) { - return it->second->getNextIDs(); - } - } - - // TODO cnyce: go to database - return {}; + return findCheckpoint(id)->getNextIDs(); } void DatabaseCheckpointer::setHead_(CheckpointBase* head) @@ -675,7 +871,7 @@ void DatabaseCheckpointer::setCurrent_(CheckpointBase* current) void DatabaseCheckpointer::setHeadID_(chkpt_id_t id) { - std::lock_guard lock(mutex_); + std::lock_guard lock(mutex_); sparta_assert(id != checkpoint_type::UNIDENTIFIED_CHECKPOINT); sparta_assert(head_id_ == checkpoint_type::UNIDENTIFIED_CHECKPOINT); head_id_ = id; @@ -683,14 +879,14 @@ void DatabaseCheckpointer::setHeadID_(chkpt_id_t id) void DatabaseCheckpointer::setCurrentID_(chkpt_id_t id) { - std::lock_guard lock(mutex_); + std::lock_guard lock(mutex_); sparta_assert(id != checkpoint_type::UNIDENTIFIED_CHECKPOINT); current_id_ = id; } void DatabaseCheckpointer::addToCache_(std::shared_ptr chkpt) { - std::lock_guard lock(mutex_); + std::lock_guard lock(mutex_); auto id = chkpt->getID(); chkpt_ids_for_pipeline_head_.push(id); @@ -701,7 +897,7 @@ void DatabaseCheckpointer::addToCache_(std::shared_ptr chkpt) bool DatabaseCheckpointer::cloneNextPipelineHeadCheckpoint_(std::shared_ptr& next) { - std::lock_guard lock(mutex_); + std::lock_guard lock(mutex_); if (chkpt_ids_for_pipeline_head_.empty()) { return false; } From 
146c59f6135f100d3c5136665304bd939d4fe474 Mon Sep 17 00:00:00 2001 From: Colby Nyce Date: Tue, 26 Aug 2025 13:45:04 -0500 Subject: [PATCH 07/30] Database-backed checkpointer --- sparta/simdb | 2 +- .../checkpoint/DatabaseCheckpoint.hpp | 47 ++ .../checkpoint/DatabaseCheckpointQuery.hpp | 14 +- .../checkpoint/DatabaseCheckpointer.hpp | 108 ++--- sparta/src/DatabaseCheckpoint.cpp | 5 +- sparta/src/DatabaseCheckpointQuery.cpp | 177 ++++++-- sparta/src/DatabaseCheckpointer.cpp | 419 ++++++------------ .../DatabaseCheckpoint_test.cpp | 2 +- 8 files changed, 388 insertions(+), 386 deletions(-) diff --git a/sparta/simdb b/sparta/simdb index d531b6ebc5..394f8bea37 160000 --- a/sparta/simdb +++ b/sparta/simdb @@ -1 +1 @@ -Subproject commit d531b6ebc5a0ec643c5f1f5b91e1e0dca13d0319 +Subproject commit 394f8bea37cf6684f938fc4ffe7cd4471b4a9f83 diff --git a/sparta/sparta/serialization/checkpoint/DatabaseCheckpoint.hpp b/sparta/sparta/serialization/checkpoint/DatabaseCheckpoint.hpp index 5a9317c152..ea27e94689 100644 --- a/sparta/sparta/serialization/checkpoint/DatabaseCheckpoint.hpp +++ b/sparta/sparta/serialization/checkpoint/DatabaseCheckpoint.hpp @@ -9,6 +9,27 @@ namespace sparta::serialization::checkpoint { class DatabaseCheckpointer; + class DatabaseCheckpoint; + + struct ChkptWindowBytes { + using chkpt_id_t = CheckpointBase::chkpt_id_t; + std::vector chkpt_ids; + std::vector chkpt_bytes; + uint64_t start_tick; + uint64_t end_tick; + }; + + struct ChkptWindow { + using chkpt_id_t = CheckpointBase::chkpt_id_t; + std::vector chkpt_ids; + std::vector> chkpts; + uint64_t start_tick; + uint64_t end_tick; + + //! \brief Support boost::serialization + template + void serialize(Archive& ar, const unsigned int /*version*/); + }; /*! * \brief Checkpoint class optimized for use with database-backed @@ -254,4 +275,30 @@ namespace sparta::serialization::checkpoint DatabaseCheckpointer* checkpointer_ = nullptr; }; + //! 
Defined down here for "new DatabaseCheckpoint" + template + inline void ChkptWindow::serialize(Archive& ar, const unsigned int /*version*/) { + // TODO cnyce: Try to avoid use of unique_ptr. Everything is already movable + // and has default constructors. + ar & chkpt_ids; + ar & start_tick; + ar & end_tick; + + if (chkpts.empty()) { + // We are loading checkpoint window from disk + chkpts.reserve(chkpt_ids.size()); + for (size_t i = 0; i < chkpt_ids.size(); ++i) { + chkpts.emplace_back(new DatabaseCheckpoint); + ar & *chkpts.back(); + } + } + + else { + // We are saving a checkpoint window to disk + for (auto& chkpt : chkpts) { + ar & *chkpt; + } + } + } + } // namespace sparta::serialization::checkpoint diff --git a/sparta/sparta/serialization/checkpoint/DatabaseCheckpointQuery.hpp b/sparta/sparta/serialization/checkpoint/DatabaseCheckpointQuery.hpp index 253f4f5491..8bf2a10eec 100644 --- a/sparta/sparta/serialization/checkpoint/DatabaseCheckpointQuery.hpp +++ b/sparta/sparta/serialization/checkpoint/DatabaseCheckpointQuery.hpp @@ -13,6 +13,7 @@ namespace sparta::serialization::checkpoint { class DatabaseCheckpoint; +class DatabaseCheckpointer; /*! 
* \brief SQLite query object to "extend" the checkpoint search space from just the @@ -22,8 +23,12 @@ class DatabaseCheckpoint; class DatabaseCheckpointQuery : public Checkpointer { public: - DatabaseCheckpointQuery(simdb::DatabaseManager* db_mgr, TreeNode& root, sparta::Scheduler* sched=nullptr) + DatabaseCheckpointQuery(DatabaseCheckpointer* checkpointer, + simdb::DatabaseManager* db_mgr, + TreeNode& root, + Scheduler* sched=nullptr) : Checkpointer(root, sched) + , checkpointer_(checkpointer) , db_mgr_(db_mgr) {} @@ -53,7 +58,7 @@ class DatabaseCheckpointQuery : public Checkpointer void traceValue(std::ostream& o, chkpt_id_t id, const ArchData* container, uint32_t offset, uint32_t size) override; - std::unique_ptr findCheckpoint(chkpt_id_t id); + std::shared_ptr findCheckpoint(chkpt_id_t id, bool must_exist=false) const; chkpt_id_t getPrevID(chkpt_id_t id) const; @@ -70,8 +75,9 @@ class DatabaseCheckpointQuery : public Checkpointer std::vector getNextIDs_(chkpt_id_t id) const override; - //! \brief SimDB instance - simdb::DatabaseManager* db_mgr_ = nullptr; + mutable DatabaseCheckpointer* checkpointer_ = nullptr; + + mutable simdb::DatabaseManager* db_mgr_ = nullptr; }; } // namespace sparta::serialization::checkpoint diff --git a/sparta/sparta/serialization/checkpoint/DatabaseCheckpointer.hpp b/sparta/sparta/serialization/checkpoint/DatabaseCheckpointer.hpp index 2b6c8b52d3..257619a7ee 100644 --- a/sparta/sparta/serialization/checkpoint/DatabaseCheckpointer.hpp +++ b/sparta/sparta/serialization/checkpoint/DatabaseCheckpointer.hpp @@ -6,6 +6,7 @@ #include "sparta/serialization/checkpoint/DatabaseCheckpoint.hpp" #include "simdb/apps/App.hpp" #include "simdb/pipeline/Pipeline.hpp" +#include //! Default threshold for creating snapshots #ifndef DEFAULT_SNAPSHOT_THRESH @@ -94,25 +95,11 @@ class DatabaseCheckpointer : public simdb::App, public Checkpointer uint64_t getContentMemoryUse() const noexcept override; /*! - * \brief Deletes a checkpoint by ID. 
- * \param id ID of checkpoint to delete. Must not be - * Checkpoint::UNIDENTIFIED_CHECKPOINT and must not be equal to the - * ID of the head checkpoint. - * \throw CheckpointError if this manager has no checkpoint with given - * id. Test with hasCheckpoint first. If id == - * Checkpoint::UNIDENTIFIED_CHECKPOINT, always throws. - * Throws if id == getHeadID(). Head cannot be deleted - * - * Internally, this deletion may be effective-only and actual data may - * still exist in an incaccessible form as part of the checkpoint - * tree implementation. - * - * If the current checkpoint is deleted, current will be updated back - * along the current checkpoints previous checkpoint chain until a non - * deleted checkpoint is found. This will become the new current - * checkpoint + * \brief Explicit checkpoint deletion is NOT supported by this checkpointer. */ - void deleteCheckpoint(chkpt_id_t id) override; + void deleteCheckpoint(chkpt_id_t) override final { + throw CheckpointError("deleteCheckpoint() not supported"); + } /*! * \brief Loads state from a specific checkpoint by ID @@ -203,15 +190,25 @@ class DatabaseCheckpointer : public simdb::App, public Checkpointer * \throw CheckpointError if \a from does not refer to a valid * checkpoint. */ - std::unique_ptr findLatestCheckpointAtOrBefore(tick_t tick, chkpt_id_t from); + std::shared_ptr findLatestCheckpointAtOrBefore(tick_t tick, chkpt_id_t from); /*! - * \brief Finds a checkpoint by its ID + * \brief Finds a checkpoint by its ID. * \param id ID of checkpoint to find. Guaranteed not to be flagged as * deleted + * \note ONLY SEARCHES CHECKPOINT CACHE. Use cloneCheckpoint() to also search the database. * \return Checkpoint with ID of \a id if found or nullptr if not found */ - std::unique_ptr findCheckpoint(chkpt_id_t id, bool must_exist=true) const; + std::weak_ptr findCheckpoint(chkpt_id_t id) const; + + /*! + * \brief Finds a checkpoint by its ID. + * \param id ID of checkpoint to find. 
Guaranteed not to be flagged as + * deleted + * \note SEARCHES BOTH THE CACHE AND THE DATABASE + * \return Checkpoint with ID of \a id if found or nullptr if not found + */ + std::shared_ptr cloneCheckpoint(chkpt_id_t id) const; /*! * \brief Tests whether this checkpoint manager has a checkpoint with @@ -348,35 +345,25 @@ class DatabaseCheckpointer : public simdb::App, public Checkpointer chkpt_id_t createCheckpoint_(bool force_snapshot=false) override; /*! - * \brief Delete given checkpoint and all contiguous previous - * checkpoints which can be deleted (See checkpoint_type::canDelete). - * This is the only place where checkpoint objects are actually freed - * (aside from destruction) and it ensures that they will not disrupt - * the checkpoint delta chains. All other deletion is simply flagging - * and re-identifying checkpoints - * \param d Checkpoint to attempt to delete first. Function will then - * move through each previous checkpoint until reaching head. - * \post Head checkpoint will never be deleted by this function - * \note Never flags any new checkpoints as deleted - */ - void cleanupChain_(chkpt_id_t id); - - /*! - * \brief Remove the given checkpoint from the cache and/or DB. The - * adjacent checkpoints, if any, will be reconnected appropriately. - */ - void disconnectChainLink_(chkpt_id_t id); - - /*! - * \brief Look forward to see if any future checkpoints depend on \a d. - * \param d checkpoint to inspect and recursively search - * \return true if the current checkpoint or any live checkpoints - * are hit in the search. Search terminates on each branch when a - * snapshot or the end of the branch is reached. The branch to inspect - * (\a d) will not be checked itself since the point is to determine - * which branches down-chain depend on it. + * \brief Deletes a checkpoint by ID. + * \param id ID of checkpoint to delete. Must not be + * Checkpoint::UNIDENTIFIED_CHECKPOINT and must not be equal to the + * ID of the head checkpoint. 
+ * \throw CheckpointError if this manager has no checkpoint with given + * id. Test with hasCheckpoint first. If id == + * Checkpoint::UNIDENTIFIED_CHECKPOINT, always throws. + * Throws if id == getHeadID(). Head cannot be deleted + * + * Internally, this deletion may be effective-only and actual data may + * still exist in an incaccessible form as part of the checkpoint + * tree implementation. + * + * If the current checkpoint is deleted, current will be updated back + * along the current checkpoints previous checkpoint chain until a non + * deleted checkpoint is found. This will become the new current + * checkpoint */ - bool recursForwardFindAlive_(chkpt_id_t id) const; + void deleteCheckpoint_(chkpt_id_t id); /*! * \brief Implements Checkpointer::dumpCheckpointNode_ @@ -414,11 +401,6 @@ class DatabaseCheckpointer : public simdb::App, public Checkpointer */ void addToCache_(std::shared_ptr chkpt); - /*! - * \brief Clone the next checkpoint that is ready for processing. - */ - bool cloneNextPipelineHeadCheckpoint_(std::shared_ptr& next); - //! \brief Checkpointer head ID. Used to prevent the head from being deleted from the cache. chkpt_id_t head_id_ = checkpoint_type::UNIDENTIFIED_CHECKPOINT; @@ -428,11 +410,8 @@ class DatabaseCheckpointer : public simdb::App, public Checkpointer //! \brief Subset (or all of) our checkpoints that we currently are holding in memory. std::unordered_map> chkpts_cache_; - //! \brief Ordered running list of checkpoint IDs that come in via calls to createCheckpoint_(). - //! This is used in the pipeline to pick off and start processing checkpoints in the same order - //! in which they were received, while keeping the cache designed to use an unordered_map for - //! random access. - std::queue chkpt_ids_for_pipeline_head_; + //! \brief Subset (or all of) our checkpoints in the cache that haven't been send down the pipeline. + std::vector> chkpts_queue_; //! 
\brief SQLite query object to "extend" the checkpoint search space from just the //! cache to include the database. Combinations of in-memory checkpoints, recreated @@ -440,11 +419,20 @@ class DatabaseCheckpointer : public simdb::App, public Checkpointer std::shared_ptr chkpt_query_; //! \brief Mutex to protect our checkpoints cache. - mutable std::recursive_mutex mutex_; + mutable std::recursive_mutex cache_mutex_; + + //! \brief Set of dead checkpoint IDs. + std::unordered_set dead_chkpt_ids_; + + //! \brief Mutex to protect our set of dead checkpoint IDs. + mutable std::recursive_mutex dead_chkpts_mutex_; //! \brief SimDB instance simdb::DatabaseManager* db_mgr_ = nullptr; + //! \brief Pipeline. Held onto to enable flushing. + simdb::pipeline::Pipeline* pipeline_ = nullptr; + /*! * \brief Snapshot generation threshold. Every n checkpoints in a chain * are taken as snapshots instead of deltas diff --git a/sparta/src/DatabaseCheckpoint.cpp b/sparta/src/DatabaseCheckpoint.cpp index b2a1cbb350..9934602d9e 100644 --- a/sparta/src/DatabaseCheckpoint.cpp +++ b/sparta/src/DatabaseCheckpoint.cpp @@ -33,6 +33,9 @@ DatabaseCheckpoint::DatabaseCheckpoint(TreeNode& root, } if (prev) { + if (!prev->next_ids_.empty()) { + throw CheckpointError("DatabaseCheckpointer does not support multiple checkpoint branches"); + } prev->next_ids_.push_back(getID()); } @@ -124,7 +127,7 @@ void DatabaseCheckpoint::load(const std::vector& dats) while (!chkpt_ids.empty()) { auto id = chkpt_ids.top(); chkpt_ids.pop(); - checkpointer_->findCheckpoint(id)->loadState(dats); + checkpointer_->cloneCheckpoint(id)->loadState(dats); } } diff --git a/sparta/src/DatabaseCheckpointQuery.cpp b/sparta/src/DatabaseCheckpointQuery.cpp index 714c8dd184..22d96fb03f 100644 --- a/sparta/src/DatabaseCheckpointQuery.cpp +++ b/sparta/src/DatabaseCheckpointQuery.cpp @@ -1,5 +1,15 @@ #include "sparta/serialization/checkpoint/DatabaseCheckpointQuery.hpp" #include "sparta/serialization/checkpoint/DatabaseCheckpoint.hpp" 
+#include "simdb/sqlite/DatabaseManager.hpp" +#include "simdb/sqlite/Iterator.hpp" +#include "simdb/utils/Compress.hpp" + +#include +#include +#include +#include +#include +#include namespace sparta::serialization::checkpoint { @@ -9,45 +19,80 @@ using tick_t = typename DatabaseCheckpointQuery::tick_t; uint64_t DatabaseCheckpointQuery::getTotalMemoryUse() const noexcept { - //TODO cnyce return 0; } uint64_t DatabaseCheckpointQuery::getContentMemoryUse() const noexcept { - //TODO cnyce return 0; } -void DatabaseCheckpointQuery::deleteCheckpoint(chkpt_id_t id) +void DatabaseCheckpointQuery::deleteCheckpoint(chkpt_id_t) { - //TODO cnyce - (void)id; + throw CheckpointError("deleteCheckpoint() not supported"); } -void DatabaseCheckpointQuery::loadCheckpoint(chkpt_id_t id) +void DatabaseCheckpointQuery::loadCheckpoint(chkpt_id_t) { - //TODO cnyce - (void)id; + throw CheckpointError("loadCheckpoint() not supported"); } std::vector DatabaseCheckpointQuery::getCheckpointsAt(tick_t t) const { - //TODO cnyce - (void)t; - return {}; + // SELECT ChkptWindowBytesID FROM ChkptWindowTicks WHERE t <= EndTick AND t >= StartTick + auto query = db_mgr_->createQuery("ChkptWindowTicks"); + + query->addConstraintForUInt64("StartTick", simdb::Constraints::LESS_EQUAL, t); + query->addConstraintForUInt64("EndTick", simdb::Constraints::GREATER_EQUAL, t); + + int window_id; + query->select("ChkptWindowBytesID", window_id); + + auto results = query->getResultSet(); + if (!results.getNextRecord()) { + return {}; + } + + // SELECT ChkptID FROM ChkptWindowIDs WHERE ChkptWindowBytesID = + query = db_mgr_->createQuery("ChkptWindowIDs"); + + int chkpt_id; + query->select("ChkptID", chkpt_id); + query->addConstraintForInt("ChkptWindowBytesID", simdb::Constraints::EQUAL, window_id); + + auto results2 = query->getResultSet(); + std::vector ids; + while (results2.getNextRecord()) { + if (auto chkpt = findCheckpoint(chkpt_id)) { + if (chkpt->getTick() == t) { + ids.push_back(chkpt_id); + } + } + } + + 
return ids; } std::vector DatabaseCheckpointQuery::getCheckpoints() const { - //TODO cnyce - return {}; + auto query = db_mgr_->createQuery("ChkptWindowIDs"); + + int chkpt_id; + query->select("ChkptID", chkpt_id); + + auto results = query->getResultSet(); + std::vector ids; + while (results.getNextRecord()) { + ids.push_back(chkpt_id); + } + + return ids; } uint32_t DatabaseCheckpointQuery::getNumCheckpoints() const noexcept { - //TODO cnyce - return 0; + auto query = db_mgr_->createQuery("ChkptWindowIDs"); + return query->count(); } std::deque DatabaseCheckpointQuery::getCheckpointChain(chkpt_id_t id) const @@ -59,58 +104,100 @@ std::deque DatabaseCheckpointQuery::getCheckpointChain(chkpt_id_t id bool DatabaseCheckpointQuery::hasCheckpoint(chkpt_id_t id) const noexcept { - //TODO cnyce - (void)id; - return false; + auto query = db_mgr_->createQuery("ChkptWindowIDs"); + query->addConstraintForUInt64("ChkptID", simdb::Constraints::EQUAL, id); + auto results = query->getResultSet(); + return results.getNextRecord(); } void DatabaseCheckpointQuery::dumpList(std::ostream& o) const { - //TODO cnyce + //TODO cnyce: look back (void)o; } void DatabaseCheckpointQuery::dumpData(std::ostream& o) const { - //TODO cnyce + //TODO cnyce: look back (void)o; } void DatabaseCheckpointQuery::dumpAnnotatedData(std::ostream& o) const { - //TODO cnyce + //TODO cnyce: look back (void)o; } void DatabaseCheckpointQuery::traceValue(std::ostream& o, chkpt_id_t id, const ArchData* container, uint32_t offset, uint32_t size) { - //TODO cnyce (void)o; (void)id; (void)container; (void)offset; (void)size; + + sparta_assert(false, "Not implemented"); } -std::unique_ptr DatabaseCheckpointQuery::findCheckpoint(chkpt_id_t id) +std::shared_ptr DatabaseCheckpointQuery::findCheckpoint(chkpt_id_t id, bool must_exist) const { - //TODO cnyce - (void)id; + // "Undo" task 6 (write to the database) + auto query = db_mgr_->createQuery("ChkptWindowIDs"); + query->addConstraintForUInt64("ChkptID", 
simdb::Constraints::EQUAL, id); + + int window_id; + query->select("ChkptWindowBytesID", window_id); + + auto results1 = query->getResultSet(); + if (!results1.getNextRecord()) { + if (must_exist) { + throw CheckpointError("There is no checkpoint with ID ") << id; + } + return nullptr; + } + + query = db_mgr_->createQuery("ChkptWindowBytes"); + query->addConstraintForInt("Id", simdb::Constraints::EQUAL, window_id); + + std::vector bytes; + query->select("WindowBytes", bytes); + + auto results2 = query->getResultSet(); + sparta_assert(results2.getNextRecord()); + + // "Undo" task 5 (zlib compression) + std::vector uncompressed; + simdb::decompressData(bytes, uncompressed); + + // "Undo" task 4 (boost::serialization) + namespace bio = boost::iostreams; + bio::array_source src(uncompressed.data(), uncompressed.size()); + bio::stream is(src); + + boost::archive::binary_iarchive ia(is); + ChkptWindow window; + ia >> window; + + for (auto& chkpt : window.chkpts) { + if (chkpt->getID() == id) { + return chkpt; + } + } + + sparta_assert(false, "Should not be reachable"); return nullptr; } chkpt_id_t DatabaseCheckpointQuery::getPrevID(chkpt_id_t id) const { - //TODO cnyce - (void)id; - return 0; + auto chkpt = findCheckpoint(id, true); + return chkpt->getPrevID(); } std::vector DatabaseCheckpointQuery::getNextIDs(chkpt_id_t id) const { - //TODO cnyce - (void)id; - return {}; + auto chkpt = findCheckpoint(id, true); + return chkpt->getNextIDs(); } uint32_t DatabaseCheckpointQuery::getDistanceToPrevSnapshot(chkpt_id_t id) const noexcept @@ -122,27 +209,35 @@ uint32_t DatabaseCheckpointQuery::getDistanceToPrevSnapshot(chkpt_id_t id) const void DatabaseCheckpointQuery::createHead_() { + throw CheckpointError("Cannot create checkpoint head for DatabaseCheckpointQuery"); } -chkpt_id_t DatabaseCheckpointQuery::createCheckpoint_(bool force_snapshot) +chkpt_id_t DatabaseCheckpointQuery::createCheckpoint_(bool) { - //TODO cnyce - (void)force_snapshot; - return 0; + throw 
CheckpointError("Cannot create checkpoint head for DatabaseCheckpointQuery"); } void DatabaseCheckpointQuery::dumpCheckpointNode_(const chkpt_id_t id, std::ostream& o) const { - //TODO cnyce - (void)id; - (void)o; + static std::string SNAPSHOT_NOTICE = "(s)"; + auto chkpt = findCheckpoint(id, true); + + // Draw data for this checkpoint + if (chkpt->isFlaggedDeleted()) { + o << chkpt->getDeletedRepr(); + } else { + o << chkpt->getID(); + } + + // Show that this is a snapshot + if (chkpt->isSnapshot()) { + o << ' ' << SNAPSHOT_NOTICE; + } } std::vector DatabaseCheckpointQuery::getNextIDs_(chkpt_id_t id) const { - //TODO cnyce - (void)id; - return {}; + return getNextIDs(id); } } // namespace sparta::serialization::checkpoint diff --git a/sparta/src/DatabaseCheckpointer.cpp b/sparta/src/DatabaseCheckpointer.cpp index 8cb42e5238..e3cb0515dc 100644 --- a/sparta/src/DatabaseCheckpointer.cpp +++ b/sparta/src/DatabaseCheckpointer.cpp @@ -25,45 +25,11 @@ using chkpt_id_t = typename CheckpointBase::chkpt_id_t; using checkpoint_type = DatabaseCheckpoint; using checkpoint_ptr = std::shared_ptr; using checkpoint_ptrs = std::vector; - -struct ChkptWindow { - std::vector chkpt_ids; - checkpoint_ptrs chkpts; - - // TODO cnyce: Try to avoid use of unique_ptr. Everything is already movable - // and has default constructors. 
- template - void serialize(Archive& ar, const unsigned int /*version*/) { - ar & chkpt_ids; - - if (chkpts.empty()) { - // We are loading checkpoint window from disk - chkpts.reserve(chkpt_ids.size()); - for (size_t i = 0; i < chkpt_ids.size(); ++i) { - chkpts.emplace_back(new DatabaseCheckpoint); - ar & *chkpts.back(); - } - } - - else { - // We are saving a checkpoint window to disk - for (auto& chkpt : chkpts) { - ar & *chkpt; - } - } - } -}; - -struct ChkptWindowBytes { - std::vector chkpt_ids; - std::vector chkpt_bytes; -}; - using EvictedChkptIDs = std::vector; DatabaseCheckpointer::DatabaseCheckpointer(simdb::DatabaseManager* db_mgr, TreeNode& root, Scheduler* sched) : Checkpointer(root, sched), - chkpt_query_(std::make_shared(db_mgr, root, sched)), + chkpt_query_(std::make_shared(this, db_mgr, root, sched)), db_mgr_(db_mgr), snap_thresh_(DEFAULT_SNAPSHOT_THRESH), next_chkpt_id_(checkpoint_type::MIN_CHECKPOINT), @@ -85,6 +51,13 @@ void DatabaseCheckpointer::defineSchema(simdb::Schema& schema) window_ids.addColumn("ChkptID", dt::int32_t); window_ids.createIndexOn("ChkptID"); window_ids.disableAutoIncPrimaryKey(); + + auto& window_ticks = schema.addTable("ChkptWindowTicks"); + window_ticks.addColumn("ChkptWindowBytesID", dt::int32_t); + window_ticks.addColumn("StartTick", dt::uint64_t); + window_ticks.addColumn("EndTick", dt::uint64_t); + window_ticks.createCompoundIndexOn({"StartTick", "EndTick"}); + window_ticks.disableAutoIncPrimaryKey(); } std::unique_ptr DatabaseCheckpointer::createPipeline( @@ -92,40 +65,61 @@ std::unique_ptr DatabaseCheckpointer::createPipeline( { auto pipeline = std::make_unique(db_mgr_, NAME); - // Task 1: Clone the next checkpoint from the cache to send down pipeline - auto feed_pipeline = simdb::pipeline::createTask>( - [this](simdb::ConcurrentQueue& out, bool /*simulation_terminating*/) mutable + // Task 1: Clone an entire checkpoint window (snapshot plus all deltas until next snapshot) + auto clone_window = 
simdb::pipeline::createTask>( + [this](simdb::ConcurrentQueue& out, bool /*simulation_terminating*/) mutable -> bool { - checkpoint_ptr next_chkpt; - if (cloneNextPipelineHeadCheckpoint_(next_chkpt)) { - out.emplace(std::move(next_chkpt)); - return true; + std::lock_guard lock(cache_mutex_); + + if (chkpts_queue_.size() < getSnapshotThreshold()) { + return false; } - return false; + + checkpoint_ptrs chkpts; + auto it = chkpts_queue_.begin(); + while (chkpts.size() < getSnapshotThreshold()) { + if (auto c = it->lock()) { + if (chkpts.empty() && !c->isSnapshot()) { + throw CheckpointError("Invalid checkpoint - only one snapshot per window"); + } else if (!chkpts.empty() && c->isSnapshot()) { + throw CheckpointError("Invalid checkpoint - only one snapshot per window"); + } + + chkpts.emplace_back(c->clone()); + ++it; + } else { + throw CheckpointError("Invalid checkpoint - has been deleted"); + } + } + + chkpts_queue_.erase(chkpts_queue_.begin(), it); + return true; } ); - // Task 2: Buffer snapshots and their deltas into checkpoint windows - const auto window_len = getSnapshotThreshold(); - const auto flush_partial = true; - auto create_window = simdb::pipeline::createTask>(window_len, flush_partial); - - // Task 3: Add the IDs of all checkpoints in this window + // Task 2: Add the IDs of all checkpoints in this window auto add_chkpt_ids = simdb::pipeline::createTask>( [](checkpoint_ptrs&& chkpts, simdb::ConcurrentQueue& windows, bool /*simulation_terminating*/) { + uint64_t start_tick = std::numeric_limits::max(); + uint64_t end_tick = 0; + ChkptWindow window; window.chkpts = std::move(chkpts); for (auto& chkpt : window.chkpts) { window.chkpt_ids.push_back(chkpt->getID()); + start_tick = std::min(start_tick, chkpt->getTick()); + end_tick = std::max(end_tick, chkpt->getTick()); } + window.start_tick = start_tick; + window.end_tick = end_tick; windows.emplace(std::move(window)); } ); - // Task 4: Serialize a checkpoint window into a char buffer + // Task 3: Serialize 
a checkpoint window into a char buffer auto window_to_bytes = simdb::pipeline::createTask>( [](ChkptWindow&& window, simdb::ConcurrentQueue& window_bytes, @@ -142,11 +136,13 @@ std::unique_ptr DatabaseCheckpointer::createPipeline( bytes.chkpt_ids.push_back(chkpt->getID()); } + bytes.start_tick = window.start_tick; + bytes.end_tick = window.end_tick; window_bytes.emplace(std::move(bytes)); } ); - // Task 5: Perform zlib compression on the checkpoint window bytes + // Task 4: Perform zlib compression on the checkpoint window bytes auto zlib_bytes = simdb::pipeline::createTask>( [](ChkptWindowBytes&& bytes_in, simdb::ConcurrentQueue& bytes_out, @@ -159,7 +155,7 @@ std::unique_ptr DatabaseCheckpointer::createPipeline( } ); - // Task 6: Write to the database + // Task 5: Write to the database auto write_to_db = db_accessor->createAsyncWriter( [](ChkptWindowBytes&& bytes_in, simdb::ConcurrentQueue& evicted_ids, @@ -177,32 +173,40 @@ std::unique_ptr DatabaseCheckpointer::createPipeline( chkpt_ids_inserter->createRecord(); } + auto chkpt_ticks_inserter = tables->getPreparedINSERT("ChkptWindowTicks"); + chkpt_ticks_inserter->setColumnValue(0, bytes_id); + chkpt_ticks_inserter->setColumnValue(1, bytes_in.start_tick); + chkpt_ticks_inserter->setColumnValue(2, bytes_in.end_tick); + chkpt_ticks_inserter->createRecord(); + evicted_ids.emplace(std::move(bytes_in.chkpt_ids)); } ); - // Task 7: Perform cache eviction after a window of checkpoints has been written to SimDB + // Task 6: Perform cache eviction after a window of checkpoints has been written to SimDB auto evict_from_cache = simdb::pipeline::createTask>( [this](EvictedChkptIDs&& evicted_ids, bool /*simulation_terminating*/) mutable { + // TODO cnyce: We are allocating and deallocating a LOT of checkpoints. + // See if we can reuse a pool of them. Could also try to just add a pool + // to the VectorStorage::Segment class. 
+ std::lock_guard lock(cache_mutex_); + for (auto id : evicted_ids) { sparta_assert(id != head_id_); sparta_assert(id != current_id_); - - // TODO cnyce: We are allocating and deallocating a LOT of checkpoints. - // See if we can reuse a pool of them. Could also try to just add a pool - // to the VectorStorage::Segment class. - std::lock_guard lock(mutex_); chkpts_cache_.erase(id); } } ); - *feed_pipeline >> *create_window >> *add_chkpt_ids >> *window_to_bytes >> *zlib_bytes >> *write_to_db >> *evict_from_cache; + *clone_window >> *add_chkpt_ids >> *window_to_bytes >> *zlib_bytes >> *write_to_db >> *evict_from_cache; + + pipeline_ = pipeline.get(); pipeline->createTaskGroup("CheckpointPipeline") - ->addTask(std::move(feed_pipeline)) - ->addTask(std::move(create_window)) + ->addTask(std::move(clone_window)) + ->addTask(std::move(add_chkpt_ids)) ->addTask(std::move(window_to_bytes)) ->addTask(std::move(zlib_bytes)) ->addTask(std::move(evict_from_cache)); @@ -222,66 +226,35 @@ void DatabaseCheckpointer::setSnapshotThreshold(uint32_t thresh) noexcept uint64_t DatabaseCheckpointer::getTotalMemoryUse() const noexcept { - std::lock_guard lock(mutex_); + std::lock_guard lock(cache_mutex_); uint64_t mem = 0; for (const auto& [id, chkpt] : chkpts_cache_) { mem += chkpt->getTotalMemoryUse(); } - mem += chkpt_query_->getTotalMemoryUse(); return mem; } uint64_t DatabaseCheckpointer::getContentMemoryUse() const noexcept { - std::lock_guard lock(mutex_); + std::lock_guard lock(cache_mutex_); uint64_t mem = 0; for (const auto& [id, chkpt] : chkpts_cache_) { - mem += chkpt->getTotalMemoryUse(); + mem += chkpt->getContentMemoryUse(); } - mem += chkpt_query_->getContentMemoryUse(); return mem; } -void DatabaseCheckpointer::deleteCheckpoint(chkpt_id_t id) +void DatabaseCheckpointer::loadCheckpoint(chkpt_id_t id) { - if (!hasCheckpoint(id)) { - throw CheckpointError("Could not delete checkpoint ID=") - << id << " because no checkpoint by this ID was found"; - } - - std::lock_guard 
lock(mutex_); - - if (auto it = chkpts_cache_.find(id); it != chkpts_cache_.end()) { - checkpoint_type* chkpt = it->second.get(); - - // Allow deletion and change ID to UNIDENTIFIED_CHECKPOINT. - // This is still part of a chain though until there are no - // dependencies on it. - if (!chkpt->isFlaggedDeleted()) { - num_dead_checkpoints_++; - if (chkpt->isSnapshot()) { - num_alive_snapshots_--; - } - num_alive_checkpoints_--; - chkpt->flagDeleted(); - } - - // Delete this and all contiguous previous checkpoint which were - // flagged deleted if possible. Stop if current_ is encountered - cleanupChain_(chkpt->getID()); + if (auto c = getCurrent_(); c && c->getID() == id) { + return; } - chkpt_query_->deleteCheckpoint(id); -} - -void DatabaseCheckpointer::loadCheckpoint(chkpt_id_t id) -{ - auto chkpt = findCheckpoint(id); + auto chkpt = cloneCheckpoint(id); chkpt->load(getArchDatas()); // Move current to another checkpoint. Anything between head and the // old current_ is fair game for removal if allowed - checkpoint_type* rmv = static_cast(getCurrent_()); setCurrent_(chkpt.get()); addToCache_(std::move(chkpt)); @@ -290,48 +263,51 @@ void DatabaseCheckpointer::loadCheckpoint(chkpt_id_t id) sched_->restartAt(getCurrentTick()); } - // Remove all checkpoints which can be. Stop if the new current_ is - // encountered again. - // Note that is is OK if current_ was moved to a later position in - // the chain. No important checkpoints will be removed. The - // important thing is never to remove current_. - cleanupChain_(rmv->getID()); + // Delete all future checkpoints past this one. Do this from the cache + // as well as from the database. 
+ auto next_ids = chkpt->getNextIDs(); + if (!next_ids.empty()) { + if (next_ids.size() != 1) { + throw CheckpointError("DatabaseCheckpointer does not support multiple checkpoint branches"); + } + deleteCheckpoint_(next_ids[0]); + } } std::vector DatabaseCheckpointer::getCheckpointsAt(tick_t t) const { - std::lock_guard lock(mutex_); + std::lock_guard lock(cache_mutex_); - std::vector results; + std::unordered_set results; for (const auto& [id, chkpt] : chkpts_cache_) { if (chkpt->getTick() == t && !chkpt->isFlaggedDeleted()) { - results.push_back(id); + results.insert(id); } } for (auto id : chkpt_query_->getCheckpointsAt(t)) { - results.push_back(id); + results.insert(id); } - return results; + return {results.begin(), results.end()}; } std::vector DatabaseCheckpointer::getCheckpoints() const { - std::lock_guard lock(mutex_); + std::lock_guard lock(cache_mutex_); - std::vector results; + std::unordered_set results; for (const auto& [id, chkpt] : chkpts_cache_) { if (!chkpt->isFlaggedDeleted()) { - results.push_back(id); + results.insert(id); } } for (auto id : chkpt_query_->getCheckpoints()) { - results.push_back(id); + results.insert(id); } - return results; + return {results.begin(), results.end()}; } uint32_t DatabaseCheckpointer::getNumCheckpoints() const noexcept @@ -356,7 +332,7 @@ uint32_t DatabaseCheckpointer::getNumDeadCheckpoints() const noexcept std::deque DatabaseCheckpointer::getCheckpointChain(chkpt_id_t id) const { - std::lock_guard lock(mutex_); + std::lock_guard lock(cache_mutex_); std::deque chain; if (!getHead()) { @@ -382,33 +358,36 @@ std::deque DatabaseCheckpointer::getCheckpointChain(chkpt_id_t id) c return chain; } -std::unique_ptr DatabaseCheckpointer::findLatestCheckpointAtOrBefore(tick_t tick, chkpt_id_t from) +std::shared_ptr DatabaseCheckpointer::findLatestCheckpointAtOrBefore(tick_t tick, chkpt_id_t from) { - if (!hasCheckpoint(from)) { - throw CheckpointError("There is no checkpoint with ID ") << from; - } - - std::lock_guard 
lock(mutex_); + std::lock_guard lock(cache_mutex_); auto id = from; do { - auto chkpt = findCheckpoint(id); + auto chkpt = cloneCheckpoint(id); if (chkpt->getTick() <= tick) { break; } id = chkpt->getPrevID(); } while (id != checkpoint_type::UNIDENTIFIED_CHECKPOINT); - return findCheckpoint(id); + return cloneCheckpoint(id); } -std::unique_ptr DatabaseCheckpointer::findCheckpoint(chkpt_id_t id, bool must_exist) const +std::weak_ptr DatabaseCheckpointer::findCheckpoint(chkpt_id_t id) const { - if (!hasCheckpoint(id)) { - throw CheckpointError("There is no checkpoint with ID ") << id; + std::lock_guard lock(cache_mutex_); + + if (auto it = chkpts_cache_.find(id); it != chkpts_cache_.end()) { + return it->second; } - std::lock_guard lock(mutex_); + return std::weak_ptr(); +} + +std::shared_ptr DatabaseCheckpointer::cloneCheckpoint(chkpt_id_t id) const +{ + std::lock_guard lock(cache_mutex_); if (auto it = chkpts_cache_.find(id); it != chkpts_cache_.end()) { return it->second->clone(); @@ -419,7 +398,7 @@ std::unique_ptr DatabaseCheckpointer::findCheckpoint(chkpt_i bool DatabaseCheckpointer::hasCheckpoint(chkpt_id_t id) const noexcept { - std::lock_guard lock(mutex_); + std::lock_guard lock(cache_mutex_); if (chkpts_cache_.find(id) != chkpts_cache_.end()) { return true; @@ -432,7 +411,7 @@ void DatabaseCheckpointer::dumpRestoreChain(std::ostream& o, chkpt_id_t id) cons { auto rc = getRestoreChain(id); while (true) { - const auto chkpt = findCheckpoint(rc.top()); + const auto chkpt = cloneCheckpoint(rc.top()); rc.pop(); if (chkpt->isSnapshot()) { o << '('; @@ -455,7 +434,7 @@ void DatabaseCheckpointer::dumpRestoreChain(std::ostream& o, chkpt_id_t id) cons std::stack DatabaseCheckpointer::getHistoryChain(chkpt_id_t id) const { - std::lock_guard lock(mutex_); + std::lock_guard lock(cache_mutex_); std::stack chain; auto it = chkpts_cache_.find(id); @@ -479,7 +458,7 @@ std::stack DatabaseCheckpointer::getRestoreChain(chkpt_id_t id) cons std::stack chkpts; while (true) { 
chkpts.push(id); - auto chkpt = findCheckpoint(id); + auto chkpt = cloneCheckpoint(id); if (chkpt->isSnapshot()) { break; } @@ -490,12 +469,12 @@ std::stack DatabaseCheckpointer::getRestoreChain(chkpt_id_t id) cons std::vector DatabaseCheckpointer::getNextIDs(chkpt_id_t id) const { - return findCheckpoint(id)->getNextIDs(); + return cloneCheckpoint(id)->getNextIDs();//colby } uint32_t DatabaseCheckpointer::getDistanceToPrevSnapshot(chkpt_id_t id) const noexcept { - std::lock_guard lock(mutex_); + std::lock_guard lock(cache_mutex_); uint32_t dist = 0; auto it = chkpts_cache_.find(id); @@ -531,14 +510,14 @@ uint32_t DatabaseCheckpointer::getDistanceToPrevSnapshot(chkpt_id_t id) const no bool DatabaseCheckpointer::isSnapshot(chkpt_id_t id) const noexcept { - return findCheckpoint(id)->isSnapshot(); + return cloneCheckpoint(id)->isSnapshot(); } bool DatabaseCheckpointer::canDelete(chkpt_id_t id) const noexcept { - std::lock_guard lock(mutex_); + std::lock_guard lock(cache_mutex_); - auto chkpt = findCheckpoint(id); + auto chkpt = cloneCheckpoint(id); if (!chkpt->isFlaggedDeleted()) { return false; } @@ -561,7 +540,7 @@ std::string DatabaseCheckpointer::stringize() const void DatabaseCheckpointer::dumpList(std::ostream& o) const { - std::lock_guard lock(mutex_); + std::lock_guard lock(cache_mutex_); for (const auto& [id, chkpt] : chkpts_cache_) { o << chkpt->stringize() << std::endl; @@ -572,7 +551,7 @@ void DatabaseCheckpointer::dumpList(std::ostream& o) const void DatabaseCheckpointer::dumpData(std::ostream& o) const { - std::lock_guard lock(mutex_); + std::lock_guard lock(cache_mutex_); for (const auto& [id, chkpt] : chkpts_cache_) { chkpt->dumpData(o); @@ -584,7 +563,7 @@ void DatabaseCheckpointer::dumpData(std::ostream& o) const void DatabaseCheckpointer::dumpAnnotatedData(std::ostream& o) const { - std::lock_guard lock(mutex_); + std::lock_guard lock(cache_mutex_); for (const auto& [id, chkpt] : chkpts_cache_) { o << chkpt->stringize() << std::endl; @@ -702,143 
+681,46 @@ chkpt_id_t DatabaseCheckpointer::createCheckpoint_(bool force_snapshot) num_alive_checkpoints_++; num_alive_snapshots_ += (current->isSnapshot() == true) ? 1 : 0; - if (current->isSnapshot()) { - // Clean up starting with this snapshot and moving back. - // May have an opportunity to free older deltas right now - // (instead of upon next deletion) - cleanupChain_(current->getID()); - } - return current->getID(); } -void DatabaseCheckpointer::cleanupChain_(chkpt_id_t id) +void DatabaseCheckpointer::deleteCheckpoint_(chkpt_id_t id) { - std::lock_guard lock(mutex_); - - // In order to truly delete any checkpoints, we must traverse back - // to the previous snapshot (or the head) and forward to the another - // snapshot or the end of the chain. - // ONLY if both of those points can be reached without encountering - // a living checkpoint or the current checkpoint (forward - // only) can the whole chain (including the leading shapshot) be - // deleted. + std::lock_guard lock(cache_mutex_); - if (id == getHeadID()) { - // Cannot delete head of checkpoint tree - return; - } - - // Walk forward to another snapshot or current - const bool needed_later = (getCurrentID() == id) || recursForwardFindAlive_(id); - if (needed_later) { - // Cannot delete because a later living checkpoint (or current) depends on this. - auto chkpt = findCheckpoint(id); - if (chkpt->isSnapshot()) { - // This snapshot is needed later. Move to previous delta and work from there. - id = chkpt->getPrevID(); - } else { - return; // This delta is needed. Therefore all preceeding deltas are needed. - } - } - - // Delete backward until current, head, or a non-flagged-deleted checkpoint is hit. 
- // It is possible to fracture the checkpoint tree by deleting a segment - // between two snapshots, so prev can end up with nothing leading up to it while (true) { - if (id == checkpoint_type::UNIDENTIFIED_CHECKPOINT) { - break; - } - - if (id == getHeadID()) { + auto it = chkpts_cache_.find(id); + if (it == chkpts_cache_.end()) { break; } - auto chkpt = findCheckpoint(id); - if (!chkpt->isFlaggedDeleted()) { - break; - } - - // If the checkpoint to delete is the current checkpoint, then - // We cannot just set current to the previous checkpoint because - // we may have run forward and storing a checkpoint in the - // future would depend on the checkpoint we are about to delete. - // This could be fixed by requiring the next checkpoint to be a - // spapshot. Instead, point to the flagged-deleted checkpoint - // and do not delete - if (getCurrentID() == id) { - return; + { + std::lock_guard lock2(dead_chkpts_mutex_); + dead_chkpt_ids_.insert(id); } - auto prev = findCheckpoint(chkpt->getPrevID(), false); - - // If nothing later in the chain (tree) depends on d's data, it can be deleted. - // This also patches the checkpoint tree around the deleted checkpoint - //! 
\todo canDelete is recursive at worst and might benefit from optimization - if (chkpt->canDelete()) { - // Get checkpoint id regardless of whether alive or dead - chkpt_id_t id = chkpt->getID(); - if (chkpt->isFlaggedDeleted()) { - id = chkpt->getDeletedID(); + auto next_ids = it->second->getNextIDs(); + if (!next_ids.empty()) { + if (next_ids.size() != 1) { + throw CheckpointError("DatabaseCheckpointer does not support multiple checkpoint branches"); } - - num_dead_checkpoints_--; - - // Erase element in the cache/DB - disconnectChainLink_(id); - } - - // Continue until head is reached - if (prev) { - id = prev->getID(); + id = next_ids[0]; + chkpts_cache_.erase(it); } else { + id = checkpoint_type::UNIDENTIFIED_CHECKPOINT; break; } } -} -void DatabaseCheckpointer::disconnectChainLink_(chkpt_id_t id) -{ - //TODO cnyce - (void)id; -} - -bool DatabaseCheckpointer::recursForwardFindAlive_(chkpt_id_t id) const -{ - const auto next_ids = getNextIDs(id); - - for (const auto next_id : next_ids) { - auto chkpt = findCheckpoint(next_id); - // Only check descendants for snapshot-ness - if (chkpt->isSnapshot()) { - // Found a live snapshot that ends this branch. chkpt is not needed - // after this - return false; - } - if (next_id == getCurrentID()) { - // Found current in this search chain - return true; - } - if (chkpt->isFlaggedDeleted() == false) { - // Encountered a checkpoint later in the chain that still - // depends on this. - return true; - } - - // Continue the search recursively - if (recursForwardFindAlive_(next_id)) { - return true; - } + if (id != checkpoint_type::UNIDENTIFIED_CHECKPOINT) { + chkpt_query_->deleteCheckpoint(id); } - - // Found nothing alive. 
- return false; } void DatabaseCheckpointer::dumpCheckpointNode_(const chkpt_id_t id, std::ostream& o) const { static std::string SNAPSHOT_NOTICE = "(s)"; - auto cp = findCheckpoint(id); + auto cp = cloneCheckpoint(id); // Draw data for this checkpoint if (cp->isFlaggedDeleted()) { @@ -854,7 +736,7 @@ void DatabaseCheckpointer::dumpCheckpointNode_(const chkpt_id_t id, std::ostream std::vector DatabaseCheckpointer::getNextIDs_(chkpt_id_t id) const { - return findCheckpoint(id)->getNextIDs(); + return cloneCheckpoint(id)->getNextIDs(); } void DatabaseCheckpointer::setHead_(CheckpointBase* head) @@ -871,7 +753,7 @@ void DatabaseCheckpointer::setCurrent_(CheckpointBase* current) void DatabaseCheckpointer::setHeadID_(chkpt_id_t id) { - std::lock_guard lock(mutex_); + std::lock_guard lock(cache_mutex_); sparta_assert(id != checkpoint_type::UNIDENTIFIED_CHECKPOINT); sparta_assert(head_id_ == checkpoint_type::UNIDENTIFIED_CHECKPOINT); head_id_ = id; @@ -879,38 +761,19 @@ void DatabaseCheckpointer::setHeadID_(chkpt_id_t id) void DatabaseCheckpointer::setCurrentID_(chkpt_id_t id) { - std::lock_guard lock(mutex_); + std::lock_guard lock(cache_mutex_); sparta_assert(id != checkpoint_type::UNIDENTIFIED_CHECKPOINT); current_id_ = id; } void DatabaseCheckpointer::addToCache_(std::shared_ptr chkpt) { - std::lock_guard lock(mutex_); + std::lock_guard lock(cache_mutex_); auto id = chkpt->getID(); - chkpt_ids_for_pipeline_head_.push(id); - auto& cp = chkpts_cache_[id]; sparta_assert(!cp); cp = std::move(chkpt); -} - -bool DatabaseCheckpointer::cloneNextPipelineHeadCheckpoint_(std::shared_ptr& next) -{ - std::lock_guard lock(mutex_); - if (chkpt_ids_for_pipeline_head_.empty()) { - return false; - } - - auto next_id = chkpt_ids_for_pipeline_head_.front(); - chkpt_ids_for_pipeline_head_.pop(); - - auto it = chkpts_cache_.find(next_id); - sparta_assert(it != chkpts_cache_.end()); - - auto& next_chkpt = it->second; - next = next_chkpt->clone(); - return true; + 
chkpts_queue_.emplace_back(cp); } REGISTER_SIMDB_APPLICATION(DatabaseCheckpointer); diff --git a/sparta/test/FastCheckpoint/DatabaseCheckpoint/DatabaseCheckpoint_test.cpp b/sparta/test/FastCheckpoint/DatabaseCheckpoint/DatabaseCheckpoint_test.cpp index 9982b82b28..ba647353d9 100644 --- a/sparta/test/FastCheckpoint/DatabaseCheckpoint/DatabaseCheckpoint_test.cpp +++ b/sparta/test/FastCheckpoint/DatabaseCheckpoint/DatabaseCheckpoint_test.cpp @@ -142,7 +142,7 @@ void generalTest() for (auto id : chkpt_ids) { EXPECT_NOTHROW(dbcp.loadCheckpoint(id)); - auto chkpt = dbcp.findCheckpoint(id); + auto chkpt = dbcp.cloneCheckpoint(id); uint32_t expected_r1 = id * 5ul; EXPECT_EQUAL(r1->read(), expected_r1); From 5cfeb76c667e52037f29efb55b40f55c4b308075 Mon Sep 17 00:00:00 2001 From: Colby Nyce Date: Tue, 26 Aug 2025 13:50:48 -0500 Subject: [PATCH 08/30] Database-backed checkpointer --- .../serialization/checkpoint/DatabaseCheckpointQuery.hpp | 3 --- sparta/src/DatabaseCheckpointQuery.cpp | 6 +++--- sparta/test/FastCheckpoint/CMakeLists.txt | 2 +- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/sparta/sparta/serialization/checkpoint/DatabaseCheckpointQuery.hpp b/sparta/sparta/serialization/checkpoint/DatabaseCheckpointQuery.hpp index 8bf2a10eec..8844244e1f 100644 --- a/sparta/sparta/serialization/checkpoint/DatabaseCheckpointQuery.hpp +++ b/sparta/sparta/serialization/checkpoint/DatabaseCheckpointQuery.hpp @@ -28,7 +28,6 @@ class DatabaseCheckpointQuery : public Checkpointer TreeNode& root, Scheduler* sched=nullptr) : Checkpointer(root, sched) - , checkpointer_(checkpointer) , db_mgr_(db_mgr) {} @@ -75,8 +74,6 @@ class DatabaseCheckpointQuery : public Checkpointer std::vector getNextIDs_(chkpt_id_t id) const override; - mutable DatabaseCheckpointer* checkpointer_ = nullptr; - mutable simdb::DatabaseManager* db_mgr_ = nullptr; }; diff --git a/sparta/src/DatabaseCheckpointQuery.cpp b/sparta/src/DatabaseCheckpointQuery.cpp index 22d96fb03f..acb3e6a54e 100644 --- 
a/sparta/src/DatabaseCheckpointQuery.cpp +++ b/sparta/src/DatabaseCheckpointQuery.cpp @@ -112,19 +112,19 @@ bool DatabaseCheckpointQuery::hasCheckpoint(chkpt_id_t id) const noexcept void DatabaseCheckpointQuery::dumpList(std::ostream& o) const { - //TODO cnyce: look back + //TODO cnyce (void)o; } void DatabaseCheckpointQuery::dumpData(std::ostream& o) const { - //TODO cnyce: look back + //TODO cnyce (void)o; } void DatabaseCheckpointQuery::dumpAnnotatedData(std::ostream& o) const { - //TODO cnyce: look back + //TODO cnyce (void)o; } diff --git a/sparta/test/FastCheckpoint/CMakeLists.txt b/sparta/test/FastCheckpoint/CMakeLists.txt index 5a94bdd494..6e3e8c6f72 100644 --- a/sparta/test/FastCheckpoint/CMakeLists.txt +++ b/sparta/test/FastCheckpoint/CMakeLists.txt @@ -6,4 +6,4 @@ sparta_test(FastCheckpoint_test FastCheckpoint_test_RUN) add_subdirectory(FILEStream) add_subdirectory(PersistentFastCheckpoint) -add_subdirectory(DatabaseCheckpoint) +#add_subdirectory(DatabaseCheckpoint) From 830c46364381ddcea4cd31c9f35780d9a2c8000e Mon Sep 17 00:00:00 2001 From: Colby Nyce Date: Tue, 16 Sep 2025 08:42:01 -0500 Subject: [PATCH 09/30] Database-backed checkpointer --- sparta/simdb | 2 +- .../checkpoint/DatabaseCheckpoint.hpp | 6 +- .../checkpoint/DatabaseCheckpointQuery.hpp | 12 +- .../checkpoint/DatabaseCheckpointer.hpp | 23 +- sparta/src/DatabaseCheckpoint.cpp | 10 +- sparta/src/DatabaseCheckpointQuery.cpp | 76 ++-- sparta/src/DatabaseCheckpointer.cpp | 279 +++++++++++---- sparta/test/FastCheckpoint/CMakeLists.txt | 2 +- .../DatabaseCheckpoint_test.cpp | 331 ++++++++++++++++-- 9 files changed, 594 insertions(+), 147 deletions(-) diff --git a/sparta/simdb b/sparta/simdb index 394f8bea37..8408761136 160000 --- a/sparta/simdb +++ b/sparta/simdb @@ -1 +1 @@ -Subproject commit 394f8bea37cf6684f938fc4ffe7cd4471b4a9f83 +Subproject commit 84087611361581ed390501bb0dd2b0e6c9f714ea diff --git a/sparta/sparta/serialization/checkpoint/DatabaseCheckpoint.hpp 
b/sparta/sparta/serialization/checkpoint/DatabaseCheckpoint.hpp index ea27e94689..3ae07eed43 100644 --- a/sparta/sparta/serialization/checkpoint/DatabaseCheckpoint.hpp +++ b/sparta/sparta/serialization/checkpoint/DatabaseCheckpoint.hpp @@ -69,8 +69,10 @@ namespace sparta::serialization::checkpoint bool is_snapshot, DatabaseCheckpointer* checkpointer); - //! \brief This constructor is called during checkpoing cloning - DatabaseCheckpoint(chkpt_id_t prev_id, + //! \brief This constructor is called during checkpoint cloning + DatabaseCheckpoint(chkpt_id_t id, + tick_t tick, + chkpt_id_t prev_id, const std::vector& next_ids, chkpt_id_t deleted_id, bool is_snapshot, diff --git a/sparta/sparta/serialization/checkpoint/DatabaseCheckpointQuery.hpp b/sparta/sparta/serialization/checkpoint/DatabaseCheckpointQuery.hpp index 8844244e1f..8d6d7287e6 100644 --- a/sparta/sparta/serialization/checkpoint/DatabaseCheckpointQuery.hpp +++ b/sparta/sparta/serialization/checkpoint/DatabaseCheckpointQuery.hpp @@ -3,6 +3,7 @@ #pragma once #include "sparta/serialization/checkpoint/Checkpointer.hpp" +#include namespace simdb { @@ -29,7 +30,8 @@ class DatabaseCheckpointQuery : public Checkpointer Scheduler* sched=nullptr) : Checkpointer(root, sched) , db_mgr_(db_mgr) - {} + { + } uint64_t getTotalMemoryUse() const noexcept override; @@ -49,6 +51,10 @@ class DatabaseCheckpointQuery : public Checkpointer bool hasCheckpoint(chkpt_id_t id) const noexcept override; + bool isSnapshot(chkpt_id_t id) const noexcept; + + bool canDelete(chkpt_id_t id) const noexcept; + void dumpList(std::ostream& o) const override; void dumpData(std::ostream& o) const override; @@ -61,7 +67,7 @@ class DatabaseCheckpointQuery : public Checkpointer chkpt_id_t getPrevID(chkpt_id_t id) const; - std::vector getNextIDs(chkpt_id_t id) const; + std::vector getNextIDs(chkpt_id_t id, bool immediate_only = true) const; uint32_t getDistanceToPrevSnapshot(chkpt_id_t id) const noexcept; @@ -75,6 +81,8 @@ class 
DatabaseCheckpointQuery : public Checkpointer std::vector getNextIDs_(chkpt_id_t id) const override; mutable simdb::DatabaseManager* db_mgr_ = nullptr; + + std::unordered_set tagged_deleted_ids_; }; } // namespace sparta::serialization::checkpoint diff --git a/sparta/sparta/serialization/checkpoint/DatabaseCheckpointer.hpp b/sparta/sparta/serialization/checkpoint/DatabaseCheckpointer.hpp index 257619a7ee..ff317056a7 100644 --- a/sparta/sparta/serialization/checkpoint/DatabaseCheckpointer.hpp +++ b/sparta/sparta/serialization/checkpoint/DatabaseCheckpointer.hpp @@ -97,14 +97,12 @@ class DatabaseCheckpointer : public simdb::App, public Checkpointer /*! * \brief Explicit checkpoint deletion is NOT supported by this checkpointer. */ - void deleteCheckpoint(chkpt_id_t) override final { - throw CheckpointError("deleteCheckpoint() not supported"); - } + void deleteCheckpoint(chkpt_id_t) override final; /*! * \brief Loads state from a specific checkpoint by ID - * \note Does not delete checkpoints. Checkpoints must be explicitly - * deleted by deleteCheckpoint + * \note This implicitly deletes all future checkpoints since this checkpointer + * does not allow more than one branch. * \throw CheckpointError if id does not refer to checkpoint that exists * or if checkpoint could not be load. * \warning If checkpoint fails during loading for reasons other than an @@ -208,7 +206,7 @@ class DatabaseCheckpointer : public simdb::App, public Checkpointer * \note SEARCHES BOTH THE CACHE AND THE DATABASE * \return Checkpoint with ID of \a id if found or nullptr if not found */ - std::shared_ptr cloneCheckpoint(chkpt_id_t id) const; + std::shared_ptr cloneCheckpoint(chkpt_id_t id, bool must_exist=true) const; /*! * \brief Tests whether this checkpoint manager has a checkpoint with @@ -410,23 +408,20 @@ class DatabaseCheckpointer : public simdb::App, public Checkpointer //! \brief Subset (or all of) our checkpoints that we currently are holding in memory. 
std::unordered_map> chkpts_cache_; - //! \brief Subset (or all of) our checkpoints in the cache that haven't been send down the pipeline. - std::vector> chkpts_queue_; + //! \brief Ordered list of checkpoint windows (snapshot + deltas). + std::deque> chkpt_windows_; //! \brief SQLite query object to "extend" the checkpoint search space from just the //! cache to include the database. Combinations of in-memory checkpoints, recreated //! checkpoints, and database schema/query optimizations are used for performance. std::shared_ptr chkpt_query_; + //! \brief IDs of checkpoints pending eviction from the cache once they are no longer current. + std::queue pending_eviction_ids_; + //! \brief Mutex to protect our checkpoints cache. mutable std::recursive_mutex cache_mutex_; - //! \brief Set of dead checkpoint IDs. - std::unordered_set dead_chkpt_ids_; - - //! \brief Mutex to protect our set of dead checkpoint IDs. - mutable std::recursive_mutex dead_chkpts_mutex_; - //! \brief SimDB instance simdb::DatabaseManager* db_mgr_ = nullptr; diff --git a/sparta/src/DatabaseCheckpoint.cpp b/sparta/src/DatabaseCheckpoint.cpp index 9934602d9e..014f4adfd0 100644 --- a/sparta/src/DatabaseCheckpoint.cpp +++ b/sparta/src/DatabaseCheckpoint.cpp @@ -47,13 +47,15 @@ DatabaseCheckpoint::DatabaseCheckpoint(TreeNode& root, } } -DatabaseCheckpoint::DatabaseCheckpoint(chkpt_id_t prev_id, +DatabaseCheckpoint::DatabaseCheckpoint(chkpt_id_t id, + tick_t tick, + chkpt_id_t prev_id, const std::vector& next_ids, chkpt_id_t deleted_id, bool is_snapshot, const storage::VectorStorage& storage, DatabaseCheckpointer* checkpointer) - : CheckpointBase(getID(), getTick()) + : CheckpointBase(id, tick) , prev_id_(prev_id) , next_ids_(next_ids) , deleted_id_(deleted_id) @@ -120,7 +122,7 @@ std::vector DatabaseCheckpoint::getNextIDs() const void DatabaseCheckpoint::load(const std::vector& dats) { - // BUild stack up to last snapshot + // Build stack up to last snapshot std::stack chkpt_ids = getRestoreChain(); 
// Load in proper order @@ -193,7 +195,7 @@ void DatabaseCheckpoint::loadState(const std::vector& dats) std::unique_ptr DatabaseCheckpoint::clone() const { - auto clone = new DatabaseCheckpoint(prev_id_, next_ids_, deleted_id_, is_snapshot_, data_, checkpointer_); + auto clone = new DatabaseCheckpoint(getID(), getTick(), prev_id_, next_ids_, deleted_id_, is_snapshot_, data_, checkpointer_); return std::unique_ptr(clone); } diff --git a/sparta/src/DatabaseCheckpointQuery.cpp b/sparta/src/DatabaseCheckpointQuery.cpp index acb3e6a54e..182a255be4 100644 --- a/sparta/src/DatabaseCheckpointQuery.cpp +++ b/sparta/src/DatabaseCheckpointQuery.cpp @@ -19,22 +19,24 @@ using tick_t = typename DatabaseCheckpointQuery::tick_t; uint64_t DatabaseCheckpointQuery::getTotalMemoryUse() const noexcept { + //TODO cnyce return 0; } uint64_t DatabaseCheckpointQuery::getContentMemoryUse() const noexcept { + //TODO cnyce return 0; } -void DatabaseCheckpointQuery::deleteCheckpoint(chkpt_id_t) +void DatabaseCheckpointQuery::deleteCheckpoint(chkpt_id_t id) { - throw CheckpointError("deleteCheckpoint() not supported"); + tagged_deleted_ids_.insert(id); } void DatabaseCheckpointQuery::loadCheckpoint(chkpt_id_t) { - throw CheckpointError("loadCheckpoint() not supported"); + throw CheckpointError("DatabaseCheckpointQuery::loadCheckpoint() not supported"); } std::vector DatabaseCheckpointQuery::getCheckpointsAt(tick_t t) const @@ -106,10 +108,26 @@ bool DatabaseCheckpointQuery::hasCheckpoint(chkpt_id_t id) const noexcept { auto query = db_mgr_->createQuery("ChkptWindowIDs"); query->addConstraintForUInt64("ChkptID", simdb::Constraints::EQUAL, id); + + int bytes_id; + query->select("ChkptWindowBytesID", bytes_id); + auto results = query->getResultSet(); return results.getNextRecord(); } +bool DatabaseCheckpointQuery::isSnapshot(chkpt_id_t id) const noexcept +{ + auto chkpt = findCheckpoint(id); + return chkpt ? 
chkpt->isSnapshot() : false; +} + +bool DatabaseCheckpointQuery::canDelete(chkpt_id_t id) const noexcept +{ + auto chkpt = findCheckpoint(id); + return chkpt ? chkpt->canDelete() : false; +} + void DatabaseCheckpointQuery::dumpList(std::ostream& o) const { //TODO cnyce @@ -145,7 +163,7 @@ std::shared_ptr DatabaseCheckpointQuery::findCheckpoint(chkp auto query = db_mgr_->createQuery("ChkptWindowIDs"); query->addConstraintForUInt64("ChkptID", simdb::Constraints::EQUAL, id); - int window_id; + int window_id = 404; query->select("ChkptWindowBytesID", window_id); auto results1 = query->getResultSet(); @@ -163,7 +181,12 @@ std::shared_ptr DatabaseCheckpointQuery::findCheckpoint(chkp query->select("WindowBytes", bytes); auto results2 = query->getResultSet(); - sparta_assert(results2.getNextRecord()); + if (!results2.getNextRecord()) { + if (must_exist) { + throw CheckpointError("There is no checkpoint with ID ") << id; + } + return nullptr; + } // "Undo" task 5 (zlib compression) std::vector uncompressed; @@ -194,10 +217,30 @@ chkpt_id_t DatabaseCheckpointQuery::getPrevID(chkpt_id_t id) const return chkpt->getPrevID(); } -std::vector DatabaseCheckpointQuery::getNextIDs(chkpt_id_t id) const +std::vector DatabaseCheckpointQuery::getNextIDs(chkpt_id_t id, bool immediate_only) const { - auto chkpt = findCheckpoint(id, true); - return chkpt->getNextIDs(); + std::vector next_ids; + while (true) { + auto chkpt = findCheckpoint(id, false); + if (!chkpt) { + break; + } + + auto ids = chkpt->getNextIDs(); + if (ids.empty()) { + break; + } + + next_ids.insert(next_ids.end(), ids.begin(), ids.end()); + if (immediate_only) { + break; + } + + assert(ids.size() == 1); + id = ids[0]; + } + + return next_ids; } uint32_t DatabaseCheckpointQuery::getDistanceToPrevSnapshot(chkpt_id_t id) const noexcept @@ -217,22 +260,9 @@ chkpt_id_t DatabaseCheckpointQuery::createCheckpoint_(bool) throw CheckpointError("Cannot create checkpoint head for DatabaseCheckpointQuery"); } -void 
DatabaseCheckpointQuery::dumpCheckpointNode_(const chkpt_id_t id, std::ostream& o) const +void DatabaseCheckpointQuery::dumpCheckpointNode_(const chkpt_id_t, std::ostream&) const { - static std::string SNAPSHOT_NOTICE = "(s)"; - auto chkpt = findCheckpoint(id, true); - - // Draw data for this checkpoint - if (chkpt->isFlaggedDeleted()) { - o << chkpt->getDeletedRepr(); - } else { - o << chkpt->getID(); - } - - // Show that this is a snapshot - if (chkpt->isSnapshot()) { - o << ' ' << SNAPSHOT_NOTICE; - } + throw CheckpointError("Cannot dump checkpoint node for DatabaseCheckpointQuery"); } std::vector DatabaseCheckpointQuery::getNextIDs_(chkpt_id_t id) const diff --git a/sparta/src/DatabaseCheckpointer.cpp b/sparta/src/DatabaseCheckpointer.cpp index e3cb0515dc..bfc377b8f3 100644 --- a/sparta/src/DatabaseCheckpointer.cpp +++ b/sparta/src/DatabaseCheckpointer.cpp @@ -50,14 +50,14 @@ void DatabaseCheckpointer::defineSchema(simdb::Schema& schema) window_ids.addColumn("ChkptWindowBytesID", dt::int32_t); window_ids.addColumn("ChkptID", dt::int32_t); window_ids.createIndexOn("ChkptID"); - window_ids.disableAutoIncPrimaryKey(); + //window_ids.disableAutoIncPrimaryKey(); auto& window_ticks = schema.addTable("ChkptWindowTicks"); window_ticks.addColumn("ChkptWindowBytesID", dt::int32_t); - window_ticks.addColumn("StartTick", dt::uint64_t); - window_ticks.addColumn("EndTick", dt::uint64_t); + window_ticks.addColumn("StartTick", dt::int32_t); + window_ticks.addColumn("EndTick", dt::int32_t); window_ticks.createCompoundIndexOn({"StartTick", "EndTick"}); - window_ticks.disableAutoIncPrimaryKey(); + //window_ticks.disableAutoIncPrimaryKey(); } std::unique_ptr DatabaseCheckpointer::createPipeline( @@ -67,33 +67,51 @@ std::unique_ptr DatabaseCheckpointer::createPipeline( // Task 1: Clone an entire checkpoint window (snapshot plus all deltas until next snapshot) auto clone_window = simdb::pipeline::createTask>( - [this](simdb::ConcurrentQueue& out, bool /*simulation_terminating*/) 
mutable -> bool + [this](simdb::ConcurrentQueue& out, bool simulation_terminating) mutable -> bool { std::lock_guard lock(cache_mutex_); - if (chkpts_queue_.size() < getSnapshotThreshold()) { - return false; - } + bool sent = false; - checkpoint_ptrs chkpts; - auto it = chkpts_queue_.begin(); - while (chkpts.size() < getSnapshotThreshold()) { - if (auto c = it->lock()) { + auto send_window = [&]() { + auto window = std::move(chkpt_windows_.front()); + chkpt_windows_.pop_front(); + + checkpoint_ptrs chkpts; + for (auto id : window) { + auto it = chkpts_cache_.find(id); + if (it == chkpts_cache_.end()) { + throw CheckpointError("Invalid checkpoint - has been deleted"); + } + const auto& c = it->second; if (chkpts.empty() && !c->isSnapshot()) { - throw CheckpointError("Invalid checkpoint - only one snapshot per window"); + throw CheckpointError("Invalid checkpoint - first in window is not a snapshot"); } else if (!chkpts.empty() && c->isSnapshot()) { throw CheckpointError("Invalid checkpoint - only one snapshot per window"); } chkpts.emplace_back(c->clone()); - ++it; - } else { - throw CheckpointError("Invalid checkpoint - has been deleted"); } + + if (!chkpts.empty()) { + out.emplace(std::move(chkpts)); + sent = true; + } + }; + + // Note the >2 is to ensure we always have at least one complete window + // in the cache for fast APIs on very recent checkpoints. The second + // window may be partial so we can't send it yet. + while (chkpt_windows_.size() > 2) { + send_window(); + } + + // If we are terminating, send all remaining windows. 
+ while (!chkpt_windows_.empty() && simulation_terminating) { + send_window(); } - chkpts_queue_.erase(chkpts_queue_.begin(), it); - return true; + return sent; } ); @@ -169,14 +187,14 @@ std::unique_ptr DatabaseCheckpointer::createPipeline( auto chkpt_ids_inserter = tables->getPreparedINSERT("ChkptWindowIDs"); chkpt_ids_inserter->setColumnValue(0, bytes_id); for (auto id : bytes_in.chkpt_ids) { - chkpt_ids_inserter->setColumnValue(1, id); + chkpt_ids_inserter->setColumnValue(1, (int)id); chkpt_ids_inserter->createRecord(); } auto chkpt_ticks_inserter = tables->getPreparedINSERT("ChkptWindowTicks"); chkpt_ticks_inserter->setColumnValue(0, bytes_id); - chkpt_ticks_inserter->setColumnValue(1, bytes_in.start_tick); - chkpt_ticks_inserter->setColumnValue(2, bytes_in.end_tick); + chkpt_ticks_inserter->setColumnValue(1, (int)bytes_in.start_tick); + chkpt_ticks_inserter->setColumnValue(2, (int)bytes_in.end_tick); chkpt_ticks_inserter->createRecord(); evicted_ids.emplace(std::move(bytes_in.chkpt_ids)); @@ -185,16 +203,25 @@ std::unique_ptr DatabaseCheckpointer::createPipeline( // Task 6: Perform cache eviction after a window of checkpoints has been written to SimDB auto evict_from_cache = simdb::pipeline::createTask>( - [this](EvictedChkptIDs&& evicted_ids, bool /*simulation_terminating*/) mutable + [this](EvictedChkptIDs&& evicted_ids, bool simulation_terminating) mutable { + return;//TODO cnyce + // TODO cnyce: We are allocating and deallocating a LOT of checkpoints. // See if we can reuse a pool of them. Could also try to just add a pool // to the VectorStorage::Segment class. 
std::lock_guard lock(cache_mutex_); for (auto id : evicted_ids) { - sparta_assert(id != head_id_); - sparta_assert(id != current_id_); + if (id == head_id_) { + continue; + } + + if (id == current_id_) { + pending_eviction_ids_.push(id); + continue; + } + chkpts_cache_.erase(id); } } @@ -244,25 +271,28 @@ uint64_t DatabaseCheckpointer::getContentMemoryUse() const noexcept return mem; } +void DatabaseCheckpointer::deleteCheckpoint(chkpt_id_t) +{ + // TODO cnyce + throw CheckpointError("deleteCheckpoint() not supported"); +} + void DatabaseCheckpointer::loadCheckpoint(chkpt_id_t id) { - if (auto c = getCurrent_(); c && c->getID() == id) { + std::lock_guard lock(cache_mutex_); + + if (auto c = getCurrent_(); !c || (c && c->getID() == id)) { return; } - auto chkpt = cloneCheckpoint(id); - chkpt->load(getArchDatas()); - - // Move current to another checkpoint. Anything between head and the - // old current_ is fair game for removal if allowed - setCurrent_(chkpt.get()); - addToCache_(std::move(chkpt)); - - // Restore scheduler tick number - if (sched_) { - sched_->restartAt(getCurrentTick()); + auto chkpt = (id == head_id_) ? findCheckpoint(id).lock() : cloneCheckpoint(id); + if (!chkpt) { + throw CheckpointError("There is no checkpoint with ID ") << id; } + chkpt->load(getArchDatas()); + //TODO cnyce: chkpts_cache_.erase(id); + // Delete all future checkpoints past this one. Do this from the cache // as well as from the database. auto next_ids = chkpt->getNextIDs(); @@ -272,6 +302,27 @@ void DatabaseCheckpointer::loadCheckpoint(chkpt_id_t id) } deleteCheckpoint_(next_ids[0]); } + + // Detach future checkpoints from this one as they are deleted. + chkpt->next_ids_.clear(); + + // Move current to this checkpoint. + setCurrent_(chkpt.get()); + + // Add this checkpoint to the cache if not the head checkpoint. + // The head checkpoint is always in the cache. 
+ if (id != head_id_) { + addToCache_(std::move(chkpt)); + } + + // Increasing-by-one, starting-at-zero checkpoint IDs guarantee we can do this: + num_alive_checkpoints_ = id + 1; + next_chkpt_id_ = id + 1; + + // Restore scheduler tick number + if (sched_) { + sched_->restartAt(getCurrentTick()); + } } std::vector DatabaseCheckpointer::getCheckpointsAt(tick_t t) const @@ -289,7 +340,9 @@ std::vector DatabaseCheckpointer::getCheckpointsAt(tick_t t) const results.insert(id); } - return {results.begin(), results.end()}; + std::vector chkpts(results.begin(), results.end()); + std::sort(chkpts.begin(), chkpts.end()); + return chkpts; } std::vector DatabaseCheckpointer::getCheckpoints() const @@ -307,7 +360,9 @@ std::vector DatabaseCheckpointer::getCheckpoints() const results.insert(id); } - return {results.begin(), results.end()}; + std::vector chkpts(results.begin(), results.end()); + std::sort(chkpts.begin(), chkpts.end()); + return chkpts; } uint32_t DatabaseCheckpointer::getNumCheckpoints() const noexcept @@ -385,7 +440,7 @@ std::weak_ptr DatabaseCheckpointer::findCheckpoint(chkpt_id_ return std::weak_ptr(); } -std::shared_ptr DatabaseCheckpointer::cloneCheckpoint(chkpt_id_t id) const +std::shared_ptr DatabaseCheckpointer::cloneCheckpoint(chkpt_id_t id, bool must_exist) const { std::lock_guard lock(cache_mutex_); @@ -393,7 +448,15 @@ std::shared_ptr DatabaseCheckpointer::cloneCheckpoint(chkpt_ return it->second->clone(); } - return chkpt_query_->findCheckpoint(id); + auto chkpt = chkpt_query_->findCheckpoint(id); + if (!chkpt && must_exist) { + throw CheckpointError("There is no checkpoint with ID ") << id; + } else if (!chkpt) { + return nullptr; + } + + chkpt->checkpointer_ = const_cast(this); + return chkpt; } bool DatabaseCheckpointer::hasCheckpoint(chkpt_id_t id) const noexcept @@ -469,7 +532,13 @@ std::stack DatabaseCheckpointer::getRestoreChain(chkpt_id_t id) cons std::vector DatabaseCheckpointer::getNextIDs(chkpt_id_t id) const { - return 
cloneCheckpoint(id)->getNextIDs();//colby + std::lock_guard lock(cache_mutex_); + + if (auto it = chkpts_cache_.find(id); it != chkpts_cache_.end()) { + return it->second->getNextIDs(); + } + + return chkpt_query_->getNextIDs(id); } uint32_t DatabaseCheckpointer::getDistanceToPrevSnapshot(chkpt_id_t id) const noexcept @@ -510,25 +579,34 @@ uint32_t DatabaseCheckpointer::getDistanceToPrevSnapshot(chkpt_id_t id) const no bool DatabaseCheckpointer::isSnapshot(chkpt_id_t id) const noexcept { - return cloneCheckpoint(id)->isSnapshot(); + std::lock_guard lock(cache_mutex_); + + if (auto it = chkpts_cache_.find(id); it != chkpts_cache_.end()) { + return it->second->isSnapshot(); + } + + return chkpt_query_->isSnapshot(id); } bool DatabaseCheckpointer::canDelete(chkpt_id_t id) const noexcept { std::lock_guard lock(cache_mutex_); - auto chkpt = cloneCheckpoint(id); - if (!chkpt->isFlaggedDeleted()) { - return false; - } - - for (auto next_id : chkpt->getNextIDs()) { - if (!canDelete(next_id) && !isSnapshot(next_id)) { + if (auto it = chkpts_cache_.find(id); it != chkpts_cache_.end()) { + if (!it->second->isFlaggedDeleted()) { return false; } + + for (auto next_id : it->second->getNextIDs()) { + if (!canDelete(next_id) && !isSnapshot(next_id)) { + return false; + } + } + + return true; } - return false; + return chkpt_query_->canDelete(id); } std::string DatabaseCheckpointer::stringize() const @@ -592,6 +670,8 @@ void DatabaseCheckpointer::traceValue( void DatabaseCheckpointer::createHead_() { + std::lock_guard lock(cache_mutex_); + tick_t tick = 0; if (sched_) { tick = sched_->getCurrentTick(); @@ -626,6 +706,8 @@ void DatabaseCheckpointer::createHead_() chkpt_id_t DatabaseCheckpointer::createCheckpoint_(bool force_snapshot) { + std::lock_guard lock(cache_mutex_); + bool is_snapshot; checkpoint_type* prev; @@ -688,39 +770,76 @@ void DatabaseCheckpointer::deleteCheckpoint_(chkpt_id_t id) { std::lock_guard lock(cache_mutex_); + // Purge all future checkpoints from 
chkpt_windows_. + for (auto it = chkpt_windows_.begin(); it != chkpt_windows_.end(); ++it) { + auto& window = *it; + + // Because IDs are monotonically increasing, we can skip windows + if (window.empty() || id < window.front()) { + // ID cannot be in this or any future window + chkpt_windows_.erase(it, chkpt_windows_.end()); + break; + } + + if (id > window.back()) { + // ID cannot be in this window, continue searching + continue; + } + + // ID must be within this window + auto pos = std::find(window.begin(), window.end(), id); + if (pos != window.end()) { + window.erase(pos, window.end()); + if (window.empty()) { + it = chkpt_windows_.erase(it); + } else { + ++it; + } + if (it != chkpt_windows_.end()) { + chkpt_windows_.erase(it, chkpt_windows_.end()); + } + break; + } + } + + // Purge from the database + chkpt_query_->deleteCheckpoint(id); + + // Purge from the cache while (true) { auto it = chkpts_cache_.find(id); if (it == chkpts_cache_.end()) { break; } - { - std::lock_guard lock2(dead_chkpts_mutex_); - dead_chkpt_ids_.insert(id); - } - auto next_ids = it->second->getNextIDs(); + //TODO cnyce: chkpts_cache_.erase(it); + if (!next_ids.empty()) { if (next_ids.size() != 1) { throw CheckpointError("DatabaseCheckpointer does not support multiple checkpoint branches"); } id = next_ids[0]; - chkpts_cache_.erase(it); } else { - id = checkpoint_type::UNIDENTIFIED_CHECKPOINT; break; } } - - if (id != checkpoint_type::UNIDENTIFIED_CHECKPOINT) { - chkpt_query_->deleteCheckpoint(id); - } } void DatabaseCheckpointer::dumpCheckpointNode_(const chkpt_id_t id, std::ostream& o) const { static std::string SNAPSHOT_NOTICE = "(s)"; - auto cp = cloneCheckpoint(id); + + std::lock_guard lock(cache_mutex_); + + checkpoint_ptr chkpt_ptr; + if (auto it = chkpts_cache_.find(id); it != chkpts_cache_.end()) { + chkpt_ptr = it->second; + } else { + chkpt_ptr = chkpt_query_->findCheckpoint(id); + } + + auto cp = chkpt_ptr.get(); // Draw data for this checkpoint if (cp->isFlaggedDeleted()) { 
@@ -736,17 +855,19 @@ void DatabaseCheckpointer::dumpCheckpointNode_(const chkpt_id_t id, std::ostream std::vector DatabaseCheckpointer::getNextIDs_(chkpt_id_t id) const { - return cloneCheckpoint(id)->getNextIDs(); + return getNextIDs(id); } void DatabaseCheckpointer::setHead_(CheckpointBase* head) { + std::lock_guard lock(cache_mutex_); setHeadID_(head->getID()); Checkpointer::setHead_(head); } void DatabaseCheckpointer::setCurrent_(CheckpointBase* current) { + std::lock_guard lock(cache_mutex_); setCurrentID_(current->getID()); Checkpointer::setCurrent_(current); } @@ -755,7 +876,7 @@ void DatabaseCheckpointer::setHeadID_(chkpt_id_t id) { std::lock_guard lock(cache_mutex_); sparta_assert(id != checkpoint_type::UNIDENTIFIED_CHECKPOINT); - sparta_assert(head_id_ == checkpoint_type::UNIDENTIFIED_CHECKPOINT); + sparta_assert(head_id_ == checkpoint_type::UNIDENTIFIED_CHECKPOINT || head_id_ == id); head_id_ = id; } @@ -764,16 +885,38 @@ void DatabaseCheckpointer::setCurrentID_(chkpt_id_t id) std::lock_guard lock(cache_mutex_); sparta_assert(id != checkpoint_type::UNIDENTIFIED_CHECKPOINT); current_id_ = id; + + // If we are moving current_, see if we can evict any pending IDs + while (!pending_eviction_ids_.empty()) { + auto id = pending_eviction_ids_.front(); + pending_eviction_ids_.pop(); + if (id == current_id_) { + pending_eviction_ids_.push(id); + } else { + //TODO cnyce: chkpts_cache_.erase(id); + } + } } void DatabaseCheckpointer::addToCache_(std::shared_ptr chkpt) { std::lock_guard lock(cache_mutex_); + auto id = chkpt->getID(); - auto& cp = chkpts_cache_[id]; - sparta_assert(!cp); - cp = std::move(chkpt); - chkpts_queue_.emplace_back(cp); + chkpts_cache_[id] = chkpt; + + if (!chkpt_windows_.empty() && !chkpt_windows_.back().empty() && chkpt_windows_.back().back() == id) { + return; + } + + if (chkpt->isSnapshot()) { + chkpt_windows_.emplace_back(); + } + + auto& window = chkpt_windows_.back(); + if (window.empty() || window.back() != id) { + 
window.push_back(id); + } } REGISTER_SIMDB_APPLICATION(DatabaseCheckpointer); diff --git a/sparta/test/FastCheckpoint/CMakeLists.txt b/sparta/test/FastCheckpoint/CMakeLists.txt index 6e3e8c6f72..5a94bdd494 100644 --- a/sparta/test/FastCheckpoint/CMakeLists.txt +++ b/sparta/test/FastCheckpoint/CMakeLists.txt @@ -6,4 +6,4 @@ sparta_test(FastCheckpoint_test FastCheckpoint_test_RUN) add_subdirectory(FILEStream) add_subdirectory(PersistentFastCheckpoint) -#add_subdirectory(DatabaseCheckpoint) +add_subdirectory(DatabaseCheckpoint) diff --git a/sparta/test/FastCheckpoint/DatabaseCheckpoint/DatabaseCheckpoint_test.cpp b/sparta/test/FastCheckpoint/DatabaseCheckpoint/DatabaseCheckpoint_test.cpp index ba647353d9..8c8a3dc002 100644 --- a/sparta/test/FastCheckpoint/DatabaseCheckpoint/DatabaseCheckpoint_test.cpp +++ b/sparta/test/FastCheckpoint/DatabaseCheckpoint/DatabaseCheckpoint_test.cpp @@ -38,6 +38,7 @@ using sparta::RootTreeNode; using sparta::memory::MemoryObject; using sparta::memory::BlockingMemoryObjectIFNode; using sparta::serialization::checkpoint::DatabaseCheckpointer; +using sparta::serialization::checkpoint::DatabaseCheckpoint; static const uint16_t HINT_NONE=0; @@ -92,68 +93,334 @@ void generalTest() app_mgr.getAppFactory()->setSpartaElems(root, &sched); app_mgr.enableApp(DatabaseCheckpointer::NAME); app_mgr.createEnabledApps(); + app_mgr.createSchemas(); app_mgr.postInit(0, nullptr); app_mgr.openPipelines(); auto& dbcp = *app_mgr.getApp(); - dbcp.setSnapshotThreshold(100); + dbcp.setSnapshotThreshold(9); root.enterConfiguring(); root.enterFinalized(); sched.finalize(); - EXPECT_EQUAL(sched.getCurrentTick(), 0); // Unfinalized sched at tick 0 + EXPECT_EQUAL(sched.getCurrentTick(), 0); - // CHECKPOINT: HEAD + // CHECKPOINT: Head DatabaseCheckpointer::chkpt_id_t head_id; EXPECT_NOTHROW(dbcp.createHead()); head_id = dbcp.getHeadID(); - EXPECT_EQUAL(head_id, 0); + EXPECT_NOTEQUAL(dbcp.getHead(), nullptr); + EXPECT_EQUAL(head_id, dbcp.getHead()->getID()); + 
EXPECT_EQUAL(dbcp.getCurrentID(), head_id); + EXPECT_EQUAL(dbcp.getCurrentTick(), 0); - // Checkpoints 1 through 10000. Save a few of the register values - // with their checkpoint IDs so we can verify the correct registers - // after rolling back to previous checkpoints. - std::vector chkpt_ids; - - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_int_distribution<> distrib(1, 10); - - for (uint32_t i = 1; i <= 10000; ++i) { + auto step_checkpointer = [&](uint32_t i) { r1->write(i * 5ul); r2->write(i % 5ul); sched.run(1, true, false); - EXPECT_EQUAL(i, sched.getCurrentTick()); - EXPECT_EQUAL(i, dbcp.getCurrentTick()); + //EXPECT_EQUAL(i, sched.getCurrentTick()); + //EXPECT_EQUAL(i, dbcp.getCurrentTick()); DatabaseCheckpointer::chkpt_id_t id; EXPECT_NOTHROW(id = dbcp.createCheckpoint()); EXPECT_EQUAL(id, i); + EXPECT_EQUAL(id, dbcp.getCurrentID()); + return id; + }; + + // Create 1000 checkpoints, and periodically access an old one. Also + // go to sleep sometimes to increase the chances we have to go to the + // database to retrieve a checkpoint. + for (uint32_t i = 1; i <= 100; ++i) { + step_checkpointer(i); + + // Access most recent from the cache directly + auto cached_cp = dbcp.findCheckpoint(i).lock(); + EXPECT_NOTEQUAL(cached_cp, nullptr); + if (cached_cp) { + EXPECT_EQUAL(cached_cp->getID(), i); + EXPECT_EQUAL(cached_cp->getPrevID(), i - 1); + } - if (distrib(gen) == 5) { - chkpt_ids.push_back(id); + // Access an old one, which may or may not be in the cache + if (rand() % 10 == 0) { + std::this_thread::sleep_for(std::chrono::milliseconds(rand() % 50)); + auto old_id = static_cast(rand() % i); + auto old_cp = dbcp.cloneCheckpoint(old_id); + EXPECT_NOTEQUAL(old_cp, nullptr); + if (old_cp) { + EXPECT_EQUAL(old_cp->getID(), old_id); + EXPECT_EQUAL(old_cp->getPrevID(), old_id - 1); + } } } - // Shuffle up the checkpoint IDs and wait a bit before we start - // loading checkpoints and verifying the registers. 
- std::shuffle(chkpt_ids.begin(), chkpt_ids.end(), gen); - std::this_thread::sleep_for(std::chrono::milliseconds(50)); - - for (auto id : chkpt_ids) { + auto verif_load_chkpt = [&](DatabaseCheckpointer::chkpt_id_t id) { EXPECT_NOTHROW(dbcp.loadCheckpoint(id)); + EXPECT_EQUAL(dbcp.getCurrentID(), id); + EXPECT_EQUAL(dbcp.getNumCheckpoints(), id + 1); + EXPECT_EQUAL(sched.getCurrentTick(), id); + // TODO cnyce: verify registers + }; + + // Load very recent checkpoints that are definitely in the cache + verif_load_chkpt(100); + verif_load_chkpt(99); + verif_load_chkpt(95); + verif_load_chkpt(90); + verif_load_chkpt(89); + + // Load checkpoints that have probably already been evicted from the cache + std::this_thread::sleep_for(std::chrono::seconds(1)); + verif_load_chkpt(49); + verif_load_chkpt(45); + verif_load_chkpt(40); + verif_load_chkpt(39); + + step_checkpointer(40); + step_checkpointer(41); + step_checkpointer(42); + verif_load_chkpt(40); + + // Go back to checkpoint 1 + verif_load_chkpt(1); + + // Take 3 more checkpoints with IDs 2, 3, and 4 + step_checkpointer(2); + step_checkpointer(3); + step_checkpointer(4); + + // Go back to head + verif_load_chkpt(head_id); + + // Take some checkpoints and ensure that the current ID is always increasing by 1 with no gaps + step_checkpointer(1); + step_checkpointer(2); + step_checkpointer(3); + verif_load_chkpt(2); + verif_load_chkpt(1); + verif_load_chkpt(head_id); + + // Ensure exception is thrown when loading a non-existent checkpoint + EXPECT_THROW(dbcp.loadCheckpoint(9999)); + EXPECT_THROW(dbcp.cloneCheckpoint(9999)); + EXPECT_NOTHROW(dbcp.cloneCheckpoint(9999, false)); + + // Create checkpoints 1-50. Keep a clone of checkpoint 3 for later. 
+ std::unique_ptr clone3; + for (uint32_t i = 1; i <= 50; ++i) { + step_checkpointer(i); + if (i == 3) { + clone3 = dbcp.findCheckpoint(3).lock()->clone(); + } + } + + // Verify checkpoint chain: 0-50 + auto chain = dbcp.getCheckpointChain(dbcp.getCurrentID()); + EXPECT_EQUAL(chain.size(), 51); + uint32_t chain_idx = 0; + for (uint32_t i = 0; i <= 50; ++i) { + EXPECT_EQUAL(chain[chain_idx++], 50-i); + } + + // Sleep for a bit to flush the pipeline to ensure the checkpoint chain + // can be retrieved from the database. + std::this_thread::sleep_for(std::chrono::seconds(1)); + chain = dbcp.getCheckpointChain(dbcp.getCurrentID()); + EXPECT_EQUAL(chain.size(), 51); + chain_idx = 0; + for (uint32_t i = 0; i <= 50; ++i) { + EXPECT_EQUAL(chain[chain_idx++], 50-i); + } + + // Load checkpoint 45 + verif_load_chkpt(45); - auto chkpt = dbcp.cloneCheckpoint(id); - uint32_t expected_r1 = id * 5ul; - EXPECT_EQUAL(r1->read(), expected_r1); + // Verify that checkpoints 46+ have been implicitly deleted + // TODO cnyce: EXPECT_FALSE(dbcp.hasCheckpoint(46)); - uint32_t expected_r2 = id % 5ul; - EXPECT_EQUAL(r2->read(), expected_r2); + // Create checkpoints 46-55 + for (uint32_t i = 46; i <= 55; ++i) { + step_checkpointer(i); + } + + // Verify checkpoint chain: 0-55 + chain = dbcp.getCheckpointChain(dbcp.getCurrentID()); + EXPECT_EQUAL(chain.size(), 56); + chain_idx = 0; + for (uint32_t i = 0; i <= 55; ++i) { + EXPECT_EQUAL(chain[chain_idx++], 55-i); + } + + // Create checkpoints 56-58 + for (uint32_t i = 56; i <= 58; ++i) { + step_checkpointer(i); + } + + // Delete checkpoint always throws + // TODO cnyce: relax this restriction? 
+ EXPECT_THROW(dbcp.deleteCheckpoint(57)); - auto expected_tick = dbcp.getCurrentTick(); - EXPECT_EQUAL(sched.getCurrentTick(), expected_tick); - EXPECT_EQUAL(chkpt->getTick(), expected_tick); + // Create checkpoints 59-70 + for (uint32_t i = 59; i <= 70; ++i) { + step_checkpointer(i); } + // Load checkpoint 58 + verif_load_chkpt(58); + + // Finish + app_mgr.postSimLoopTeardown(); + root.enterTeardown(); + clocks.enterTeardown(); + return; + + + + // Verify all checkpoints: 0-58 + auto all_chkpts = dbcp.getCheckpoints(); + EXPECT_EQUAL(all_chkpts.size(), 59); + EXPECT_EQUAL(dbcp.getNumCheckpoints(), 59); + uint32_t idx = 0; + for (uint32_t i = 0; i <= 58; ++i) { + EXPECT_EQUAL(all_chkpts[idx++], i); + } + EXPECT_EQUAL(idx, all_chkpts.size()); + + // Create checkpoints 59-75 + for (uint32_t i = 59; i <= 75; ++i) { + step_checkpointer(i); + } + + // Verify all checkpoints: 0-75 + all_chkpts = dbcp.getCheckpoints(); + EXPECT_EQUAL(all_chkpts.size(), 76); + EXPECT_EQUAL(dbcp.getNumCheckpoints(), 76); + idx = 0; + for (uint32_t i = 0; i <= 75; ++i) { + EXPECT_EQUAL(all_chkpts[idx++], i); + } + EXPECT_EQUAL(idx, all_chkpts.size()); + + uint32_t all_idx = 0; + for (uint32_t i = 0; i <= 45; ++i) { + EXPECT_EQUAL(all_chkpts[all_idx++], i); + } + for (uint32_t i = 51; i <= 56; ++i) { + EXPECT_EQUAL(all_chkpts[all_idx++], i); + } + EXPECT_EQUAL(all_chkpts[all_idx++], 58); + for (uint32_t i = 71; i <= 75; ++i) { + EXPECT_EQUAL(all_chkpts[all_idx++], i); + } + EXPECT_EQUAL(all_idx, all_chkpts.size()); + all_idx = 0; + + // Nothing to test, just call dumpRestoreChain() + dbcp.dumpRestoreChain(std::cout, 73); + + // Verify history chain up to current checkpoint + auto history_chain = dbcp.getHistoryChain(dbcp.getCurrentID()); + while (!history_chain.empty()) { + EXPECT_EQUAL(history_chain.top(), all_chkpts[all_idx++]); + history_chain.pop(); + } + all_idx = 0; + + // Verify restore chain up to current checkpoint + auto restore_chain = dbcp.getRestoreChain(dbcp.getCurrentID()); + 
auto id = restore_chain.top(); + restore_chain.pop(); + std::weak_ptr chkpt; + EXPECT_NOTHROW(chkpt = dbcp.findCheckpoint(id)); + auto c = chkpt.lock(); + EXPECT_NOTEQUAL(c, nullptr); + EXPECT_TRUE(c->isSnapshot()); + + while (!restore_chain.empty()) { + id = restore_chain.top(); + restore_chain.pop(); + EXPECT_NOTHROW(chkpt = dbcp.findCheckpoint(id)); + c = chkpt.lock(); + EXPECT_NOTEQUAL(c, nullptr); + EXPECT_FALSE(c->isSnapshot()); + } + + // Verify that cached checkpoints are clonable + auto cache73 = dbcp.findCheckpoint(73).lock(); + auto clone73 = dbcp.cloneCheckpoint(73); + + std::ostringstream cache_oss; + std::ostringstream clone_oss; + + cache73->dumpData(cache_oss); + clone73->dumpData(clone_oss); + + EXPECT_EQUAL(cache_oss.str(), clone_oss.str()); + EXPECT_EQUAL(cache73->getTotalMemoryUse(), clone73->getTotalMemoryUse()); + EXPECT_EQUAL(cache73->getContentMemoryUse(), clone73->getContentMemoryUse()); + EXPECT_TRUE(cache73->getHistoryChain() == clone73->getHistoryChain()); + EXPECT_TRUE(cache73->getRestoreChain() == clone73->getRestoreChain()); + EXPECT_EQUAL(cache73->getPrevID(), clone73->getPrevID()); + EXPECT_EQUAL(cache73->getNextIDs(), clone73->getNextIDs()); + EXPECT_EQUAL(cache73->getTick(), clone73->getTick()); + EXPECT_EQUAL(cache73->isSnapshot(), clone73->isSnapshot()); + EXPECT_EQUAL(cache73->getDistanceToPrevSnapshot(), clone73->getDistanceToPrevSnapshot()); + + // Wait until checkpoint 3 is evicted from cache + uint32_t num_tries = 0; + while (dbcp.findCheckpoint(3).lock() != nullptr) { + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + EXPECT_NOTEQUAL(++num_tries, 100); // 1-second timeout + } + + // Ask the checkpointer to retrieve checkpoint 3 from the database + auto dbchkpt3 = dbcp.cloneCheckpoint(3); + EXPECT_EQUAL(dbchkpt3->getID(), clone3->getID()); + + // Verify that the database checkpoint matches the original clone of 3 + std::ostringstream clone3_oss, dbchkpt3_oss; + clone3->dumpData(clone3_oss); + 
dbchkpt3->dumpData(dbchkpt3_oss); + EXPECT_EQUAL(clone3_oss.str(), dbchkpt3_oss.str()); + + // Verify history chain for a db-recreated checkpoint + auto hist_chain3 = dbcp.getHistoryChain(3); + for (auto hist_id : {3,2,1,0}) { + EXPECT_FALSE(hist_chain3.empty()); + EXPECT_EQUAL(hist_chain3.top(), hist_id); + hist_chain3.pop(); + } + + // Verify restore chain for a db-recreated checkpoint + auto rest_chain3 = dbcp.getRestoreChain(3); + for (auto rest_id : {3,2,1,0}) { + EXPECT_FALSE(rest_chain3.empty()); + EXPECT_EQUAL(rest_chain3.top(), rest_id); + rest_chain3.pop(); + } + + // Verify distance to previous snapshot for a db-recreated checkpoint + EXPECT_EQUAL(dbchkpt3->getDistanceToPrevSnapshot(), 3); + + // Nothing to test, just call dumpRestoreChain() + dbcp.dumpRestoreChain(std::cout, 3); + + // Nothing to test, just call dumpList/dumpData/dumpAnnotatedData + dbcp.dumpList(std::cout); + std::cout << std::endl; + dbcp.dumpData(std::cout); + std::cout << std::endl; + dbcp.dumpAnnotatedData(std::cout); + std::cout << std::endl; + + // Load checkpoint 8 and verify registers + EXPECT_NOTHROW(dbcp.loadCheckpoint(8)); + EXPECT_EQUAL(r1->read(), 40ul); // 8 * 5 + EXPECT_EQUAL(r2->read(), 3ul); // 8 % 5 + EXPECT_EQUAL(sched.getCurrentTick(), 8); + EXPECT_EQUAL(dbcp.getNumCheckpoints(), 9); + // Finish... 
app_mgr.postSimLoopTeardown(); } From 57dccf48f6d637dd9ee3a35a9d7ead942f2ac022 Mon Sep 17 00:00:00 2001 From: Colby Nyce Date: Wed, 17 Sep 2025 08:07:33 -0500 Subject: [PATCH 10/30] Last set of changes before redesign --- .../checkpoint/DatabaseCheckpoint.hpp | 16 ++ sparta/src/DatabaseCheckpoint.cpp | 14 +- sparta/src/DatabaseCheckpointer.cpp | 41 ++--- .../DatabaseCheckpoint_test.cpp | 147 ++++++++---------- 4 files changed, 115 insertions(+), 103 deletions(-) diff --git a/sparta/sparta/serialization/checkpoint/DatabaseCheckpoint.hpp b/sparta/sparta/serialization/checkpoint/DatabaseCheckpoint.hpp index 3ae07eed43..b6a232ecd3 100644 --- a/sparta/sparta/serialization/checkpoint/DatabaseCheckpoint.hpp +++ b/sparta/sparta/serialization/checkpoint/DatabaseCheckpoint.hpp @@ -75,6 +75,7 @@ namespace sparta::serialization::checkpoint chkpt_id_t prev_id, const std::vector& next_ids, chkpt_id_t deleted_id, + bool decached, bool is_snapshot, const storage::VectorStorage& storage, DatabaseCheckpointer* checkpointer); @@ -203,6 +204,18 @@ namespace sparta::serialization::checkpoint */ std::string getDeletedRepr() const override; + /*! + * \brief Mark this checkpoint as no longer in the cache. It will still + * live in the cache until the checkpointer has a chance to evict it. + */ + void flagDecached(); + + /*! + * \brief Should this checkpoint be considered ready for eviction from + * the cache? + */ + bool isFlaggedDecached() const noexcept; + /*! * \brief Is this checkpoint a snapshot (contains ALL simulator state) */ @@ -267,6 +280,9 @@ namespace sparta::serialization::checkpoint */ chkpt_id_t deleted_id_; + //! \brief Has this checkpoint been flagged as ready to be decached? + bool decached_ = false; + //! \brief Is this node a snapshot? 
bool is_snapshot_; diff --git a/sparta/src/DatabaseCheckpoint.cpp b/sparta/src/DatabaseCheckpoint.cpp index 014f4adfd0..e02f10322c 100644 --- a/sparta/src/DatabaseCheckpoint.cpp +++ b/sparta/src/DatabaseCheckpoint.cpp @@ -52,6 +52,7 @@ DatabaseCheckpoint::DatabaseCheckpoint(chkpt_id_t id, chkpt_id_t prev_id, const std::vector& next_ids, chkpt_id_t deleted_id, + bool decached, bool is_snapshot, const storage::VectorStorage& storage, DatabaseCheckpointer* checkpointer) @@ -59,6 +60,7 @@ DatabaseCheckpoint::DatabaseCheckpoint(chkpt_id_t id, , prev_id_(prev_id) , next_ids_(next_ids) , deleted_id_(deleted_id) + , decached_(decached) , is_snapshot_(is_snapshot) , data_(storage) , checkpointer_(checkpointer) @@ -167,6 +169,16 @@ std::string DatabaseCheckpoint::getDeletedRepr() const return ss.str(); } +void DatabaseCheckpoint::flagDecached() +{ + decached_ = true; +} + +bool DatabaseCheckpoint::isFlaggedDecached() const noexcept +{ + return decached_; +} + bool DatabaseCheckpoint::isSnapshot() const noexcept { return is_snapshot_; @@ -195,7 +207,7 @@ void DatabaseCheckpoint::loadState(const std::vector& dats) std::unique_ptr DatabaseCheckpoint::clone() const { - auto clone = new DatabaseCheckpoint(getID(), getTick(), prev_id_, next_ids_, deleted_id_, is_snapshot_, data_, checkpointer_); + auto clone = new DatabaseCheckpoint(getID(), getTick(), prev_id_, next_ids_, deleted_id_, decached_, is_snapshot_, data_, checkpointer_); return std::unique_ptr(clone); } diff --git a/sparta/src/DatabaseCheckpointer.cpp b/sparta/src/DatabaseCheckpointer.cpp index bfc377b8f3..97b1e7fdde 100644 --- a/sparta/src/DatabaseCheckpointer.cpp +++ b/sparta/src/DatabaseCheckpointer.cpp @@ -205,8 +205,6 @@ std::unique_ptr DatabaseCheckpointer::createPipeline( auto evict_from_cache = simdb::pipeline::createTask>( [this](EvictedChkptIDs&& evicted_ids, bool simulation_terminating) mutable { - return;//TODO cnyce - // TODO cnyce: We are allocating and deallocating a LOT of checkpoints. 
// See if we can reuse a pool of them. Could also try to just add a pool // to the VectorStorage::Segment class. @@ -214,15 +212,16 @@ std::unique_ptr DatabaseCheckpointer::createPipeline( for (auto id : evicted_ids) { if (id == head_id_) { + // Never evict the head checkpoint continue; } - if (id == current_id_) { - pending_eviction_ids_.push(id); - continue; + if (auto it = chkpts_cache_.find(id); it != chkpts_cache_.end()) { + it->second->flagDecached(); + if (findCheckpoint(id).lock() != nullptr) { + throw CheckpointError("Internal error - checkpoint should be marked as decached"); + } } - - chkpts_cache_.erase(id); } } ); @@ -271,10 +270,9 @@ uint64_t DatabaseCheckpointer::getContentMemoryUse() const noexcept return mem; } -void DatabaseCheckpointer::deleteCheckpoint(chkpt_id_t) +void DatabaseCheckpointer::deleteCheckpoint(chkpt_id_t) { - // TODO cnyce - throw CheckpointError("deleteCheckpoint() not supported"); + throw CheckpointError("Explicit checkpoint deletion is not supported by DatabaseCheckpointer"); } void DatabaseCheckpointer::loadCheckpoint(chkpt_id_t id) @@ -291,7 +289,6 @@ void DatabaseCheckpointer::loadCheckpoint(chkpt_id_t id) } chkpt->load(getArchDatas()); - //TODO cnyce: chkpts_cache_.erase(id); // Delete all future checkpoints past this one. Do this from the cache // as well as from the database. 
@@ -356,9 +353,10 @@ std::vector DatabaseCheckpointer::getCheckpoints() const } } - for (auto id : chkpt_query_->getCheckpoints()) { - results.insert(id); - } + //TODO cnyce: Put this back when the cache is actually purged + //for (auto id : chkpt_query_->getCheckpoints()) { + // results.insert(id); + //} std::vector chkpts(results.begin(), results.end()); std::sort(chkpts.begin(), chkpts.end()); @@ -434,6 +432,9 @@ std::weak_ptr DatabaseCheckpointer::findCheckpoint(chkpt_id_ std::lock_guard lock(cache_mutex_); if (auto it = chkpts_cache_.find(id); it != chkpts_cache_.end()) { + if (it->second->isFlaggedDeleted() || it->second->isFlaggedDecached()) { + return std::weak_ptr(); + } return it->second; } @@ -444,7 +445,7 @@ std::shared_ptr DatabaseCheckpointer::cloneCheckpoint(chkpt_ { std::lock_guard lock(cache_mutex_); - if (auto it = chkpts_cache_.find(id); it != chkpts_cache_.end()) { + if (auto it = chkpts_cache_.find(id); it != chkpts_cache_.end()) {//TODO cnyce: && !it->second->isFlaggedDecached()) { return it->second->clone(); } @@ -463,8 +464,8 @@ bool DatabaseCheckpointer::hasCheckpoint(chkpt_id_t id) const noexcept { std::lock_guard lock(cache_mutex_); - if (chkpts_cache_.find(id) != chkpts_cache_.end()) { - return true; + if (auto it = chkpts_cache_.find(id); it != chkpts_cache_.end()) { + return !it->second->isFlaggedDeleted(); } return chkpt_query_->hasCheckpoint(id); @@ -813,7 +814,7 @@ void DatabaseCheckpointer::deleteCheckpoint_(chkpt_id_t id) } auto next_ids = it->second->getNextIDs(); - //TODO cnyce: chkpts_cache_.erase(it); + it->second->flagDeleted(); if (!next_ids.empty()) { if (next_ids.size() != 1) { @@ -892,8 +893,8 @@ void DatabaseCheckpointer::setCurrentID_(chkpt_id_t id) pending_eviction_ids_.pop(); if (id == current_id_) { pending_eviction_ids_.push(id); - } else { - //TODO cnyce: chkpts_cache_.erase(id); + } else if (auto it = chkpts_cache_.find(id); it != chkpts_cache_.end()) { + it->second->flagDecached(); } } } diff --git 
a/sparta/test/FastCheckpoint/DatabaseCheckpoint/DatabaseCheckpoint_test.cpp b/sparta/test/FastCheckpoint/DatabaseCheckpoint/DatabaseCheckpoint_test.cpp index 8c8a3dc002..3f242c9311 100644 --- a/sparta/test/FastCheckpoint/DatabaseCheckpoint/DatabaseCheckpoint_test.cpp +++ b/sparta/test/FastCheckpoint/DatabaseCheckpoint/DatabaseCheckpoint_test.cpp @@ -85,6 +85,8 @@ void generalTest() auto r1 = rset->getRegister("reg2"); auto r2 = rset2->getRegister("reg2"); assert(r1 != r2); + r1->write(0 * 5ul); + r2->write(0 % 5ul); simdb::DatabaseManager db_mgr("test.db", true); simdb::AppManager app_mgr(&db_mgr); @@ -130,7 +132,9 @@ void generalTest() // Create 1000 checkpoints, and periodically access an old one. Also // go to sleep sometimes to increase the chances we have to go to the - // database to retrieve a checkpoint. + // database to retrieve a checkpoint. Keep a clone of checkpoint 3 for + // later verification. + std::shared_ptr clone3; for (uint32_t i = 1; i <= 100; ++i) { step_checkpointer(i); @@ -140,6 +144,9 @@ void generalTest() if (cached_cp) { EXPECT_EQUAL(cached_cp->getID(), i); EXPECT_EQUAL(cached_cp->getPrevID(), i - 1); + if (i == 3) { + clone3 = dbcp.cloneCheckpoint(3); + } } // Access an old one, which may or may not be in the cache @@ -159,8 +166,13 @@ void generalTest() EXPECT_NOTHROW(dbcp.loadCheckpoint(id)); EXPECT_EQUAL(dbcp.getCurrentID(), id); EXPECT_EQUAL(dbcp.getNumCheckpoints(), id + 1); + EXPECT_FALSE(dbcp.hasCheckpoint(id + 1)); EXPECT_EQUAL(sched.getCurrentTick(), id); - // TODO cnyce: verify registers + + auto r1_val = r1->read(); + auto r2_val = r2->read(); + EXPECT_EQUAL(r1_val, id * 5ul); + EXPECT_EQUAL(r2_val, id % 5ul); }; // Load very recent checkpoints that are definitely in the cache @@ -182,6 +194,48 @@ void generalTest() step_checkpointer(42); verif_load_chkpt(40); + // Wait until checkpoint 3 is evicted from cache with a 3-second timeout + uint32_t num_tries = 0; + while (dbcp.findCheckpoint(3).lock() != nullptr) { + 
std::this_thread::sleep_for(std::chrono::milliseconds(10)); + EXPECT_NOTEQUAL(++num_tries, 300); + } + +// TODO cnyce +#if 0 + // Ask the checkpointer to retrieve checkpoint 3 from the database + auto dbchkpt3 = dbcp.cloneCheckpoint(3); + EXPECT_EQUAL(dbchkpt3->getID(), clone3->getID()); + + // Verify that the database checkpoint matches the original clone of 3 + std::ostringstream clone3_oss, dbchkpt3_oss; + clone3->dumpData(clone3_oss); + dbchkpt3->dumpData(dbchkpt3_oss); + EXPECT_EQUAL(clone3_oss.str(), dbchkpt3_oss.str()); + + // Verify history chain for a db-recreated checkpoint + auto hist_chain3 = dbcp.getHistoryChain(3); + for (auto hist_id : {3,2,1,0}) { + EXPECT_FALSE(hist_chain3.empty()); + EXPECT_EQUAL(hist_chain3.top(), hist_id); + hist_chain3.pop(); + } + + // Verify restore chain for a db-recreated checkpoint + auto rest_chain3 = dbcp.getRestoreChain(3); + for (auto rest_id : {3,2,1,0}) { + EXPECT_FALSE(rest_chain3.empty()); + EXPECT_EQUAL(rest_chain3.top(), rest_id); + rest_chain3.pop(); + } + + // Verify distance to previous snapshot for a db-recreated checkpoint + EXPECT_EQUAL(dbchkpt3->getDistanceToPrevSnapshot(), 3); + + // Nothing to test, just call dumpRestoreChain() + dbcp.dumpRestoreChain(std::cout, 3); +#endif + // Go back to checkpoint 1 verif_load_chkpt(1); @@ -206,13 +260,9 @@ void generalTest() EXPECT_THROW(dbcp.cloneCheckpoint(9999)); EXPECT_NOTHROW(dbcp.cloneCheckpoint(9999, false)); - // Create checkpoints 1-50. Keep a clone of checkpoint 3 for later. - std::unique_ptr clone3; + // Create checkpoints 1-50. 
for (uint32_t i = 1; i <= 50; ++i) { step_checkpointer(i); - if (i == 3) { - clone3 = dbcp.findCheckpoint(3).lock()->clone(); - } } // Verify checkpoint chain: 0-50 @@ -237,7 +287,7 @@ void generalTest() verif_load_chkpt(45); // Verify that checkpoints 46+ have been implicitly deleted - // TODO cnyce: EXPECT_FALSE(dbcp.hasCheckpoint(46)); + EXPECT_FALSE(dbcp.hasCheckpoint(46)); // Create checkpoints 46-55 for (uint32_t i = 46; i <= 55; ++i) { @@ -258,7 +308,6 @@ void generalTest() } // Delete checkpoint always throws - // TODO cnyce: relax this restriction? EXPECT_THROW(dbcp.deleteCheckpoint(57)); // Create checkpoints 59-70 @@ -269,14 +318,6 @@ void generalTest() // Load checkpoint 58 verif_load_chkpt(58); - // Finish - app_mgr.postSimLoopTeardown(); - root.enterTeardown(); - clocks.enterTeardown(); - return; - - - // Verify all checkpoints: 0-58 auto all_chkpts = dbcp.getCheckpoints(); EXPECT_EQUAL(all_chkpts.size(), 59); @@ -302,30 +343,16 @@ void generalTest() } EXPECT_EQUAL(idx, all_chkpts.size()); - uint32_t all_idx = 0; - for (uint32_t i = 0; i <= 45; ++i) { - EXPECT_EQUAL(all_chkpts[all_idx++], i); - } - for (uint32_t i = 51; i <= 56; ++i) { - EXPECT_EQUAL(all_chkpts[all_idx++], i); - } - EXPECT_EQUAL(all_chkpts[all_idx++], 58); - for (uint32_t i = 71; i <= 75; ++i) { - EXPECT_EQUAL(all_chkpts[all_idx++], i); - } - EXPECT_EQUAL(all_idx, all_chkpts.size()); - all_idx = 0; - // Nothing to test, just call dumpRestoreChain() - dbcp.dumpRestoreChain(std::cout, 73); + EXPECT_NOTHROW(dbcp.dumpRestoreChain(std::cout, 73)); // Verify history chain up to current checkpoint + size_t all_idx = 0; auto history_chain = dbcp.getHistoryChain(dbcp.getCurrentID()); while (!history_chain.empty()) { EXPECT_EQUAL(history_chain.top(), all_chkpts[all_idx++]); history_chain.pop(); } - all_idx = 0; // Verify restore chain up to current checkpoint auto restore_chain = dbcp.getRestoreChain(dbcp.getCurrentID()); @@ -346,7 +373,7 @@ void generalTest() EXPECT_FALSE(c->isSnapshot()); } - 
// Verify that cached checkpoints are clonable + // Verify that checkpoint clones are as expected auto cache73 = dbcp.findCheckpoint(73).lock(); auto clone73 = dbcp.cloneCheckpoint(73); @@ -367,44 +394,10 @@ void generalTest() EXPECT_EQUAL(cache73->isSnapshot(), clone73->isSnapshot()); EXPECT_EQUAL(cache73->getDistanceToPrevSnapshot(), clone73->getDistanceToPrevSnapshot()); - // Wait until checkpoint 3 is evicted from cache - uint32_t num_tries = 0; - while (dbcp.findCheckpoint(3).lock() != nullptr) { - std::this_thread::sleep_for(std::chrono::milliseconds(10)); - EXPECT_NOTEQUAL(++num_tries, 100); // 1-second timeout - } - - // Ask the checkpointer to retrieve checkpoint 3 from the database - auto dbchkpt3 = dbcp.cloneCheckpoint(3); - EXPECT_EQUAL(dbchkpt3->getID(), clone3->getID()); - - // Verify that the database checkpoint matches the original clone of 3 - std::ostringstream clone3_oss, dbchkpt3_oss; - clone3->dumpData(clone3_oss); - dbchkpt3->dumpData(dbchkpt3_oss); - EXPECT_EQUAL(clone3_oss.str(), dbchkpt3_oss.str()); - - // Verify history chain for a db-recreated checkpoint - auto hist_chain3 = dbcp.getHistoryChain(3); - for (auto hist_id : {3,2,1,0}) { - EXPECT_FALSE(hist_chain3.empty()); - EXPECT_EQUAL(hist_chain3.top(), hist_id); - hist_chain3.pop(); - } - - // Verify restore chain for a db-recreated checkpoint - auto rest_chain3 = dbcp.getRestoreChain(3); - for (auto rest_id : {3,2,1,0}) { - EXPECT_FALSE(rest_chain3.empty()); - EXPECT_EQUAL(rest_chain3.top(), rest_id); - rest_chain3.pop(); - } - - // Verify distance to previous snapshot for a db-recreated checkpoint - EXPECT_EQUAL(dbchkpt3->getDistanceToPrevSnapshot(), 3); - - // Nothing to test, just call dumpRestoreChain() - dbcp.dumpRestoreChain(std::cout, 3); + // Finish + app_mgr.postSimLoopTeardown(); + root.enterTeardown(); + clocks.enterTeardown(); // Nothing to test, just call dumpList/dumpData/dumpAnnotatedData dbcp.dumpList(std::cout); @@ -413,16 +406,6 @@ void generalTest() std::cout << 
std::endl; dbcp.dumpAnnotatedData(std::cout); std::cout << std::endl; - - // Load checkpoint 8 and verify registers - EXPECT_NOTHROW(dbcp.loadCheckpoint(8)); - EXPECT_EQUAL(r1->read(), 40ul); // 8 * 5 - EXPECT_EQUAL(r2->read(), 3ul); // 8 % 5 - EXPECT_EQUAL(sched.getCurrentTick(), 8); - EXPECT_EQUAL(dbcp.getNumCheckpoints(), 9); - - // Finish... - app_mgr.postSimLoopTeardown(); } int main() From b0b9d0b3dd3ca52d83d5bd211532f9478b9ed01e Mon Sep 17 00:00:00 2001 From: Colby Nyce Date: Mon, 22 Sep 2025 18:47:20 -0500 Subject: [PATCH 11/30] Database-backed checkpointer --- sparta/CMakeLists.txt | 1 - sparta/simdb | 2 +- .../serialization/checkpoint/Checkpointer.hpp | 24 +- .../checkpoint/DatabaseCheckpoint.hpp | 85 +- .../checkpoint/DatabaseCheckpointQuery.hpp | 88 -- .../checkpoint/DatabaseCheckpointer.hpp | 193 ++-- .../checkpoint/FastCheckpointer.hpp | 20 +- sparta/src/DatabaseCheckpoint.cpp | 23 +- sparta/src/DatabaseCheckpointQuery.cpp | 273 ------ sparta/src/DatabaseCheckpointer.cpp | 875 ++++++++---------- .../DatabaseCheckpoint_test.cpp | 138 +-- 11 files changed, 629 insertions(+), 1093 deletions(-) delete mode 100644 sparta/sparta/serialization/checkpoint/DatabaseCheckpointQuery.hpp delete mode 100644 sparta/src/DatabaseCheckpointQuery.cpp diff --git a/sparta/CMakeLists.txt b/sparta/CMakeLists.txt index 211bdb4acd..e604953fc8 100644 --- a/sparta/CMakeLists.txt +++ b/sparta/CMakeLists.txt @@ -42,7 +42,6 @@ list (APPEND SourceCppFiles src/DAG.cpp src/DatabaseCheckpoint.cpp src/DatabaseCheckpointer.cpp - src/DatabaseCheckpointQuery.cpp src/Destination.cpp src/EdgeFactory.cpp src/EventNode.cpp diff --git a/sparta/simdb b/sparta/simdb index 8408761136..f94fd60cc5 160000 --- a/sparta/simdb +++ b/sparta/simdb @@ -1 +1 @@ -Subproject commit 84087611361581ed390501bb0dd2b0e6c9f714ea +Subproject commit f94fd60cc595f0ded277131093c2106f25a41e82 diff --git a/sparta/sparta/serialization/checkpoint/Checkpointer.hpp b/sparta/sparta/serialization/checkpoint/Checkpointer.hpp 
index eaab6a77e7..4a53d0a3e9 100644 --- a/sparta/sparta/serialization/checkpoint/Checkpointer.hpp +++ b/sparta/sparta/serialization/checkpoint/Checkpointer.hpp @@ -311,7 +311,7 @@ namespace sparta::serialization::checkpoint * \note Makes a new vector of results. This should not be called in a * performance-critical path. */ - virtual std::vector getCheckpointsAt(tick_t t) const = 0; + virtual std::vector getCheckpointsAt(tick_t t) = 0; /*! * \brief Gets all known checkpoint IDs available on any timeline sorted @@ -321,7 +321,7 @@ namespace sparta::serialization::checkpoint * \note Makes a new vector of results. This should not be called in a * performance-critical path. */ - virtual std::vector getCheckpoints() const = 0; + virtual std::vector getCheckpoints() = 0; /*! * \brief Gets the current number of checkpoints having valid IDs @@ -330,7 +330,7 @@ namespace sparta::serialization::checkpoint * Ignores any internal temporary or deleted checkpoints without * visible IDs */ - virtual uint32_t getNumCheckpoints() const noexcept = 0; + virtual uint32_t getNumCheckpoints() noexcept = 0; /*! * \brief Debugging utility which gets a deque of checkpoints @@ -347,7 +347,7 @@ namespace sparta::serialization::checkpoint * \note Makes a new vector of results. This should not be called in the * critical path. */ - virtual std::deque getCheckpointChain(chkpt_id_t id) const = 0; + virtual std::deque getCheckpointChain(chkpt_id_t id) = 0; /*! * \brief Tests whether this checkpoint manager has a checkpoint with @@ -356,7 +356,7 @@ namespace sparta::serialization::checkpoint * and false if not. If id == Checkpoint::UNIDENTIFIED_CHECKPOINT, * always returns false */ - virtual bool hasCheckpoint(chkpt_id_t id) const noexcept = 0; + virtual bool hasCheckpoint(chkpt_id_t id) noexcept = 0; /*! 
* \brief Returns the head checkpoint which is equivalent to the @@ -461,14 +461,14 @@ namespace sparta::serialization::checkpoint * ostream with a newline following each checkpoint * \param o ostream to dump to */ - virtual void dumpList(std::ostream& o) const = 0; + virtual void dumpList(std::ostream& o) = 0; /*! * \brief Dumps this checkpointer's data to an ostream with a newline * following each checkpoint * \param o ostream to dump to */ - virtual void dumpData(std::ostream& o) const = 0; + virtual void dumpData(std::ostream& o) = 0; /*! * \brief Dumps this checkpointer's data to an @@ -476,7 +476,7 @@ namespace sparta::serialization::checkpoint * following each checkpoint description and each checkpoint data dump * \param o ostream to dump to */ - virtual void dumpAnnotatedData(std::ostream& o) const = 0; + virtual void dumpAnnotatedData(std::ostream& o) = 0; /*! * \brief Debugging utility which dumps values in some bytes across a @@ -500,7 +500,7 @@ namespace sparta::serialization::checkpoint * for deep branches will be difficult to read * \param o ostream to dump to */ - void dumpTree(std::ostream& o) const { + void dumpTree(std::ostream& o) { std::deque c; dumpBranch(o, getHeadID(), 0, 0, c); o << '\n'; @@ -522,7 +522,7 @@ namespace sparta::serialization::checkpoint const chkpt_id_t chkpt, uint32_t indent, uint32_t pos, - std::deque& continues) const { + std::deque& continues) { //! 
\todo Move the constants somewhere static outside this function (especially the assert) static const std::string SEP_STR = "-> "; // Normal checkpoint chain static const std::string CONT_SEP_STR = "`> "; // Checkpoint branch from higher line @@ -616,7 +616,7 @@ namespace sparta::serialization::checkpoint */ virtual chkpt_id_t createCheckpoint_(bool force_snapshot=false) = 0; - virtual void dumpCheckpointNode_(const chkpt_id_t id, std::ostream& o) const { + virtual void dumpCheckpointNode_(const chkpt_id_t id, std::ostream& o) { o << id; } @@ -678,7 +678,7 @@ namespace sparta::serialization::checkpoint /*! * \brief Returns IDs of the checkpoints immediately following the given checkpoint. */ - virtual std::vector getNextIDs_(chkpt_id_t id) const = 0; + virtual std::vector getNextIDs_(chkpt_id_t id) = 0; /*! * \brief Scheduler whose tick count will be set and read. Cannnot be diff --git a/sparta/sparta/serialization/checkpoint/DatabaseCheckpoint.hpp b/sparta/sparta/serialization/checkpoint/DatabaseCheckpoint.hpp index b6a232ecd3..5fb2eb191e 100644 --- a/sparta/sparta/serialization/checkpoint/DatabaseCheckpoint.hpp +++ b/sparta/sparta/serialization/checkpoint/DatabaseCheckpoint.hpp @@ -13,18 +13,20 @@ namespace sparta::serialization::checkpoint struct ChkptWindowBytes { using chkpt_id_t = CheckpointBase::chkpt_id_t; - std::vector chkpt_ids; std::vector chkpt_bytes; - uint64_t start_tick; - uint64_t end_tick; + chkpt_id_t start_chkpt_id = CheckpointBase::UNIDENTIFIED_CHECKPOINT; + chkpt_id_t end_chkpt_id = CheckpointBase::UNIDENTIFIED_CHECKPOINT; + uint64_t start_tick = 0; + uint64_t end_tick = 0; }; struct ChkptWindow { using chkpt_id_t = CheckpointBase::chkpt_id_t; - std::vector chkpt_ids; std::vector> chkpts; - uint64_t start_tick; - uint64_t end_tick; + chkpt_id_t start_chkpt_id = CheckpointBase::UNIDENTIFIED_CHECKPOINT; + chkpt_id_t end_chkpt_id = CheckpointBase::UNIDENTIFIED_CHECKPOINT; + uint64_t start_tick = 0; + uint64_t end_tick = 0; //! 
\brief Support boost::serialization template @@ -75,7 +77,6 @@ namespace sparta::serialization::checkpoint chkpt_id_t prev_id, const std::vector& next_ids, chkpt_id_t deleted_id, - bool decached, bool is_snapshot, const storage::VectorStorage& storage, DatabaseCheckpointer* checkpointer); @@ -89,10 +90,12 @@ namespace sparta::serialization::checkpoint template void serialize(Archive& ar, const unsigned int version) { + sparta_assert(deleted_id_ == CheckpointBase::UNIDENTIFIED_CHECKPOINT, + "Cannot serialize a DatabaseCheckpoint that was already deleted"); + CheckpointBase::serialize(ar, version); ar & prev_id_; ar & next_ids_; - ar & deleted_id_; ar & is_snapshot_; ar & data_; } @@ -157,29 +160,6 @@ namespace sparta::serialization::checkpoint */ void load(const std::vector& dats) override; - /*! - * \brief Can this checkpoint be deleted - * Cannot be deleted if: - * \li This checkpoint has any ancestors which are not deletable and not snapshots - * \li This checkpoint was not flagged for deletion with flagDeleted - * \warning This is a recursive search of a checkpoint tree which has potentially many - * branches and could have high time cost - */ - bool canDelete() const noexcept; - - /*! - * \brief Allows this checkpoint to be deleted if it is no longer a - * previous delta of some other delta (i.e. getNexts() returns an - * empty vector). Sets the checkpoint ID to invalid. Calling multiple - * times has no effect - * \pre Must not already be flagged deleted - * \post isFlaggedDeleted() will return true - * \post getDeletedID() will return the current ID (if any) - * \see canDelete - * \see isFlaggedDeleted - */ - void flagDeleted(); - /*! * \brief Indicates whether this checkpoint has been flagged deleted. * \note Does not imply that the checkpoint can safely be deleted; @@ -204,18 +184,6 @@ namespace sparta::serialization::checkpoint */ std::string getDeletedRepr() const override; - /*! - * \brief Mark this checkpoint as no longer in the cache. 
It will still - * live in the cache until the checkpointer has a chance to evict it. - */ - void flagDecached(); - - /*! - * \brief Should this checkpoint be considered ready for eviction from - * the cache? - */ - bool isFlaggedDecached() const noexcept; - /*! * \brief Is this checkpoint a snapshot (contains ALL simulator state) */ @@ -263,6 +231,19 @@ namespace sparta::serialization::checkpoint */ void storeDelta_(const std::vector& dats); + /*! + * \brief Allows this checkpoint to be deleted if it is no longer a + * previous delta of some other delta (i.e. getNexts() returns an + * empty vector). Sets the checkpoint ID to invalid. Calling multiple + * times has no effect + * \pre Must not already be flagged deleted + * \post isFlaggedDeleted() will return true + * \post getDeletedID() will return the current ID (if any) + * \see canDelete + * \see isFlaggedDeleted + */ + void flagDeleted_(); + /*! * \brief ID of the previous checkpoint. */ @@ -280,9 +261,6 @@ namespace sparta::serialization::checkpoint */ chkpt_id_t deleted_id_; - //! \brief Has this checkpoint been flagged as ready to be decached? - bool decached_ = false; - //! \brief Is this node a snapshot? bool is_snapshot_; @@ -296,22 +274,19 @@ namespace sparta::serialization::checkpoint //! Defined down here for "new DatabaseCheckpoint" template inline void ChkptWindow::serialize(Archive& ar, const unsigned int /*version*/) { - // TODO cnyce: Try to avoid use of unique_ptr. Everything is already movable - // and has default constructors. 
- ar & chkpt_ids; + ar & start_chkpt_id; + ar & end_chkpt_id; ar & start_tick; ar & end_tick; if (chkpts.empty()) { - // We are loading checkpoint window from disk - chkpts.reserve(chkpt_ids.size()); - for (size_t i = 0; i < chkpt_ids.size(); ++i) { + // We are loading a checkpoint window from disk + const auto num_chkpts = end_chkpt_id - start_chkpt_id + 1; + for (size_t i = 0; i < num_chkpts; ++i) { chkpts.emplace_back(new DatabaseCheckpoint); ar & *chkpts.back(); } - } - - else { + } else { // We are saving a checkpoint window to disk for (auto& chkpt : chkpts) { ar & *chkpt; diff --git a/sparta/sparta/serialization/checkpoint/DatabaseCheckpointQuery.hpp b/sparta/sparta/serialization/checkpoint/DatabaseCheckpointQuery.hpp deleted file mode 100644 index 8d6d7287e6..0000000000 --- a/sparta/sparta/serialization/checkpoint/DatabaseCheckpointQuery.hpp +++ /dev/null @@ -1,88 +0,0 @@ -// -*- C++ -*- - -#pragma once - -#include "sparta/serialization/checkpoint/Checkpointer.hpp" -#include - -namespace simdb -{ - class DatabaseManager; -} - -namespace sparta::serialization::checkpoint -{ - -class DatabaseCheckpoint; -class DatabaseCheckpointer; - -/*! - * \brief SQLite query object to "extend" the checkpoint search space from just the - * cache to include the database. Combinations of in-memory checkpoints, recreated - * checkpoints, and database schema/query optimizations are used for performance. 
- */ -class DatabaseCheckpointQuery : public Checkpointer -{ -public: - DatabaseCheckpointQuery(DatabaseCheckpointer* checkpointer, - simdb::DatabaseManager* db_mgr, - TreeNode& root, - Scheduler* sched=nullptr) - : Checkpointer(root, sched) - , db_mgr_(db_mgr) - { - } - - uint64_t getTotalMemoryUse() const noexcept override; - - uint64_t getContentMemoryUse() const noexcept override; - - void deleteCheckpoint(chkpt_id_t id) override; - - void loadCheckpoint(chkpt_id_t id) override; - - std::vector getCheckpointsAt(tick_t t) const override; - - std::vector getCheckpoints() const override; - - uint32_t getNumCheckpoints() const noexcept override; - - std::deque getCheckpointChain(chkpt_id_t id) const override; - - bool hasCheckpoint(chkpt_id_t id) const noexcept override; - - bool isSnapshot(chkpt_id_t id) const noexcept; - - bool canDelete(chkpt_id_t id) const noexcept; - - void dumpList(std::ostream& o) const override; - - void dumpData(std::ostream& o) const override; - - void dumpAnnotatedData(std::ostream& o) const override; - - void traceValue(std::ostream& o, chkpt_id_t id, const ArchData* container, uint32_t offset, uint32_t size) override; - - std::shared_ptr findCheckpoint(chkpt_id_t id, bool must_exist=false) const; - - chkpt_id_t getPrevID(chkpt_id_t id) const; - - std::vector getNextIDs(chkpt_id_t id, bool immediate_only = true) const; - - uint32_t getDistanceToPrevSnapshot(chkpt_id_t id) const noexcept; - -private: - void createHead_() override; - - chkpt_id_t createCheckpoint_(bool force_snapshot=false) override; - - void dumpCheckpointNode_(const chkpt_id_t id, std::ostream& o) const override; - - std::vector getNextIDs_(chkpt_id_t id) const override; - - mutable simdb::DatabaseManager* db_mgr_ = nullptr; - - std::unordered_set tagged_deleted_ids_; -}; - -} // namespace sparta::serialization::checkpoint diff --git a/sparta/sparta/serialization/checkpoint/DatabaseCheckpointer.hpp b/sparta/sparta/serialization/checkpoint/DatabaseCheckpointer.hpp index 
ff317056a7..2e97b030c5 100644 --- a/sparta/sparta/serialization/checkpoint/DatabaseCheckpointer.hpp +++ b/sparta/sparta/serialization/checkpoint/DatabaseCheckpointer.hpp @@ -17,7 +17,6 @@ namespace sparta::serialization::checkpoint { class DatabaseCheckpointer; -class DatabaseCheckpointQuery; /*! * \brief Implementation of the FastCheckpointer which only holds @@ -30,6 +29,9 @@ class DatabaseCheckpointer : public simdb::App, public Checkpointer static constexpr auto NAME = "db-checkpointer"; using checkpoint_type = DatabaseCheckpoint; + using checkpoint_ptr = std::shared_ptr; + using checkpoint_ptrs = std::vector; + using window_id_t = uint64_t; /*! * \brief FastCheckpointer Constructor @@ -60,6 +62,11 @@ class DatabaseCheckpointer : public simdb::App, public Checkpointer std::unique_ptr createPipeline( simdb::pipeline::AsyncDatabaseAccessor* db_accessor) override; + /*! + * \brief Flush all windows down the pipeline before threads are shut down. + */ + void preTeardown() override; + /*! * \brief Returns the next-shapshot threshold. * @@ -72,13 +79,18 @@ class DatabaseCheckpointer : public simdb::App, public Checkpointer * * This value is a performance/space tradeoff knob. */ - uint32_t getSnapshotThreshold() const noexcept; + uint32_t getSnapshotThreshold() const; /*! * \brief Sets the snapshot threshold * \see getSnapshotThreshold */ - void setSnapshotThreshold(uint32_t thresh) noexcept; + void setSnapshotThreshold(uint32_t thresh); + + /*! + * \brief Sets the max number of cached windows (LRU) + */ + void setMaxCachedWindows(uint32_t max_windows); /*! * \brief Computes and returns the memory usage by this checkpointer at @@ -121,7 +133,7 @@ class DatabaseCheckpointer : public simdb::App, public Checkpointer * \note Makes a new vector of results. This should not be called in the * critical path. */ - std::vector getCheckpointsAt(tick_t t) const override; + std::vector getCheckpointsAt(tick_t t) override; /*! 
* \brief Gets all checkpoint IDs available on any timeline sorted by @@ -131,28 +143,22 @@ class DatabaseCheckpointer : public simdb::App, public Checkpointer * \note Makes a new vector of results. This should not be called in the * critical path. */ - std::vector getCheckpoints() const override; + std::vector getCheckpoints() override; /*! * \brief Gets the current number of checkpoints having valid IDs */ - uint32_t getNumCheckpoints() const noexcept override; + uint32_t getNumCheckpoints() noexcept override; /*! * \brief Gets the current number of snapshots with valid IDs */ - uint32_t getNumSnapshots() const noexcept; + uint32_t getNumSnapshots() noexcept; /*! * \brief Gets the current number of delta checkpoints with valid IDs */ - uint32_t getNumDeltas() const noexcept; - - /*! - * \brief Gets the curent number of checkpoints (delta or snapshot) - * withOUT valid IDs. - */ - uint32_t getNumDeadCheckpoints() const noexcept; + uint32_t getNumDeltas() noexcept; /*! * \brief Debugging utility which gets a deque of checkpoints @@ -169,44 +175,16 @@ class DatabaseCheckpointer : public simdb::App, public Checkpointer * \note Makes a new vector of results. This should not be called in the * critical path. */ - std::deque getCheckpointChain(chkpt_id_t id) const override; - - /*! - * \brief Finds the latest checkpoint at or before the given tick - * starting at the \a from checkpoint and working backward. - * If no checkpoints before or at tick are found, returns nullptr. - * \param tick Tick to search for - * \param from Checkpoint at which to begin searching for a tick. - * Must be a valid checkpoint known by this checkpointer. - * See hasCheckpoint. - * \return The latest checkpoint with a tick number less than or equal - * to the \a tick argument. Returns nullptr if no checkpoints before \a - * tick were found. It is possible for the checkpoint identified by \a - * from could be returned. - * \warning This is not a high-performance method. 
Generally, - * a client of this interface knows a paticular ID. - * \throw CheckpointError if \a from does not refer to a valid - * checkpoint. - */ - std::shared_ptr findLatestCheckpointAtOrBefore(tick_t tick, chkpt_id_t from); + std::deque getCheckpointChain(chkpt_id_t id) override; /*! * \brief Finds a checkpoint by its ID. * \param id ID of checkpoint to find. Guaranteed not to be flagged as * deleted - * \note ONLY SEARCHES CHECKPOINT CACHE. Use cloneCheckpoint() to also search the database. + * \note ONLY SEARCHES CHECKPOINT CACHE. * \return Checkpoint with ID of \a id if found or nullptr if not found */ - std::weak_ptr findCheckpoint(chkpt_id_t id) const; - - /*! - * \brief Finds a checkpoint by its ID. - * \param id ID of checkpoint to find. Guaranteed not to be flagged as - * deleted - * \note SEARCHES BOTH THE CACHE AND THE DATABASE - * \return Checkpoint with ID of \a id if found or nullptr if not found - */ - std::shared_ptr cloneCheckpoint(chkpt_id_t id, bool must_exist=true) const; + std::shared_ptr findCheckpoint(chkpt_id_t id, bool must_exist=false); /*! * \brief Tests whether this checkpoint manager has a checkpoint with @@ -215,7 +193,7 @@ class DatabaseCheckpointer : public simdb::App, public Checkpointer * and false if not. If id == Checkpoint::UNIDENTIFIED_CHECKPOINT, * always returns false */ - bool hasCheckpoint(chkpt_id_t id) const noexcept override; + bool hasCheckpoint(chkpt_id_t id) noexcept override; /*! * \brief Dumps the restore chain for this checkpoint. @@ -223,7 +201,7 @@ class DatabaseCheckpointer : public simdb::App, public Checkpointer * \param o ostream to which chain data will be dumped * \param id ID of starting checkpoint */ - void dumpRestoreChain(std::ostream& o, chkpt_id_t id) const; + void dumpRestoreChain(std::ostream& o, chkpt_id_t id); /*! 
* \brief Returns a stack of checkpoints from this checkpoint as far @@ -232,20 +210,20 @@ class DatabaseCheckpointer : public simdb::App, public Checkpointer * to be inspected for restoring this checkpoint's data. This may reach * the head checkpoint if no gaps are encountered. */ - std::stack getHistoryChain(chkpt_id_t id) const; + std::stack getHistoryChain(chkpt_id_t id); /*! * \brief Returns a stack of checkpoints that must be restored from * top-to-bottom to fully restore the state associated with this * checkpoint. */ - std::stack getRestoreChain(chkpt_id_t id) const; + std::stack getRestoreChain(chkpt_id_t id); /*! * \brief Returns next checkpoint following *this. May be an empty * vector if there are no later checkpoints. */ - std::vector getNextIDs(chkpt_id_t id) const; + std::vector getNextIDs(chkpt_id_t id); /*! * \brief Determines how many checkpoints away the closest, earlier @@ -257,23 +235,13 @@ class DatabaseCheckpointer : public simdb::App, public Checkpointer * \note This is a noexcept function, which means that the exception if * no snapshot is encountered is uncatchable. This is intentional. */ - uint32_t getDistanceToPrevSnapshot(chkpt_id_t id) const noexcept; + uint32_t getDistanceToPrevSnapshot(chkpt_id_t id) noexcept; /*! * \brief Check if the given checkpoint is a snapshot (not a delta). * \return Returns false if not a snapshot or the id is not a checkpoint. */ - bool isSnapshot(chkpt_id_t id) const noexcept; - - /*! - * \brief Can this checkpoint be deleted - * Cannot be deleted if: - * \li This checkpoint has any ancestors which are not deletable and not snapshots - * \li This checkpoint was not flagged for deletion with flagDeleted - * \warning This is a recursive search of a checkpoint tree which has potentially many - * branches and could have high time cost - */ - bool canDelete(chkpt_id_t id) const noexcept; + bool isSnapshot(chkpt_id_t id) noexcept; /*! 
* \brief Returns a string describing this object @@ -285,14 +253,14 @@ class DatabaseCheckpointer : public simdb::App, public Checkpointer * ostream with a newline following each checkpoint * \param o ostream to dump to */ - void dumpList(std::ostream& o) const override; + void dumpList(std::ostream& o) override; /*! * \brief Dumps this checkpointer's data to an ostream with a newline * following each checkpoint * \param o ostream to dump to */ - void dumpData(std::ostream& o) const override; + void dumpData(std::ostream& o) override; /*! * \brief Dumps this checkpointer's data to an @@ -300,7 +268,7 @@ class DatabaseCheckpointer : public simdb::App, public Checkpointer * following each checkpoint description and each checkpoint data dump * \param o ostream to dump to */ - void dumpAnnotatedData(std::ostream& o) const override; + void dumpAnnotatedData(std::ostream& o) override; /*! * \brief Debugging utility which dumps values in some bytes across a @@ -366,12 +334,12 @@ class DatabaseCheckpointer : public simdb::App, public Checkpointer /*! * \brief Implements Checkpointer::dumpCheckpointNode_ */ - void dumpCheckpointNode_(const chkpt_id_t id, std::ostream& o) const override; + void dumpCheckpointNode_(const chkpt_id_t id, std::ostream& o) override; /*! * \brief Returns IDs of the checkpoints immediately following the given checkpoint. */ - std::vector getNextIDs_(chkpt_id_t id) const override; + std::vector getNextIDs_(chkpt_id_t id) override; /*! * \brief Intercept calls to Checkpointer::setHead_() and ensure we do not delete it. @@ -399,25 +367,72 @@ class DatabaseCheckpointer : public simdb::App, public Checkpointer */ void addToCache_(std::shared_ptr chkpt); + /*! + * \brief Get the window ID for the given checkpoint ID + */ + window_id_t getWindowID_(chkpt_id_t id) const { + return id / (snap_thresh_ + 1); + } + + /*! 
+ * \brief Get the window ID for the given checkpoint + */ + template + window_id_t getWindowID_(const CheckpointPtrT& chkpt) const { + return getWindowID_(chkpt->getID()); + } + + /*! + * \brief Bump the given window ID to the front of the LRU cache + */ + void touchWindow_(window_id_t id); + + /*! + * \brief Evict the least recently used window from the cache if needed + */ + void evictWindowsIfNeeded_(bool force_flush=false); + + /*! + * \brief Ensure this checkpoint's window is loaded in the LRU cache + */ + bool ensureWindowLoaded_(chkpt_id_t id, bool must_succeed=true); + + /*! + * \brief Retrieve a checkpoint window from the database + */ + checkpoint_ptrs getWindowFromDatabase_(window_id_t win_id); + + /*! + * \brief "Undo" the pipeline for a ChkptWindows.WindowBytes blob + * into the original vector of checkpoints + */ + std::unique_ptr deserializeWindow_(const std::vector& window_bytes) const; + + /*! + * \brief Apply the given callback to every checkpoint (cached and database). + */ + void forEachCheckpoint_(const std::function& cb); + //! \brief Checkpointer head ID. Used to prevent the head from being deleted from the cache. chkpt_id_t head_id_ = checkpoint_type::UNIDENTIFIED_CHECKPOINT; //! \brief Checkpointer current ID. Used to prevent the current node from being deleted from the cache. chkpt_id_t current_id_ = checkpoint_type::UNIDENTIFIED_CHECKPOINT; + //! \brief Pipeline input queue from which new checkpoints to be processed are read. + simdb::ConcurrentQueue* pipeline_head_ = nullptr; + //! \brief Subset (or all of) our checkpoints that we currently are holding in memory. - std::unordered_map> chkpts_cache_; + std::unordered_map chkpts_cache_; - //! \brief Ordered list of checkpoint windows (snapshot + deltas). - std::deque> chkpt_windows_; + //! \brief LRU list of window IDs in our cache. Most recently used at front. + std::list lru_list_; - //! \brief SQLite query object to "extend" the checkpoint search space from just the - //! 
cache to include the database. Combinations of in-memory checkpoints, recreated - //! checkpoints, and database schema/query optimizations are used for performance. - std::shared_ptr chkpt_query_; + //! \brief Map of window ID to its position in the LRU list for O(1) access. + std::unordered_map::iterator> lru_map_; - //! \brief IDs of checkpoints pending eviction from the cache once they are no longer current. - std::queue pending_eviction_ids_; + //! \brief Maximum number of windows to hold in memory at any given time. + utils::ValidValue max_cached_windows_; //! \brief Mutex to protect our checkpoints cache. mutable std::recursive_mutex cache_mutex_; @@ -425,39 +440,19 @@ class DatabaseCheckpointer : public simdb::App, public Checkpointer //! \brief SimDB instance simdb::DatabaseManager* db_mgr_ = nullptr; - //! \brief Pipeline. Held onto to enable flushing. - simdb::pipeline::Pipeline* pipeline_ = nullptr; + //! \brief Checkpoint pipeline flusher + std::unique_ptr pipeline_flusher_; /*! * \brief Snapshot generation threshold. Every n checkpoints in a chain * are taken as snapshots instead of deltas */ - uint32_t snap_thresh_; + utils::ValidValue snap_thresh_; /*! * \brief Next checkpoint ID value */ chkpt_id_t next_chkpt_id_; - - /*! - * \brief Number of living checkpoints of either snapshot or delta type. - * (where checkpoint isFlaggedDeleted()=false) - */ - uint32_t num_alive_checkpoints_; - - /*! - * \brief Number of living snapshot checkpoints (where checkpoint - * isFlaggedDeleted()=false). Will be <= num_alive_checkpoints_ - * The number of delta checkpoints (not snapshots) can be computed as - * num_alive_checkpoints_ - num_alive_snapshots_. - */ - uint32_t num_alive_snapshots_; - - /*! - * \brief Number of checkpoints which have been flagged as deleted but - * still exist in the checkpointer. 
- */ - uint32_t num_dead_checkpoints_; }; } // namespace sparta::serialization::checkpoint diff --git a/sparta/sparta/serialization/checkpoint/FastCheckpointer.hpp b/sparta/sparta/serialization/checkpoint/FastCheckpointer.hpp index be1bd00bf6..b917e40583 100644 --- a/sparta/sparta/serialization/checkpoint/FastCheckpointer.hpp +++ b/sparta/sparta/serialization/checkpoint/FastCheckpointer.hpp @@ -266,7 +266,7 @@ namespace sparta::serialization::checkpoint * \note Makes a new vector of results. This should not be called in the * critical path. */ - std::vector getCheckpointsAt(tick_t t) const override { + std::vector getCheckpointsAt(tick_t t) override { std::vector results; for(auto& p : chkpts_){ const Checkpoint* cp = p.second.get(); @@ -286,7 +286,7 @@ namespace sparta::serialization::checkpoint * \note Makes a new vector of results. This should not be called in the * critical path. */ - std::vector getCheckpoints() const override { + std::vector getCheckpoints() override { std::vector results; for(auto& p : chkpts_){ const Checkpoint* cp = p.second.get(); @@ -301,7 +301,7 @@ namespace sparta::serialization::checkpoint /*! * \brief Gets the current number of checkpoints having valid IDs */ - uint32_t getNumCheckpoints() const noexcept override { + uint32_t getNumCheckpoints() noexcept override { return num_alive_checkpoints_; } @@ -342,7 +342,7 @@ namespace sparta::serialization::checkpoint * \note Makes a new vector of results. This should not be called in the * critical path. */ - std::deque getCheckpointChain(chkpt_id_t id) const override { + std::deque getCheckpointChain(chkpt_id_t id) override { std::deque results; if(!getHead()){ return results; @@ -414,7 +414,7 @@ namespace sparta::serialization::checkpoint * and false if not. 
If id == Checkpoint::UNIDENTIFIED_CHECKPOINT, * always returns false */ - bool hasCheckpoint(chkpt_id_t id) const noexcept override { + bool hasCheckpoint(chkpt_id_t id) noexcept override { return chkpts_.find(id) != chkpts_.end(); } @@ -439,7 +439,7 @@ namespace sparta::serialization::checkpoint * ostream with a newline following each checkpoint * \param o ostream to dump to */ - void dumpList(std::ostream& o) const override { + void dumpList(std::ostream& o) override { for(auto& cp : chkpts_){ o << cp.second->stringize() << std::endl; } @@ -450,7 +450,7 @@ namespace sparta::serialization::checkpoint * following each checkpoint * \param o ostream to dump to */ - void dumpData(std::ostream& o) const override { + void dumpData(std::ostream& o) override { for(auto& cp : chkpts_){ cp.second->dumpData(o); o << std::endl; @@ -463,7 +463,7 @@ namespace sparta::serialization::checkpoint * following each checkpoint description and each checkpoint data dump * \param o ostream to dump to */ - void dumpAnnotatedData(std::ostream& o) const override { + void dumpAnnotatedData(std::ostream& o) override { for(auto& cp : chkpts_){ o << cp.second->stringize() << std::endl; cp.second->dumpData(o); @@ -642,7 +642,7 @@ namespace sparta::serialization::checkpoint /*! * \brief Implements Checkpointer::dumpCheckpointNode_ */ - void dumpCheckpointNode_(const chkpt_id_t id, std::ostream& o) const override { + void dumpCheckpointNode_(const chkpt_id_t id, std::ostream& o) override { static std::string SNAPSHOT_NOTICE = "(s)"; auto cp = findCheckpoint_(id); @@ -763,7 +763,7 @@ namespace sparta::serialization::checkpoint /*! * \brief Returns IDs of the checkpoints immediately following the given checkpoint. 
*/ - std::vector getNextIDs_(chkpt_id_t id) const override final { + std::vector getNextIDs_(chkpt_id_t id) override final { std::vector next_ids; if (const auto chkpt = findCheckpoint_(id)) { for (const auto next : chkpt->getNexts()) { diff --git a/sparta/src/DatabaseCheckpoint.cpp b/sparta/src/DatabaseCheckpoint.cpp index e02f10322c..201c5c2a30 100644 --- a/sparta/src/DatabaseCheckpoint.cpp +++ b/sparta/src/DatabaseCheckpoint.cpp @@ -52,7 +52,6 @@ DatabaseCheckpoint::DatabaseCheckpoint(chkpt_id_t id, chkpt_id_t prev_id, const std::vector& next_ids, chkpt_id_t deleted_id, - bool decached, bool is_snapshot, const storage::VectorStorage& storage, DatabaseCheckpointer* checkpointer) @@ -60,7 +59,6 @@ DatabaseCheckpoint::DatabaseCheckpoint(chkpt_id_t id, , prev_id_(prev_id) , next_ids_(next_ids) , deleted_id_(deleted_id) - , decached_(decached) , is_snapshot_(is_snapshot) , data_(storage) , checkpointer_(checkpointer) @@ -131,16 +129,11 @@ void DatabaseCheckpoint::load(const std::vector& dats) while (!chkpt_ids.empty()) { auto id = chkpt_ids.top(); chkpt_ids.pop(); - checkpointer_->cloneCheckpoint(id)->loadState(dats); + checkpointer_->findCheckpoint(id)->loadState(dats); } } -bool DatabaseCheckpoint::canDelete() const noexcept -{ - return checkpointer_->canDelete(getID()); -} - -void DatabaseCheckpoint::flagDeleted() +void DatabaseCheckpoint::flagDeleted_() { sparta_assert(!isFlaggedDeleted(), "Cannot delete a checkpoint when it is already deleted: " << this); @@ -169,16 +162,6 @@ std::string DatabaseCheckpoint::getDeletedRepr() const return ss.str(); } -void DatabaseCheckpoint::flagDecached() -{ - decached_ = true; -} - -bool DatabaseCheckpoint::isFlaggedDecached() const noexcept -{ - return decached_; -} - bool DatabaseCheckpoint::isSnapshot() const noexcept { return is_snapshot_; @@ -207,7 +190,7 @@ void DatabaseCheckpoint::loadState(const std::vector& dats) std::unique_ptr DatabaseCheckpoint::clone() const { - auto clone = new DatabaseCheckpoint(getID(), 
getTick(), prev_id_, next_ids_, deleted_id_, decached_, is_snapshot_, data_, checkpointer_); + auto clone = new DatabaseCheckpoint(getID(), getTick(), prev_id_, next_ids_, deleted_id_, is_snapshot_, data_, checkpointer_); return std::unique_ptr(clone); } diff --git a/sparta/src/DatabaseCheckpointQuery.cpp b/sparta/src/DatabaseCheckpointQuery.cpp deleted file mode 100644 index 182a255be4..0000000000 --- a/sparta/src/DatabaseCheckpointQuery.cpp +++ /dev/null @@ -1,273 +0,0 @@ -#include "sparta/serialization/checkpoint/DatabaseCheckpointQuery.hpp" -#include "sparta/serialization/checkpoint/DatabaseCheckpoint.hpp" -#include "simdb/sqlite/DatabaseManager.hpp" -#include "simdb/sqlite/Iterator.hpp" -#include "simdb/utils/Compress.hpp" - -#include -#include -#include -#include -#include -#include - -namespace sparta::serialization::checkpoint -{ - -using chkpt_id_t = typename DatabaseCheckpointQuery::chkpt_id_t; -using tick_t = typename DatabaseCheckpointQuery::tick_t; - -uint64_t DatabaseCheckpointQuery::getTotalMemoryUse() const noexcept -{ - //TODO cnyce - return 0; -} - -uint64_t DatabaseCheckpointQuery::getContentMemoryUse() const noexcept -{ - //TODO cnyce - return 0; -} - -void DatabaseCheckpointQuery::deleteCheckpoint(chkpt_id_t id) -{ - tagged_deleted_ids_.insert(id); -} - -void DatabaseCheckpointQuery::loadCheckpoint(chkpt_id_t) -{ - throw CheckpointError("DatabaseCheckpointQuery::loadCheckpoint() not supported"); -} - -std::vector DatabaseCheckpointQuery::getCheckpointsAt(tick_t t) const -{ - // SELECT ChkptWindowBytesID FROM ChkptWindowTicks WHERE t <= EndTick AND t >= StartTick - auto query = db_mgr_->createQuery("ChkptWindowTicks"); - - query->addConstraintForUInt64("StartTick", simdb::Constraints::LESS_EQUAL, t); - query->addConstraintForUInt64("EndTick", simdb::Constraints::GREATER_EQUAL, t); - - int window_id; - query->select("ChkptWindowBytesID", window_id); - - auto results = query->getResultSet(); - if (!results.getNextRecord()) { - return {}; - } - - 
// SELECT ChkptID FROM ChkptWindowIDs WHERE ChkptWindowBytesID = - query = db_mgr_->createQuery("ChkptWindowIDs"); - - int chkpt_id; - query->select("ChkptID", chkpt_id); - query->addConstraintForInt("ChkptWindowBytesID", simdb::Constraints::EQUAL, window_id); - - auto results2 = query->getResultSet(); - std::vector ids; - while (results2.getNextRecord()) { - if (auto chkpt = findCheckpoint(chkpt_id)) { - if (chkpt->getTick() == t) { - ids.push_back(chkpt_id); - } - } - } - - return ids; -} - -std::vector DatabaseCheckpointQuery::getCheckpoints() const -{ - auto query = db_mgr_->createQuery("ChkptWindowIDs"); - - int chkpt_id; - query->select("ChkptID", chkpt_id); - - auto results = query->getResultSet(); - std::vector ids; - while (results.getNextRecord()) { - ids.push_back(chkpt_id); - } - - return ids; -} - -uint32_t DatabaseCheckpointQuery::getNumCheckpoints() const noexcept -{ - auto query = db_mgr_->createQuery("ChkptWindowIDs"); - return query->count(); -} - -std::deque DatabaseCheckpointQuery::getCheckpointChain(chkpt_id_t id) const -{ - //TODO cnyce - (void)id; - return {}; -} - -bool DatabaseCheckpointQuery::hasCheckpoint(chkpt_id_t id) const noexcept -{ - auto query = db_mgr_->createQuery("ChkptWindowIDs"); - query->addConstraintForUInt64("ChkptID", simdb::Constraints::EQUAL, id); - - int bytes_id; - query->select("ChkptWindowBytesID", bytes_id); - - auto results = query->getResultSet(); - return results.getNextRecord(); -} - -bool DatabaseCheckpointQuery::isSnapshot(chkpt_id_t id) const noexcept -{ - auto chkpt = findCheckpoint(id); - return chkpt ? chkpt->isSnapshot() : false; -} - -bool DatabaseCheckpointQuery::canDelete(chkpt_id_t id) const noexcept -{ - auto chkpt = findCheckpoint(id); - return chkpt ? 
chkpt->canDelete() : false; -} - -void DatabaseCheckpointQuery::dumpList(std::ostream& o) const -{ - //TODO cnyce - (void)o; -} - -void DatabaseCheckpointQuery::dumpData(std::ostream& o) const -{ - //TODO cnyce - (void)o; -} - -void DatabaseCheckpointQuery::dumpAnnotatedData(std::ostream& o) const -{ - //TODO cnyce - (void)o; -} - -void DatabaseCheckpointQuery::traceValue(std::ostream& o, chkpt_id_t id, const ArchData* container, uint32_t offset, uint32_t size) -{ - (void)o; - (void)id; - (void)container; - (void)offset; - (void)size; - - sparta_assert(false, "Not implemented"); -} - -std::shared_ptr DatabaseCheckpointQuery::findCheckpoint(chkpt_id_t id, bool must_exist) const -{ - // "Undo" task 6 (write to the database) - auto query = db_mgr_->createQuery("ChkptWindowIDs"); - query->addConstraintForUInt64("ChkptID", simdb::Constraints::EQUAL, id); - - int window_id = 404; - query->select("ChkptWindowBytesID", window_id); - - auto results1 = query->getResultSet(); - if (!results1.getNextRecord()) { - if (must_exist) { - throw CheckpointError("There is no checkpoint with ID ") << id; - } - return nullptr; - } - - query = db_mgr_->createQuery("ChkptWindowBytes"); - query->addConstraintForInt("Id", simdb::Constraints::EQUAL, window_id); - - std::vector bytes; - query->select("WindowBytes", bytes); - - auto results2 = query->getResultSet(); - if (!results2.getNextRecord()) { - if (must_exist) { - throw CheckpointError("There is no checkpoint with ID ") << id; - } - return nullptr; - } - - // "Undo" task 5 (zlib compression) - std::vector uncompressed; - simdb::decompressData(bytes, uncompressed); - - // "Undo" task 4 (boost::serialization) - namespace bio = boost::iostreams; - bio::array_source src(uncompressed.data(), uncompressed.size()); - bio::stream is(src); - - boost::archive::binary_iarchive ia(is); - ChkptWindow window; - ia >> window; - - for (auto& chkpt : window.chkpts) { - if (chkpt->getID() == id) { - return chkpt; - } - } - - sparta_assert(false, "Should 
not be reachable"); - return nullptr; -} - -chkpt_id_t DatabaseCheckpointQuery::getPrevID(chkpt_id_t id) const -{ - auto chkpt = findCheckpoint(id, true); - return chkpt->getPrevID(); -} - -std::vector DatabaseCheckpointQuery::getNextIDs(chkpt_id_t id, bool immediate_only) const -{ - std::vector next_ids; - while (true) { - auto chkpt = findCheckpoint(id, false); - if (!chkpt) { - break; - } - - auto ids = chkpt->getNextIDs(); - if (ids.empty()) { - break; - } - - next_ids.insert(next_ids.end(), ids.begin(), ids.end()); - if (immediate_only) { - break; - } - - assert(ids.size() == 1); - id = ids[0]; - } - - return next_ids; -} - -uint32_t DatabaseCheckpointQuery::getDistanceToPrevSnapshot(chkpt_id_t id) const noexcept -{ - //TODO cnyce - (void)id; - return 0; -} - -void DatabaseCheckpointQuery::createHead_() -{ - throw CheckpointError("Cannot create checkpoint head for DatabaseCheckpointQuery"); -} - -chkpt_id_t DatabaseCheckpointQuery::createCheckpoint_(bool) -{ - throw CheckpointError("Cannot create checkpoint head for DatabaseCheckpointQuery"); -} - -void DatabaseCheckpointQuery::dumpCheckpointNode_(const chkpt_id_t, std::ostream&) const -{ - throw CheckpointError("Cannot dump checkpoint node for DatabaseCheckpointQuery"); -} - -std::vector DatabaseCheckpointQuery::getNextIDs_(chkpt_id_t id) const -{ - return getNextIDs(id); -} - -} // namespace sparta::serialization::checkpoint diff --git a/sparta/src/DatabaseCheckpointer.cpp b/sparta/src/DatabaseCheckpointer.cpp index 97b1e7fdde..f767e5bc98 100644 --- a/sparta/src/DatabaseCheckpointer.cpp +++ b/sparta/src/DatabaseCheckpointer.cpp @@ -1,7 +1,6 @@ // -*- C++ -*- #include "sparta/serialization/checkpoint/DatabaseCheckpointer.hpp" -#include "sparta/serialization/checkpoint/DatabaseCheckpointQuery.hpp" #include "simdb/apps/AppRegistration.hpp" #include "simdb/schema/SchemaDef.hpp" #include "simdb/pipeline/AsyncDatabaseAccessor.hpp" @@ -22,20 +21,11 @@ namespace sparta::serialization::checkpoint using tick_t = 
typename CheckpointBase::tick_t; using chkpt_id_t = typename CheckpointBase::chkpt_id_t; -using checkpoint_type = DatabaseCheckpoint; -using checkpoint_ptr = std::shared_ptr; -using checkpoint_ptrs = std::vector; -using EvictedChkptIDs = std::vector; DatabaseCheckpointer::DatabaseCheckpointer(simdb::DatabaseManager* db_mgr, TreeNode& root, Scheduler* sched) : Checkpointer(root, sched), - chkpt_query_(std::make_shared(this, db_mgr, root, sched)), db_mgr_(db_mgr), - snap_thresh_(DEFAULT_SNAPSHOT_THRESH), - next_chkpt_id_(checkpoint_type::MIN_CHECKPOINT), - num_alive_checkpoints_(0), - num_alive_snapshots_(0), - num_dead_checkpoints_(0) + next_chkpt_id_(checkpoint_type::MIN_CHECKPOINT) { } @@ -43,21 +33,17 @@ void DatabaseCheckpointer::defineSchema(simdb::Schema& schema) { using dt = simdb::SqlDataType; - auto& window_bytes = schema.addTable("ChkptWindowBytes"); - window_bytes.addColumn("WindowBytes", dt::blob_t); - - auto& window_ids = schema.addTable("ChkptWindowIDs"); - window_ids.addColumn("ChkptWindowBytesID", dt::int32_t); - window_ids.addColumn("ChkptID", dt::int32_t); - window_ids.createIndexOn("ChkptID"); - //window_ids.disableAutoIncPrimaryKey(); - - auto& window_ticks = schema.addTable("ChkptWindowTicks"); - window_ticks.addColumn("ChkptWindowBytesID", dt::int32_t); - window_ticks.addColumn("StartTick", dt::int32_t); - window_ticks.addColumn("EndTick", dt::int32_t); - window_ticks.createCompoundIndexOn({"StartTick", "EndTick"}); - //window_ticks.disableAutoIncPrimaryKey(); + auto& windows = schema.addTable("ChkptWindows"); + windows.addColumn("WindowID", dt::uint64_t); + windows.addColumn("WindowBytes", dt::blob_t); + windows.addColumn("StartChkpID", dt::uint64_t); + windows.addColumn("EndChkpID", dt::uint64_t); + windows.addColumn("StartTick", dt::uint64_t); + windows.addColumn("EndTick", dt::uint64_t); + windows.createIndexOn("WindowID"); + windows.createCompoundIndexOn({"StartChkpID", "EndChkpID"}); + windows.createCompoundIndexOn({"StartTick", 
"EndTick"}); + windows.disableAutoIncPrimaryKey(); } std::unique_ptr DatabaseCheckpointer::createPipeline( @@ -65,74 +51,47 @@ std::unique_ptr DatabaseCheckpointer::createPipeline( { auto pipeline = std::make_unique(db_mgr_, NAME); - // Task 1: Clone an entire checkpoint window (snapshot plus all deltas until next snapshot) - auto clone_window = simdb::pipeline::createTask>( - [this](simdb::ConcurrentQueue& out, bool simulation_terminating) mutable -> bool + // Task 1: Chop off "flag deleted" checkpoints from the given window. This is needed + // since the checkpointer many times will grab a window from the database, mark some + // checkpoints as deleted, and leave the window in the cache. When the window gets + // old enough to be evicted from the LRU cache and re-sent down the pipeline, we do + // not want to propagate the deleted checkpoints back into the database. + auto purge_deleted = simdb::pipeline::createTask>( + [](checkpoint_ptrs&& chkpts_in, + simdb::ConcurrentQueue& chkpts_out, + bool /*force_flush*/) { - std::lock_guard lock(cache_mutex_); - - bool sent = false; - - auto send_window = [&]() { - auto window = std::move(chkpt_windows_.front()); - chkpt_windows_.pop_front(); - - checkpoint_ptrs chkpts; - for (auto id : window) { - auto it = chkpts_cache_.find(id); - if (it == chkpts_cache_.end()) { - throw CheckpointError("Invalid checkpoint - has been deleted"); - } - const auto& c = it->second; - if (chkpts.empty() && !c->isSnapshot()) { - throw CheckpointError("Invalid checkpoint - first in window is not a snapshot"); - } else if (!chkpts.empty() && c->isSnapshot()) { - throw CheckpointError("Invalid checkpoint - only one snapshot per window"); - } - - chkpts.emplace_back(c->clone()); - } - - if (!chkpts.empty()) { - out.emplace(std::move(chkpts)); - sent = true; + checkpoint_ptrs alive_chkpts; + bool ensure_rest_deleted = false; + + // Rule: Once we see a deleted checkpoint, all following checkpoints + // in the window must also be deleted. 
This is to ensure that + // checkpoints are always contiguous in the database. + + for (const auto& chkpt : chkpts_in) { + if (chkpt->isFlaggedDeleted()) { + ensure_rest_deleted = true; + } else if (ensure_rest_deleted) { + throw CheckpointError("Checkpoint window has non-contiguous deleted checkpoints"); + } else { + alive_chkpts.push_back(chkpt); } - }; - - // Note the >2 is to ensure we always have at least one complete window - // in the cache for fast APIs on very recent checkpoints. The second - // window may be partial so we can't send it yet. - while (chkpt_windows_.size() > 2) { - send_window(); - } - - // If we are terminating, send all remaining windows. - while (!chkpt_windows_.empty() && simulation_terminating) { - send_window(); } - - return sent; } ); - // Task 2: Add the IDs of all checkpoints in this window - auto add_chkpt_ids = simdb::pipeline::createTask>( + // Task 2: Package up checkpoints into a checkpoint window + auto create_window = simdb::pipeline::createTask>( [](checkpoint_ptrs&& chkpts, simdb::ConcurrentQueue& windows, - bool /*simulation_terminating*/) + bool /*force_flush*/) { - uint64_t start_tick = std::numeric_limits::max(); - uint64_t end_tick = 0; - ChkptWindow window; + window.start_chkpt_id = chkpts.front()->getID(); + window.end_chkpt_id = chkpts.back()->getID(); + window.start_tick = chkpts.front()->getTick(); + window.end_tick = chkpts.back()->getTick(); window.chkpts = std::move(chkpts); - for (auto& chkpt : window.chkpts) { - window.chkpt_ids.push_back(chkpt->getID()); - start_tick = std::min(start_tick, chkpt->getTick()); - end_tick = std::max(end_tick, chkpt->getTick()); - } - window.start_tick = start_tick; - window.end_tick = end_tick; windows.emplace(std::move(window)); } ); @@ -141,7 +100,7 @@ std::unique_ptr DatabaseCheckpointer::createPipeline( auto window_to_bytes = simdb::pipeline::createTask>( [](ChkptWindow&& window, simdb::ConcurrentQueue& window_bytes, - bool /*simulation_terminating*/) + bool 
/*force_flush*/) { ChkptWindowBytes bytes; boost::iostreams::back_insert_device> inserter(bytes.chkpt_bytes); @@ -150,10 +109,8 @@ std::unique_ptr DatabaseCheckpointer::createPipeline( oa << window; os.flush(); - for (const auto& chkpt : window.chkpts) { - bytes.chkpt_ids.push_back(chkpt->getID()); - } - + bytes.start_chkpt_id = window.start_chkpt_id; + bytes.end_chkpt_id = window.end_chkpt_id; bytes.start_tick = window.start_tick; bytes.end_tick = window.end_tick; window_bytes.emplace(std::move(bytes)); @@ -164,98 +121,88 @@ std::unique_ptr DatabaseCheckpointer::createPipeline( auto zlib_bytes = simdb::pipeline::createTask>( [](ChkptWindowBytes&& bytes_in, simdb::ConcurrentQueue& bytes_out, - bool /*simulation_terminating*/) + bool /*force_flush*/) { - ChkptWindowBytes compressed; - compressed.chkpt_ids = std::move(bytes_in.chkpt_ids); - simdb::compressData(bytes_in.chkpt_bytes, compressed.chkpt_bytes); - bytes_out.emplace(std::move(compressed)); + std::vector compressed_bytes; + simdb::compressData(bytes_in.chkpt_bytes, compressed_bytes); + std::swap(bytes_in.chkpt_bytes, compressed_bytes); + bytes_out.emplace(std::move(bytes_in)); } ); // Task 5: Write to the database - auto write_to_db = db_accessor->createAsyncWriter( - [](ChkptWindowBytes&& bytes_in, - simdb::ConcurrentQueue& evicted_ids, - simdb::pipeline::AppPreparedINSERTs* tables, - bool /*simulation_terminating*/) + auto write_to_db = db_accessor->createAsyncWriter( + [this](ChkptWindowBytes&& bytes_in, + simdb::pipeline::AppPreparedINSERTs* tables, + bool /*force_flush*/) { - auto bytes_inserter = tables->getPreparedINSERT("ChkptWindowBytes"); - bytes_inserter->setColumnValue(0, bytes_in.chkpt_bytes); - auto bytes_id = bytes_inserter->createRecord(); - - auto chkpt_ids_inserter = tables->getPreparedINSERT("ChkptWindowIDs"); - chkpt_ids_inserter->setColumnValue(0, bytes_id); - for (auto id : bytes_in.chkpt_ids) { - chkpt_ids_inserter->setColumnValue(1, (int)id); - chkpt_ids_inserter->createRecord(); + 
auto window_inserter = tables->getPreparedINSERT("ChkptWindows"); + + utils::ValidValue win_id; + for (chkpt_id_t cid = bytes_in.start_chkpt_id; cid <= bytes_in.end_chkpt_id; ++cid) { + auto window_id = getWindowID_(cid); + if (!win_id.isValid()) { + win_id = window_id; + } else if (win_id != window_id) { + throw CheckpointError("Checkpoint window has inconsistent window IDs"); + } } - auto chkpt_ticks_inserter = tables->getPreparedINSERT("ChkptWindowTicks"); - chkpt_ticks_inserter->setColumnValue(0, bytes_id); - chkpt_ticks_inserter->setColumnValue(1, (int)bytes_in.start_tick); - chkpt_ticks_inserter->setColumnValue(2, (int)bytes_in.end_tick); - chkpt_ticks_inserter->createRecord(); - - evicted_ids.emplace(std::move(bytes_in.chkpt_ids)); + window_inserter->setColumnValue(0, win_id.getValue()); + window_inserter->setColumnValue(1, bytes_in.chkpt_bytes); + window_inserter->setColumnValue(2, bytes_in.start_chkpt_id); + window_inserter->setColumnValue(3, bytes_in.end_chkpt_id); + window_inserter->setColumnValue(4, bytes_in.start_tick); + window_inserter->setColumnValue(5, bytes_in.end_tick); + window_inserter->createRecord(); } ); - // Task 6: Perform cache eviction after a window of checkpoints has been written to SimDB - auto evict_from_cache = simdb::pipeline::createTask>( - [this](EvictedChkptIDs&& evicted_ids, bool simulation_terminating) mutable - { - // TODO cnyce: We are allocating and deallocating a LOT of checkpoints. - // See if we can reuse a pool of them. Could also try to just add a pool - // to the VectorStorage::Segment class. 
- std::lock_guard lock(cache_mutex_); - - for (auto id : evicted_ids) { - if (id == head_id_) { - // Never evict the head checkpoint - continue; - } - - if (auto it = chkpts_cache_.find(id); it != chkpts_cache_.end()) { - it->second->flagDecached(); - if (findCheckpoint(id).lock() != nullptr) { - throw CheckpointError("Internal error - checkpoint should be marked as decached"); - } - } - } - } - ); + *purge_deleted >> *create_window >> *window_to_bytes >> *zlib_bytes >> *write_to_db; - *clone_window >> *add_chkpt_ids >> *window_to_bytes >> *zlib_bytes >> *write_to_db >> *evict_from_cache; + pipeline_head_ = purge_deleted->getTypedInputQueue(); - pipeline_ = pipeline.get(); + pipeline_flusher_ = std::make_unique( + *db_mgr_, purge_deleted, create_window, window_to_bytes, zlib_bytes, write_to_db); pipeline->createTaskGroup("CheckpointPipeline") - ->addTask(std::move(clone_window)) - ->addTask(std::move(add_chkpt_ids)) + ->addTask(std::move(purge_deleted)) + ->addTask(std::move(create_window)) ->addTask(std::move(window_to_bytes)) - ->addTask(std::move(zlib_bytes)) - ->addTask(std::move(evict_from_cache)); + ->addTask(std::move(zlib_bytes)); return pipeline; } -uint32_t DatabaseCheckpointer::getSnapshotThreshold() const noexcept +uint32_t DatabaseCheckpointer::getSnapshotThreshold() const { return snap_thresh_; } -void DatabaseCheckpointer::setSnapshotThreshold(uint32_t thresh) noexcept +void DatabaseCheckpointer::setSnapshotThreshold(uint32_t thresh) { + sparta_assert(!snap_thresh_.isValid(), "Snapshot threshold cannot be changed once set."); + sparta_assert(thresh > 1, "Snapshot threshold must be greater than 1"); snap_thresh_ = thresh; } +void DatabaseCheckpointer::setMaxCachedWindows(uint32_t max_windows) +{ + sparta_assert(!max_cached_windows_.isValid(), "Max cached windows cannot be changed once set."); + sparta_assert(max_windows > 0, "Max cached windows must be greater than 0"); + max_cached_windows_ = max_windows; +} + uint64_t 
DatabaseCheckpointer::getTotalMemoryUse() const noexcept { std::lock_guard lock(cache_mutex_); + + // Only add up the memory use from the cache. uint64_t mem = 0; - for (const auto& [id, chkpt] : chkpts_cache_) { - mem += chkpt->getTotalMemoryUse(); + for (const auto& [win_id, window] : chkpts_cache_) { + for (const auto& chkpt : window) { + mem += chkpt->getTotalMemoryUse(); + } } return mem; } @@ -263,9 +210,13 @@ uint64_t DatabaseCheckpointer::getTotalMemoryUse() const noexcept uint64_t DatabaseCheckpointer::getContentMemoryUse() const noexcept { std::lock_guard lock(cache_mutex_); + + // Only add up the memory use from the cache. uint64_t mem = 0; - for (const auto& [id, chkpt] : chkpts_cache_) { - mem += chkpt->getContentMemoryUse(); + for (const auto& [win_id, window] : chkpts_cache_) { + for (const auto& chkpt : window) { + mem += chkpt->getContentMemoryUse(); + } } return mem; } @@ -279,15 +230,11 @@ void DatabaseCheckpointer::loadCheckpoint(chkpt_id_t id) { std::lock_guard lock(cache_mutex_); - if (auto c = getCurrent_(); !c || (c && c->getID() == id)) { + if (auto c = getCurrent_(); !c || c->getID() == id) { return; } - auto chkpt = (id == head_id_) ? findCheckpoint(id).lock() : cloneCheckpoint(id); - if (!chkpt) { - throw CheckpointError("There is no checkpoint with ID ") << id; - } - + auto chkpt = findCheckpoint(id, true); chkpt->load(getArchDatas()); // Delete all future checkpoints past this one. Do this from the cache @@ -306,14 +253,7 @@ void DatabaseCheckpointer::loadCheckpoint(chkpt_id_t id) // Move current to this checkpoint. setCurrent_(chkpt.get()); - // Add this checkpoint to the cache if not the head checkpoint. - // The head checkpoint is always in the cache. 
- if (id != head_id_) { - addToCache_(std::move(chkpt)); - } - // Increasing-by-one, starting-at-zero checkpoint IDs guarantee we can do this: - num_alive_checkpoints_ = id + 1; next_chkpt_id_ = id + 1; // Restore scheduler tick number @@ -322,163 +262,118 @@ void DatabaseCheckpointer::loadCheckpoint(chkpt_id_t id) } } -std::vector DatabaseCheckpointer::getCheckpointsAt(tick_t t) const +void DatabaseCheckpointer::preTeardown() { - std::lock_guard lock(cache_mutex_); + // Send every window down the pipeline and flush it. + evictWindowsIfNeeded_(true); + pipeline_flusher_->flush(); +} - std::unordered_set results; - for (const auto& [id, chkpt] : chkpts_cache_) { - if (chkpt->getTick() == t && !chkpt->isFlaggedDeleted()) { - results.insert(id); - } - } +std::vector DatabaseCheckpointer::getCheckpointsAt(tick_t t) +{ + std::unordered_set ids; - for (auto id : chkpt_query_->getCheckpointsAt(t)) { - results.insert(id); - } + forEachCheckpoint_([t, &ids](const DatabaseCheckpoint* chkpt) { + if (!chkpt->isFlaggedDeleted() && chkpt->getTick() == t) { + ids.insert(chkpt->getID()); + } + }); - std::vector chkpts(results.begin(), results.end()); - std::sort(chkpts.begin(), chkpts.end()); - return chkpts; + std::vector ret(ids.begin(), ids.end()); + std::sort(ret.begin(), ret.end()); + return ret; } -std::vector DatabaseCheckpointer::getCheckpoints() const +std::vector DatabaseCheckpointer::getCheckpoints() { - std::lock_guard lock(cache_mutex_); + std::unordered_set ids; - std::unordered_set results; - for (const auto& [id, chkpt] : chkpts_cache_) { + forEachCheckpoint_([&ids](const DatabaseCheckpoint* chkpt) { if (!chkpt->isFlaggedDeleted()) { - results.insert(id); + ids.insert(chkpt->getID()); } - } - - //TODO cnyce: Put this back when the cache is actually purged - //for (auto id : chkpt_query_->getCheckpoints()) { - // results.insert(id); - //} + }); - std::vector chkpts(results.begin(), results.end()); - std::sort(chkpts.begin(), chkpts.end()); - return chkpts; + 
std::vector ret(ids.begin(), ids.end()); + std::sort(ret.begin(), ret.end()); + return ret; } -uint32_t DatabaseCheckpointer::getNumCheckpoints() const noexcept +uint32_t DatabaseCheckpointer::getNumCheckpoints() noexcept { - return num_alive_checkpoints_; + return next_chkpt_id_; } -uint32_t DatabaseCheckpointer::getNumSnapshots() const noexcept +uint32_t DatabaseCheckpointer::getNumSnapshots() noexcept { - return num_alive_snapshots_; + return next_chkpt_id_ ? getWindowID_(next_chkpt_id_) + 1 : 0; } -uint32_t DatabaseCheckpointer::getNumDeltas() const noexcept +uint32_t DatabaseCheckpointer::getNumDeltas() noexcept { return getNumCheckpoints() - getNumSnapshots(); } -uint32_t DatabaseCheckpointer::getNumDeadCheckpoints() const noexcept +std::deque DatabaseCheckpointer::getCheckpointChain(chkpt_id_t id) { - return num_dead_checkpoints_; -} - -std::deque DatabaseCheckpointer::getCheckpointChain(chkpt_id_t id) const -{ - std::lock_guard lock(cache_mutex_); - std::deque chain; if (!getHead()) { return chain; } - if (!hasCheckpoint(id)) { + if (hasCheckpoint(id)) { + // This checkpointer guarantees a linear chain of checkpoints with no gaps. + // While we could also walk backwards using getPrevID(), load checkpoints + // into memory, and call getID() on each of them, the result of doing that + // would effectively load every window into our cache only to dump most of + // them (LRU). The cache could very well end up being 100% full of very old + // checkpoints, thus slowing down further API calls to reload newer windows + // into the cache. 
+ do { + chain.push_back(id); + } while (id-- > 0); + } else { throw CheckpointError("There is no checkpoint with ID ") << id; } - auto it = chkpts_cache_.find(id); - while (it != chkpts_cache_.end()) { - chain.push_back(id); - id = it->second->getPrevID(); - it = chkpts_cache_.find(id); - } - - while (id != checkpoint_type::UNIDENTIFIED_CHECKPOINT) { - chain.push_back(id); - id = chkpt_query_->getPrevID(id); - } - return chain; } -std::shared_ptr DatabaseCheckpointer::findLatestCheckpointAtOrBefore(tick_t tick, chkpt_id_t from) +std::shared_ptr DatabaseCheckpointer::findCheckpoint(chkpt_id_t id, bool must_exist) { std::lock_guard lock(cache_mutex_); - auto id = from; - do { - auto chkpt = cloneCheckpoint(id); - if (chkpt->getTick() <= tick) { - break; - } - id = chkpt->getPrevID(); - } while (id != checkpoint_type::UNIDENTIFIED_CHECKPOINT); - - return cloneCheckpoint(id); -} - -std::weak_ptr DatabaseCheckpointer::findCheckpoint(chkpt_id_t id) const -{ - std::lock_guard lock(cache_mutex_); - - if (auto it = chkpts_cache_.find(id); it != chkpts_cache_.end()) { - if (it->second->isFlaggedDeleted() || it->second->isFlaggedDecached()) { - return std::weak_ptr(); - } - return it->second; - } - - return std::weak_ptr(); -} - -std::shared_ptr DatabaseCheckpointer::cloneCheckpoint(chkpt_id_t id, bool must_exist) const -{ - std::lock_guard lock(cache_mutex_); - - if (auto it = chkpts_cache_.find(id); it != chkpts_cache_.end()) {//TODO cnyce: && !it->second->isFlaggedDecached()) { - return it->second->clone(); - } - - auto chkpt = chkpt_query_->findCheckpoint(id); - if (!chkpt && must_exist) { - throw CheckpointError("There is no checkpoint with ID ") << id; - } else if (!chkpt) { + if (!ensureWindowLoaded_(id, must_exist)) { return nullptr; } - chkpt->checkpointer_ = const_cast(this); + auto win_id = getWindowID_(id); + auto& window = chkpts_cache_[win_id]; + sparta_assert(!window.empty()); + + // Find the checkpoint in the window in constant time, noting that + // the 
window will have checkpoints in ascending order by ID with + // no gaps. + auto snapshot_id = window.front()->getID(); + auto& chkpt = window.at(id - snapshot_id); + sparta_assert(chkpt->getID() == id); return chkpt; } -bool DatabaseCheckpointer::hasCheckpoint(chkpt_id_t id) const noexcept +bool DatabaseCheckpointer::hasCheckpoint(chkpt_id_t id) noexcept { - std::lock_guard lock(cache_mutex_); - - if (auto it = chkpts_cache_.find(id); it != chkpts_cache_.end()) { - return !it->second->isFlaggedDeleted(); - } - - return chkpt_query_->hasCheckpoint(id); + return findCheckpoint(id) != nullptr; } -void DatabaseCheckpointer::dumpRestoreChain(std::ostream& o, chkpt_id_t id) const +void DatabaseCheckpointer::dumpRestoreChain(std::ostream& o, chkpt_id_t id) { auto rc = getRestoreChain(id); while (true) { - const auto chkpt = cloneCheckpoint(rc.top()); + auto chkpt = findCheckpoint(rc.top()); rc.pop(); + if (chkpt->isSnapshot()) { - o << '('; + o << "("; } if (chkpt->getID() == checkpoint_type::UNIDENTIFIED_CHECKPOINT) { o << "*" << chkpt->getDeletedID(); @@ -486,7 +381,7 @@ void DatabaseCheckpointer::dumpRestoreChain(std::ostream& o, chkpt_id_t id) cons o << chkpt->getID(); } if (chkpt->isSnapshot()) { - o << ')'; + o << ")"; } if (rc.empty()) { break; @@ -496,118 +391,51 @@ void DatabaseCheckpointer::dumpRestoreChain(std::ostream& o, chkpt_id_t id) cons } } -std::stack DatabaseCheckpointer::getHistoryChain(chkpt_id_t id) const +std::stack DatabaseCheckpointer::getHistoryChain(chkpt_id_t id) { - std::lock_guard lock(cache_mutex_); + ensureWindowLoaded_(id, true); std::stack chain; - auto it = chkpts_cache_.find(id); - while (it != chkpts_cache_.end()) { - chain.push(id); - id = it->second->getPrevID(); - it = chkpts_cache_.find(id); - } - - while (id != checkpoint_type::UNIDENTIFIED_CHECKPOINT) { + do { chain.push(id); - id = chkpt_query_->getPrevID(id); - } + } while (id-- > 0); return chain; } -std::stack DatabaseCheckpointer::getRestoreChain(chkpt_id_t id) const -{ - // 
Build stack up to last snapshot - std::stack chkpts; - while (true) { - chkpts.push(id); - auto chkpt = cloneCheckpoint(id); - if (chkpt->isSnapshot()) { - break; - } - id = chkpt->getPrevID(); - } - return chkpts; -} - -std::vector DatabaseCheckpointer::getNextIDs(chkpt_id_t id) const +std::stack DatabaseCheckpointer::getRestoreChain(chkpt_id_t id) { std::lock_guard lock(cache_mutex_); - if (auto it = chkpts_cache_.find(id); it != chkpts_cache_.end()) { - return it->second->getNextIDs(); - } + ensureWindowLoaded_(id, true); + auto win_id = getWindowID_(id); + auto& window = chkpts_cache_[win_id]; + sparta_assert(!window.empty()); - return chkpt_query_->getNextIDs(id); + std::stack chain; + auto snapshot_id = window.front()->getID(); + do { + chain.push(id); + } while (id-- > snapshot_id); + + return chain; } -uint32_t DatabaseCheckpointer::getDistanceToPrevSnapshot(chkpt_id_t id) const noexcept +std::vector DatabaseCheckpointer::getNextIDs(chkpt_id_t id) { - std::lock_guard lock(cache_mutex_); - - uint32_t dist = 0; - auto it = chkpts_cache_.find(id); - while (it != chkpts_cache_.end()) { - if (it->second->isSnapshot()) { - return dist; - } - id = it->second->getPrevID(); - it = chkpts_cache_.find(id); - ++dist; - } + auto chkpt = findCheckpoint(id, true); + return chkpt->getNextIDs(); +} - // Note that we only evict entire checkpoint "windows" from the cache, - // which means the cache never has "partial" windows like: - // - // Snapshot threshold: 10 (window length) - // (1 snapshot, 9 deltas) - // - // Cache: DB: - // 3,4,5,6,7,8,9,10 1,2 <-- never going to happen - // - // Cache: DB: - // 21-30 1-20 <-- always like this ("full" windows only) - // - // This means we either can answer the API question entirely using the - // cache or entirely using the DB. 
That is why the line of code below - // is not something like: - // - // return dist + chkpt_query_->getDistanceToPrevSnapshot(id); - - return chkpt_query_->getDistanceToPrevSnapshot(id); -} - -bool DatabaseCheckpointer::isSnapshot(chkpt_id_t id) const noexcept +uint32_t DatabaseCheckpointer::getDistanceToPrevSnapshot(chkpt_id_t id) noexcept { - std::lock_guard lock(cache_mutex_); - - if (auto it = chkpts_cache_.find(id); it != chkpts_cache_.end()) { - return it->second->isSnapshot(); - } - - return chkpt_query_->isSnapshot(id); + return getRestoreChain(id).size() - 1; } -bool DatabaseCheckpointer::canDelete(chkpt_id_t id) const noexcept +bool DatabaseCheckpointer::isSnapshot(chkpt_id_t id) noexcept { - std::lock_guard lock(cache_mutex_); - - if (auto it = chkpts_cache_.find(id); it != chkpts_cache_.end()) { - if (!it->second->isFlaggedDeleted()) { - return false; - } - - for (auto next_id : it->second->getNextIDs()) { - if (!canDelete(next_id) && !isSnapshot(next_id)) { - return false; - } - } - - return true; - } - - return chkpt_query_->canDelete(id); + auto chkpt = findCheckpoint(id, true); + return chkpt->isSnapshot(); } std::string DatabaseCheckpointer::stringize() const @@ -617,40 +445,51 @@ std::string DatabaseCheckpointer::stringize() const return ss.str(); } -void DatabaseCheckpointer::dumpList(std::ostream& o) const +void DatabaseCheckpointer::dumpList(std::ostream& o) { - std::lock_guard lock(cache_mutex_); + std::map chkpt_strings; - for (const auto& [id, chkpt] : chkpts_cache_) { - o << chkpt->stringize() << std::endl; - } + forEachCheckpoint_([&chkpt_strings](const DatabaseCheckpoint* chkpt) { + chkpt_strings[chkpt->getID()] = chkpt->stringize(); + }); - chkpt_query_->dumpList(o); + for (const auto& [id, str] : chkpt_strings) { + o << str << "\n"; + } + o << std::flush; } -void DatabaseCheckpointer::dumpData(std::ostream& o) const +void DatabaseCheckpointer::dumpData(std::ostream& o) { - std::lock_guard lock(cache_mutex_); + std::map chkpt_strings; - 
for (const auto& [id, chkpt] : chkpts_cache_) { - chkpt->dumpData(o); - o << std::endl; - } + forEachCheckpoint_([&chkpt_strings](const DatabaseCheckpoint* chkpt) { + std::ostringstream oss; + chkpt->dumpData(oss); + chkpt_strings[chkpt->getID()] = oss.str(); + }); - chkpt_query_->dumpData(o); + for (const auto& [id, str] : chkpt_strings) { + o << str << "\n"; + } + o << std::flush; } -void DatabaseCheckpointer::dumpAnnotatedData(std::ostream& o) const +void DatabaseCheckpointer::dumpAnnotatedData(std::ostream& o) { - std::lock_guard lock(cache_mutex_); + std::map chkpt_strings; - for (const auto& [id, chkpt] : chkpts_cache_) { - o << chkpt->stringize() << std::endl; - chkpt->dumpData(o); - o << std::endl; - } + forEachCheckpoint_([&chkpt_strings](const DatabaseCheckpoint* chkpt) { + std::ostringstream oss; + oss << chkpt->stringize() << "\n"; + chkpt->dumpData(oss); + chkpt_strings[chkpt->getID()] = oss.str(); + }); - chkpt_query_->dumpAnnotatedData(o); + for (const auto& [id, str] : chkpt_strings) { + o << str << "\n"; + } + o << std::flush; } void DatabaseCheckpointer::traceValue( @@ -700,13 +539,12 @@ void DatabaseCheckpointer::createHead_() setHead_(chkpt.get()); setCurrent_(chkpt.get()); addToCache_(std::move(chkpt)); - - num_alive_checkpoints_++; - num_alive_snapshots_++; } chkpt_id_t DatabaseCheckpointer::createCheckpoint_(bool force_snapshot) { + sparta_assert(!force_snapshot, "Forced snapshots are not supported by DatabaseCheckpointer"); + std::lock_guard lock(cache_mutex_); bool is_snapshot; @@ -761,65 +599,20 @@ chkpt_id_t DatabaseCheckpointer::createCheckpoint_(bool force_snapshot) auto current = chkpt.get(); setCurrent_(current); addToCache_(std::move(chkpt)); - num_alive_checkpoints_++; - num_alive_snapshots_ += (current->isSnapshot() == true) ? 1 : 0; - return current->getID(); } void DatabaseCheckpointer::deleteCheckpoint_(chkpt_id_t id) { - std::lock_guard lock(cache_mutex_); - - // Purge all future checkpoints from chkpt_windows_. 
- for (auto it = chkpt_windows_.begin(); it != chkpt_windows_.end(); ++it) { - auto& window = *it; - - // Because IDs are monotonically increasing, we can skip windows - if (window.empty() || id < window.front()) { - // ID cannot be in this or any future window - chkpt_windows_.erase(it, chkpt_windows_.end()); - break; - } - - if (id > window.back()) { - // ID cannot be in this window, continue searching - continue; - } - - // ID must be within this window - auto pos = std::find(window.begin(), window.end(), id); - if (pos != window.end()) { - window.erase(pos, window.end()); - if (window.empty()) { - it = chkpt_windows_.erase(it); - } else { - ++it; - } - if (it != chkpt_windows_.end()) { - chkpt_windows_.erase(it, chkpt_windows_.end()); - } - break; - } - } - - // Purge from the database - chkpt_query_->deleteCheckpoint(id); - - // Purge from the cache while (true) { - auto it = chkpts_cache_.find(id); - if (it == chkpts_cache_.end()) { - break; - } + auto chkpt = findCheckpoint(id, true); + chkpt->flagDeleted_(); - auto next_ids = it->second->getNextIDs(); - it->second->flagDeleted(); + auto next_ids = chkpt->getNextIDs(); + sparta_assert(next_ids.size() <= 1, + "DatabaseCheckpointer does not support multiple checkpoint branches"); if (!next_ids.empty()) { - if (next_ids.size() != 1) { - throw CheckpointError("DatabaseCheckpointer does not support multiple checkpoint branches"); - } id = next_ids[0]; } else { break; @@ -827,34 +620,18 @@ void DatabaseCheckpointer::deleteCheckpoint_(chkpt_id_t id) } } -void DatabaseCheckpointer::dumpCheckpointNode_(const chkpt_id_t id, std::ostream& o) const +void DatabaseCheckpointer::dumpCheckpointNode_(const chkpt_id_t id, std::ostream& o) { static std::string SNAPSHOT_NOTICE = "(s)"; - std::lock_guard lock(cache_mutex_); - - checkpoint_ptr chkpt_ptr; - if (auto it = chkpts_cache_.find(id); it != chkpts_cache_.end()) { - chkpt_ptr = it->second; - } else { - chkpt_ptr = chkpt_query_->findCheckpoint(id); - } - - auto cp = 
chkpt_ptr.get(); - - // Draw data for this checkpoint - if (cp->isFlaggedDeleted()) { - o << cp->getDeletedRepr(); - }else{ - o << cp->getID(); - } - // Show that this is a snapshot - if (cp->isSnapshot()) { + auto chkpt = findCheckpoint(id, true); + o << chkpt->getID(); + if (chkpt->isSnapshot()) { o << ' ' << SNAPSHOT_NOTICE; } } -std::vector DatabaseCheckpointer::getNextIDs_(chkpt_id_t id) const +std::vector DatabaseCheckpointer::getNextIDs_(chkpt_id_t id) { return getNextIDs(id); } @@ -886,38 +663,188 @@ void DatabaseCheckpointer::setCurrentID_(chkpt_id_t id) std::lock_guard lock(cache_mutex_); sparta_assert(id != checkpoint_type::UNIDENTIFIED_CHECKPOINT); current_id_ = id; +} - // If we are moving current_, see if we can evict any pending IDs - while (!pending_eviction_ids_.empty()) { - auto id = pending_eviction_ids_.front(); - pending_eviction_ids_.pop(); - if (id == current_id_) { - pending_eviction_ids_.push(id); - } else if (auto it = chkpts_cache_.find(id); it != chkpts_cache_.end()) { - it->second->flagDecached(); - } +void DatabaseCheckpointer::addToCache_(std::shared_ptr chkpt) +{ + std::lock_guard lock(cache_mutex_); + + const auto win_id = chkpt->getID() / (snap_thresh_ + 1); + auto& window = chkpts_cache_[win_id]; + sparta_assert(window.empty() || window.back()->getID() == chkpt->getID() - 1, + "Checkpoints must be added in ID order with no gaps"); + window.emplace_back(std::move(chkpt)); + touchWindow_(win_id); + evictWindowsIfNeeded_(); +} + +void DatabaseCheckpointer::touchWindow_(window_id_t id) +{ + std::lock_guard lock(cache_mutex_); + + auto it = lru_map_.find(id); + if (it != lru_map_.end()) { + lru_list_.erase(it->second); } + lru_list_.push_front(id); + lru_map_[id] = lru_list_.begin(); } -void DatabaseCheckpointer::addToCache_(std::shared_ptr chkpt) +void DatabaseCheckpointer::evictWindowsIfNeeded_(bool force_flush) { std::lock_guard lock(cache_mutex_); - auto id = chkpt->getID(); - chkpts_cache_[id] = chkpt; + uint32_t 
num_windows_to_evict = 0; - if (!chkpt_windows_.empty() && !chkpt_windows_.back().empty() && chkpt_windows_.back().back() == id) { + if (force_flush) { + num_windows_to_evict = lru_list_.size(); + } else if (lru_list_.size() > max_cached_windows_) { + num_windows_to_evict = lru_list_.size() - max_cached_windows_; + } + + if (num_windows_to_evict == 0) { return; } - if (chkpt->isSnapshot()) { - chkpt_windows_.emplace_back(); + while (num_windows_to_evict > 0) { + // Evict the least recently used window + const auto win_id = lru_list_.back(); + + // Unless we are flushing, do not evict the window containing + // the current checkpoint or the head checkpoint + if (!force_flush) { + auto current = getCurrent_(); + auto current_win_id = getWindowID_(current->getID()); + + auto head = getHead_(); + auto head_win_id = getWindowID_(head->getID()); + + // If the current or head checkpoint is in this window, skip eviction. + // Decrement the number of windows to evict since we are skipping this one. 
+ if (current_win_id == win_id || head_win_id == win_id) { + sparta_assert(num_windows_to_evict-- > 0); + touchWindow_(win_id); + continue; + } + } + + lru_list_.pop_back(); + lru_map_.erase(win_id); + + // Send the window down the pipeline for writing to the database + auto& window = chkpts_cache_[win_id]; + pipeline_head_->emplace(std::move(window)); + + // Cleanup + chkpts_cache_.erase(win_id); + sparta_assert(num_windows_to_evict-- > 0); + } +} + +bool DatabaseCheckpointer::ensureWindowLoaded_(chkpt_id_t chkpt_id, bool must_succeed) +{ + std::lock_guard lock(cache_mutex_); + + window_id_t win_id = getWindowID_(chkpt_id); + if (chkpts_cache_.find(win_id) == chkpts_cache_.end()) { + checkpoint_ptrs window_chkpts = getWindowFromDatabase_(win_id); + if (window_chkpts.empty() && must_succeed) { + throw CheckpointError("Could not find checkpoint window with ID ") << win_id; + } + chkpts_cache_[win_id] = std::move(window_chkpts); + } + + bool success = false; + for (const auto& chkpt : chkpts_cache_[win_id]) { + if (chkpt->getID() == chkpt_id) { + success = true; + break; + } + } + + if (!success && must_succeed) { + throw CheckpointError("Could not find checkpoint with ID ") << chkpt_id; } - auto& window = chkpt_windows_.back(); - if (window.empty() || window.back() != id) { - window.push_back(id); + touchWindow_(win_id); + evictWindowsIfNeeded_(); + return success; +} + +std::vector> DatabaseCheckpointer::getWindowFromDatabase_(window_id_t win_id) +{ + std::vector> window_chkpts; + pipeline_flusher_->flush(); + + db_mgr_->safeTransaction([&]() { + auto query = db_mgr_->createQuery("ChkptWindows"); + query->addConstraintForUInt64("WindowID", simdb::Constraints::EQUAL, win_id); + + std::vector compressed_window_bytes; + query->select("WindowBytes", compressed_window_bytes); + + auto results = query->getResultSet(); + if (results.getNextRecord()) { + std::unique_ptr window_restored = deserializeWindow_(compressed_window_bytes); + sparta_assert(window_restored && 
!window_restored->chkpts.empty()); + window_chkpts = std::move(window_restored->chkpts); + } + }); + + return window_chkpts; +} + +std::unique_ptr DatabaseCheckpointer::deserializeWindow_(const std::vector& compressed_window_bytes) const +{ + std::vector window_bytes; + simdb::decompressData(compressed_window_bytes, window_bytes); + + auto window_restored = std::make_unique(); + boost::iostreams::basic_array_source device(window_bytes.data(), window_bytes.size()); + boost::iostreams::stream> is(device); + boost::archive::binary_iarchive ia(is); + ia >> *window_restored; + + return window_restored; +} + +void DatabaseCheckpointer::forEachCheckpoint_(const std::function& cb) +{ + // Flush the pipeline so that every checkpoint is either in our cache or on disk. + // There is no guarantee that the cache has newer checkpoints than the database, + // since many APIs load old windows into the cache and "mix them together" with + // whatever is already in the cache (new and old). + pipeline_flusher_->flush(); + + { + std::lock_guard lock(cache_mutex_); + + // Gather up all checkpoint IDs from our cache + for (const auto& [win_id, window] : chkpts_cache_) { + for (const auto& chkpt : window) { + cb(chkpt.get()); + } + } } + + // Query the database for any other checkpoints + db_mgr_->safeTransaction([&]() { + auto query = db_mgr_->createQuery("ChkptWindows"); + + std::vector compressed_window_bytes; + query->select("WindowBytes", compressed_window_bytes); + + auto results = query->getResultSet(); + while (results.getNextRecord()) + { + auto window = deserializeWindow_(compressed_window_bytes); + for (const auto& chkpt : window->chkpts) { + sparta_assert(!chkpt->isFlaggedDeleted(), + "Deleted checkpoints should never make it to the database"); + cb(chkpt.get()); + } + } + }); } REGISTER_SIMDB_APPLICATION(DatabaseCheckpointer); diff --git a/sparta/test/FastCheckpoint/DatabaseCheckpoint/DatabaseCheckpoint_test.cpp 
b/sparta/test/FastCheckpoint/DatabaseCheckpoint/DatabaseCheckpoint_test.cpp index 3f242c9311..20e16173a0 100644 --- a/sparta/test/FastCheckpoint/DatabaseCheckpoint/DatabaseCheckpoint_test.cpp +++ b/sparta/test/FastCheckpoint/DatabaseCheckpoint/DatabaseCheckpoint_test.cpp @@ -100,7 +100,8 @@ void generalTest() app_mgr.openPipelines(); auto& dbcp = *app_mgr.getApp(); - dbcp.setSnapshotThreshold(9); + dbcp.setSnapshotThreshold(10); + dbcp.setMaxCachedWindows(10); root.enterConfiguring(); root.enterFinalized(); @@ -120,8 +121,6 @@ void generalTest() r1->write(i * 5ul); r2->write(i % 5ul); sched.run(1, true, false); - //EXPECT_EQUAL(i, sched.getCurrentTick()); - //EXPECT_EQUAL(i, dbcp.getCurrentTick()); DatabaseCheckpointer::chkpt_id_t id; EXPECT_NOTHROW(id = dbcp.createCheckpoint()); @@ -130,38 +129,50 @@ void generalTest() return id; }; + auto find_checkpoint = [&](DatabaseCheckpointer::chkpt_id_t id, bool must_exist = false) { + std::shared_ptr cp; + EXPECT_NOTHROW(cp = dbcp.findCheckpoint(id, must_exist)); + EXPECT_NOTEQUAL(cp, nullptr); + if (cp) { + EXPECT_EQUAL(cp->getID(), id); + EXPECT_EQUAL(cp->getPrevID(), (id > 0) ? (id - 1) : DatabaseCheckpoint::UNIDENTIFIED_CHECKPOINT); + } + return cp; + }; + // Create 1000 checkpoints, and periodically access an old one. Also // go to sleep sometimes to increase the chances we have to go to the // database to retrieve a checkpoint. Keep a clone of checkpoint 3 for // later verification. 
std::shared_ptr clone3; - for (uint32_t i = 1; i <= 100; ++i) { + for (uint32_t i = 1; i <= 1000; ++i) { step_checkpointer(i); // Access most recent from the cache directly - auto cached_cp = dbcp.findCheckpoint(i).lock(); - EXPECT_NOTEQUAL(cached_cp, nullptr); - if (cached_cp) { - EXPECT_EQUAL(cached_cp->getID(), i); - EXPECT_EQUAL(cached_cp->getPrevID(), i - 1); - if (i == 3) { - clone3 = dbcp.cloneCheckpoint(3); - } + find_checkpoint(i); + + // Store checkpoint 3 for later verification + if (i == 3) { + clone3 = find_checkpoint(3); } +#if 0 // Access an old one, which may or may not be in the cache if (rand() % 10 == 0) { std::this_thread::sleep_for(std::chrono::milliseconds(rand() % 50)); auto old_id = static_cast(rand() % i); - auto old_cp = dbcp.cloneCheckpoint(old_id); - EXPECT_NOTEQUAL(old_cp, nullptr); - if (old_cp) { - EXPECT_EQUAL(old_cp->getID(), old_id); - EXPECT_EQUAL(old_cp->getPrevID(), old_id - 1); - } + find_checkpoint(old_id); } +#endif } + // Finish + app_mgr.postSimLoopTeardown(); + root.enterTeardown(); + clocks.enterTeardown(); + + return; // TODO cnyce + auto verif_load_chkpt = [&](DatabaseCheckpointer::chkpt_id_t id) { EXPECT_NOTHROW(dbcp.loadCheckpoint(id)); EXPECT_EQUAL(dbcp.getCurrentID(), id); @@ -196,22 +207,25 @@ void generalTest() // Wait until checkpoint 3 is evicted from cache with a 3-second timeout uint32_t num_tries = 0; - while (dbcp.findCheckpoint(3).lock() != nullptr) { + while (dbcp.findCheckpoint(3) != nullptr) { std::this_thread::sleep_for(std::chrono::milliseconds(10)); EXPECT_NOTEQUAL(++num_tries, 300); } -// TODO cnyce -#if 0 // Ask the checkpointer to retrieve checkpoint 3 from the database - auto dbchkpt3 = dbcp.cloneCheckpoint(3); - EXPECT_EQUAL(dbchkpt3->getID(), clone3->getID()); + std::shared_ptr dbchkpt3; + EXPECT_NOTHROW(dbchkpt3 = dbcp.findCheckpoint(3, true)); + EXPECT_NOTEQUAL(dbchkpt3, nullptr); + if (dbchkpt3) { + // Verify that the database checkpoint matches the original clone of 3 + 
EXPECT_EQUAL(dbchkpt3->getID(), clone3->getID()); + + std::ostringstream clone3_oss, dbchkpt3_oss; + clone3->dumpData(clone3_oss); + dbchkpt3->dumpData(dbchkpt3_oss); + EXPECT_EQUAL(clone3_oss.str(), dbchkpt3_oss.str()); + } - // Verify that the database checkpoint matches the original clone of 3 - std::ostringstream clone3_oss, dbchkpt3_oss; - clone3->dumpData(clone3_oss); - dbchkpt3->dumpData(dbchkpt3_oss); - EXPECT_EQUAL(clone3_oss.str(), dbchkpt3_oss.str()); // Verify history chain for a db-recreated checkpoint auto hist_chain3 = dbcp.getHistoryChain(3); @@ -234,7 +248,6 @@ void generalTest() // Nothing to test, just call dumpRestoreChain() dbcp.dumpRestoreChain(std::cout, 3); -#endif // Go back to checkpoint 1 verif_load_chkpt(1); @@ -257,8 +270,8 @@ void generalTest() // Ensure exception is thrown when loading a non-existent checkpoint EXPECT_THROW(dbcp.loadCheckpoint(9999)); - EXPECT_THROW(dbcp.cloneCheckpoint(9999)); - EXPECT_NOTHROW(dbcp.cloneCheckpoint(9999, false)); + EXPECT_THROW(dbcp.findCheckpoint(9999, true)); + EXPECT_NOTHROW(dbcp.findCheckpoint(9999, false)); // Create checkpoints 1-50. 
for (uint32_t i = 1; i <= 50; ++i) { @@ -358,46 +371,46 @@ void generalTest() auto restore_chain = dbcp.getRestoreChain(dbcp.getCurrentID()); auto id = restore_chain.top(); restore_chain.pop(); - std::weak_ptr chkpt; - EXPECT_NOTHROW(chkpt = dbcp.findCheckpoint(id)); - auto c = chkpt.lock(); + std::shared_ptr chkpt; + EXPECT_NOTHROW(chkpt = dbcp.findCheckpoint(id, true)); + auto c = chkpt; EXPECT_NOTEQUAL(c, nullptr); EXPECT_TRUE(c->isSnapshot()); while (!restore_chain.empty()) { id = restore_chain.top(); restore_chain.pop(); - EXPECT_NOTHROW(chkpt = dbcp.findCheckpoint(id)); - c = chkpt.lock(); + EXPECT_NOTHROW(chkpt = dbcp.findCheckpoint(id, true)); + c = chkpt; EXPECT_NOTEQUAL(c, nullptr); EXPECT_FALSE(c->isSnapshot()); } // Verify that checkpoint clones are as expected - auto cache73 = dbcp.findCheckpoint(73).lock(); - auto clone73 = dbcp.cloneCheckpoint(73); - - std::ostringstream cache_oss; - std::ostringstream clone_oss; - - cache73->dumpData(cache_oss); - clone73->dumpData(clone_oss); - - EXPECT_EQUAL(cache_oss.str(), clone_oss.str()); - EXPECT_EQUAL(cache73->getTotalMemoryUse(), clone73->getTotalMemoryUse()); - EXPECT_EQUAL(cache73->getContentMemoryUse(), clone73->getContentMemoryUse()); - EXPECT_TRUE(cache73->getHistoryChain() == clone73->getHistoryChain()); - EXPECT_TRUE(cache73->getRestoreChain() == clone73->getRestoreChain()); - EXPECT_EQUAL(cache73->getPrevID(), clone73->getPrevID()); - EXPECT_EQUAL(cache73->getNextIDs(), clone73->getNextIDs()); - EXPECT_EQUAL(cache73->getTick(), clone73->getTick()); - EXPECT_EQUAL(cache73->isSnapshot(), clone73->isSnapshot()); - EXPECT_EQUAL(cache73->getDistanceToPrevSnapshot(), clone73->getDistanceToPrevSnapshot()); - - // Finish - app_mgr.postSimLoopTeardown(); - root.enterTeardown(); - clocks.enterTeardown(); + auto cache73 = dbcp.findCheckpoint(73); + auto clone73 = dbcp.findCheckpoint(73); + + EXPECT_NOTEQUAL(cache73, nullptr); + EXPECT_NOTEQUAL(clone73, nullptr); + + if (cache73 && clone73) { + 
std::ostringstream cache_oss; + std::ostringstream clone_oss; + + cache73->dumpData(cache_oss); + clone73->dumpData(clone_oss); + + EXPECT_EQUAL(cache_oss.str(), clone_oss.str()); + EXPECT_EQUAL(cache73->getTotalMemoryUse(), clone73->getTotalMemoryUse()); + EXPECT_EQUAL(cache73->getContentMemoryUse(), clone73->getContentMemoryUse()); + EXPECT_TRUE(cache73->getHistoryChain() == clone73->getHistoryChain()); + EXPECT_TRUE(cache73->getRestoreChain() == clone73->getRestoreChain()); + EXPECT_EQUAL(cache73->getPrevID(), clone73->getPrevID()); + EXPECT_EQUAL(cache73->getNextIDs(), clone73->getNextIDs()); + EXPECT_EQUAL(cache73->getTick(), clone73->getTick()); + EXPECT_EQUAL(cache73->isSnapshot(), clone73->isSnapshot()); + EXPECT_EQUAL(cache73->getDistanceToPrevSnapshot(), clone73->getDistanceToPrevSnapshot()); + } // Nothing to test, just call dumpList/dumpData/dumpAnnotatedData dbcp.dumpList(std::cout); @@ -406,6 +419,11 @@ void generalTest() std::cout << std::endl; dbcp.dumpAnnotatedData(std::cout); std::cout << std::endl; + + // Finish + app_mgr.postSimLoopTeardown(); + root.enterTeardown(); + clocks.enterTeardown(); } int main() From 5398d34b6ff92bdbb414b788d238b49f44f9e16c Mon Sep 17 00:00:00 2001 From: Colby Nyce Date: Tue, 23 Sep 2025 08:13:40 -0500 Subject: [PATCH 12/30] Database-backed checkpointer --- .../checkpoint/DatabaseCheckpoint.hpp | 5 +- .../checkpoint/DatabaseCheckpointer.hpp | 13 +- sparta/src/DatabaseCheckpointer.cpp | 92 +++++- .../DatabaseCheckpoint_test.cpp | 269 +++++++++--------- 4 files changed, 221 insertions(+), 158 deletions(-) diff --git a/sparta/sparta/serialization/checkpoint/DatabaseCheckpoint.hpp b/sparta/sparta/serialization/checkpoint/DatabaseCheckpoint.hpp index 5fb2eb191e..112fd219b4 100644 --- a/sparta/sparta/serialization/checkpoint/DatabaseCheckpoint.hpp +++ b/sparta/sparta/serialization/checkpoint/DatabaseCheckpoint.hpp @@ -90,9 +90,6 @@ namespace sparta::serialization::checkpoint template void serialize(Archive& ar, const 
unsigned int version) { - sparta_assert(deleted_id_ == CheckpointBase::UNIDENTIFIED_CHECKPOINT, - "Cannot serialize a DatabaseCheckpoint that was already deleted"); - CheckpointBase::serialize(ar, version); ar & prev_id_; ar & next_ids_; @@ -289,6 +286,8 @@ namespace sparta::serialization::checkpoint } else { // We are saving a checkpoint window to disk for (auto& chkpt : chkpts) { + sparta_assert(!chkpt->isFlaggedDeleted(), + "Cannot serialize a ChkptWindow that contains a deleted checkpoint"); ar & *chkpt; } } diff --git a/sparta/sparta/serialization/checkpoint/DatabaseCheckpointer.hpp b/sparta/sparta/serialization/checkpoint/DatabaseCheckpointer.hpp index 2e97b030c5..3558af77d1 100644 --- a/sparta/sparta/serialization/checkpoint/DatabaseCheckpointer.hpp +++ b/sparta/sparta/serialization/checkpoint/DatabaseCheckpointer.hpp @@ -286,6 +286,9 @@ class DatabaseCheckpointer : public simdb::App, public Checkpointer */ void traceValue(std::ostream& o, chkpt_id_t id, const ArchData* container, uint32_t offset, uint32_t size) override; + //! \brief Check if the given checkpoint is currently cached in memory. + bool isCheckpointCached(chkpt_id_t id) const noexcept; + private: /*! @@ -443,15 +446,11 @@ class DatabaseCheckpointer : public simdb::App, public Checkpointer //! \brief Checkpoint pipeline flusher std::unique_ptr pipeline_flusher_; - /*! - * \brief Snapshot generation threshold. Every n checkpoints in a chain - * are taken as snapshots instead of deltas - */ + //! \brief Snapshot generation threshold. Every n checkpoints in a chain + //! are taken as snapshots instead of deltas. utils::ValidValue snap_thresh_; - /*! - * \brief Next checkpoint ID value - */ + //! 
\brief Next checkpoint ID value chkpt_id_t next_chkpt_id_; }; diff --git a/sparta/src/DatabaseCheckpointer.cpp b/sparta/src/DatabaseCheckpointer.cpp index f767e5bc98..f3f326df77 100644 --- a/sparta/src/DatabaseCheckpointer.cpp +++ b/sparta/src/DatabaseCheckpointer.cpp @@ -71,12 +71,16 @@ std::unique_ptr DatabaseCheckpointer::createPipeline( for (const auto& chkpt : chkpts_in) { if (chkpt->isFlaggedDeleted()) { ensure_rest_deleted = true; - } else if (ensure_rest_deleted) { + } else if (ensure_rest_deleted && !chkpt->isFlaggedDeleted()) { throw CheckpointError("Checkpoint window has non-contiguous deleted checkpoints"); } else { alive_chkpts.push_back(chkpt); } } + + if (!alive_chkpts.empty()) { + chkpts_out.emplace(std::move(alive_chkpts)); + } } ); @@ -508,6 +512,14 @@ void DatabaseCheckpointer::traceValue( sparta_assert(false, "Not implemented"); } +bool DatabaseCheckpointer::isCheckpointCached(chkpt_id_t id) const noexcept +{ + std::lock_guard lock(cache_mutex_); + const auto win_id = getWindowID_(id); + const auto it = chkpts_cache_.find(win_id); + return it != chkpts_cache_.end(); +} + void DatabaseCheckpointer::createHead_() { std::lock_guard lock(cache_mutex_); @@ -604,20 +616,70 @@ chkpt_id_t DatabaseCheckpointer::createCheckpoint_(bool force_snapshot) void DatabaseCheckpointer::deleteCheckpoint_(chkpt_id_t id) { - while (true) { - auto chkpt = findCheckpoint(id, true); - chkpt->flagDeleted_(); - - auto next_ids = chkpt->getNextIDs(); - sparta_assert(next_ids.size() <= 1, - "DatabaseCheckpointer does not support multiple checkpoint branches"); - - if (!next_ids.empty()) { - id = next_ids[0]; - } else { - break; + window_id_t start_win_id = getWindowID_(id); + window_id_t end_win_id = getWindowID_(next_chkpt_id_ - 1); + + for (window_id_t win_id = start_win_id; win_id <= end_win_id; ++win_id) { + auto it = chkpts_cache_.find(win_id); + if (it != chkpts_cache_.end()) { + if (win_id == start_win_id) { + // Only delete checkpoints in this window >= id + auto& 
window = it->second; + auto new_end = std::remove_if(window.begin(), window.end(), + [id](const std::shared_ptr& chkpt) { + return chkpt->getID() >= id; + }); + + window.erase(new_end, window.end()); + if (window.empty()) { + chkpts_cache_.erase(it); + } + } else { + // Delete the entire window + chkpts_cache_.erase(it); + } } } + + // Now delete from the database + pipeline_flusher_->flush(); + + db_mgr_->safeTransaction( + [&]() + { + // DELETE FROM ChkptWindows WHERE WindowID > start_win_id + auto query = db_mgr_->createQuery("ChkptWindows"); + query->addConstraintForUInt64("WindowID", simdb::Constraints::GREATER, start_win_id); + query->deleteResultSet(); + + // Now update the window containing start_win_id to remove checkpoints >= id + query->resetConstraints(); + query->addConstraintForUInt64("WindowID", simdb::Constraints::EQUAL, start_win_id); + + std::vector compressed_window_bytes; + query->select("WindowBytes", compressed_window_bytes); + + auto results = query->getResultSet(); + if (results.getNextRecord()) { + // DELETE FROM ChkptWindows WHERE WindowID = start_win_id + query->deleteResultSet(); + + // Deserialize the window + auto window = deserializeWindow_(compressed_window_bytes); + + // Remove checkpoints >= id + auto new_end = std::remove_if(window->chkpts.begin(), window->chkpts.end(), + [id](const std::shared_ptr& chkpt) { + return chkpt->getID() >= id; + }); + window->chkpts.erase(new_end, window->chkpts.end()); + + // Send down the pipeline + if (!window->chkpts.empty()) { + pipeline_head_->emplace(std::move(window->chkpts)); + } + } + }); } void DatabaseCheckpointer::dumpCheckpointNode_(const chkpt_id_t id, std::ostream& o) @@ -805,6 +867,10 @@ std::unique_ptr DatabaseCheckpointer::deserializeWindow_(const std: boost::archive::binary_iarchive ia(is); ia >> *window_restored; + for (auto& chkpt : window_restored->chkpts) { + chkpt->checkpointer_ = const_cast(this); + } + return window_restored; } diff --git 
a/sparta/test/FastCheckpoint/DatabaseCheckpoint/DatabaseCheckpoint_test.cpp b/sparta/test/FastCheckpoint/DatabaseCheckpoint/DatabaseCheckpoint_test.cpp index 20e16173a0..d001fbc723 100644 --- a/sparta/test/FastCheckpoint/DatabaseCheckpoint/DatabaseCheckpoint_test.cpp +++ b/sparta/test/FastCheckpoint/DatabaseCheckpoint/DatabaseCheckpoint_test.cpp @@ -62,13 +62,22 @@ class DummyDevice : public sparta::TreeNode { public: DummyDevice(sparta::TreeNode* parent) : - sparta::TreeNode(parent, "dummy", "", sparta::TreeNode::GROUP_IDX_NONE, "dummy node for register test") + sparta::TreeNode(parent, "dummy", "", sparta::TreeNode::GROUP_IDX_NONE, "dummy node for checkpoint test") {} }; -//! General test for saving and loading checkpoints to/from SimDB -void generalTest() +int main() { + auto warn_cerr = std::make_unique( + sparta::TreeNode::getVirtualGlobalNode(), + sparta::log::categories::WARN, + std::cerr); + + auto warn_file = std::make_unique( + sparta::TreeNode::getVirtualGlobalNode(), + sparta::log::categories::WARN, + "warnings.log"); + sparta::Scheduler sched; RootTreeNode clocks("clocks"); sparta::Clock clk(&clocks, "clock", &sched); @@ -117,22 +126,21 @@ void generalTest() EXPECT_EQUAL(dbcp.getCurrentID(), head_id); EXPECT_EQUAL(dbcp.getCurrentTick(), 0); - auto step_checkpointer = [&](uint32_t i) { - r1->write(i * 5ul); - r2->write(i % 5ul); + auto step_checkpointer = [&](DatabaseCheckpointer::chkpt_id_t expected_id) { + r1->write(expected_id * 5ul); + r2->write(expected_id % 5ul); sched.run(1, true, false); - DatabaseCheckpointer::chkpt_id_t id; - EXPECT_NOTHROW(id = dbcp.createCheckpoint()); - EXPECT_EQUAL(id, i); - EXPECT_EQUAL(id, dbcp.getCurrentID()); - return id; + DatabaseCheckpointer::chkpt_id_t actual_id; + EXPECT_NOTHROW(actual_id = dbcp.createCheckpoint()); + EXPECT_EQUAL(actual_id, expected_id); + EXPECT_EQUAL(actual_id, dbcp.getCurrentID()); + return actual_id; }; - auto find_checkpoint = [&](DatabaseCheckpointer::chkpt_id_t id, bool must_exist = false) 
{ + auto verif_find_checkpoint = [&](DatabaseCheckpointer::chkpt_id_t id, bool must_exist = true) { std::shared_ptr cp; EXPECT_NOTHROW(cp = dbcp.findCheckpoint(id, must_exist)); - EXPECT_NOTEQUAL(cp, nullptr); if (cp) { EXPECT_EQUAL(cp->getID(), id); EXPECT_EQUAL(cp->getPrevID(), (id > 0) ? (id - 1) : DatabaseCheckpoint::UNIDENTIFIED_CHECKPOINT); @@ -140,39 +148,34 @@ void generalTest() return cp; }; + auto wait_until_evicted = [&](DatabaseCheckpointer::chkpt_id_t id) { + size_t num_tries = 0; + while (dbcp.isCheckpointCached(id) && num_tries < 3) { + std::this_thread::sleep_for(std::chrono::seconds(1)); + ++num_tries; + } + EXPECT_FALSE(num_tries == 3); + EXPECT_FALSE(dbcp.isCheckpointCached(id)); + }; + // Create 1000 checkpoints, and periodically access an old one. Also // go to sleep sometimes to increase the chances we have to go to the - // database to retrieve a checkpoint. Keep a clone of checkpoint 3 for - // later verification. - std::shared_ptr clone3; + // database to retrieve a checkpoint. 
for (uint32_t i = 1; i <= 1000; ++i) { + // Step the scheduler and take a checkpoint step_checkpointer(i); // Access most recent from the cache directly - find_checkpoint(i); + verif_find_checkpoint(i); - // Store checkpoint 3 for later verification - if (i == 3) { - clone3 = find_checkpoint(3); + // Force some of the checkpoints to be retrieved from the database + if (i % 100 == 0 && i > 250) { + auto old_id = i - 100; + wait_until_evicted(old_id); + verif_find_checkpoint(old_id); } - -#if 0 - // Access an old one, which may or may not be in the cache - if (rand() % 10 == 0) { - std::this_thread::sleep_for(std::chrono::milliseconds(rand() % 50)); - auto old_id = static_cast(rand() % i); - find_checkpoint(old_id); - } -#endif } - // Finish - app_mgr.postSimLoopTeardown(); - root.enterTeardown(); - clocks.enterTeardown(); - - return; // TODO cnyce - auto verif_load_chkpt = [&](DatabaseCheckpointer::chkpt_id_t id) { EXPECT_NOTHROW(dbcp.loadCheckpoint(id)); EXPECT_EQUAL(dbcp.getCurrentID(), id); @@ -186,68 +189,107 @@ void generalTest() EXPECT_EQUAL(r2_val, id % 5ul); }; - // Load very recent checkpoints that are definitely in the cache - verif_load_chkpt(100); - verif_load_chkpt(99); - verif_load_chkpt(95); - verif_load_chkpt(90); - verif_load_chkpt(89); + // Nothing to test, just call dumpList/dumpData/dumpAnnotatedData. + // Do this while we have a lot of checkpoints in the cache and + // the database for max code coverage. 
+ dbcp.dumpList(std::cout); + std::cout << std::endl; + dbcp.dumpData(std::cout); + std::cout << std::endl; + dbcp.dumpAnnotatedData(std::cout); + std::cout << std::endl; - // Load checkpoints that have probably already been evicted from the cache - std::this_thread::sleep_for(std::chrono::seconds(1)); - verif_load_chkpt(49); - verif_load_chkpt(45); - verif_load_chkpt(40); - verif_load_chkpt(39); - - step_checkpointer(40); - step_checkpointer(41); - step_checkpointer(42); - verif_load_chkpt(40); - - // Wait until checkpoint 3 is evicted from cache with a 3-second timeout - uint32_t num_tries = 0; - while (dbcp.findCheckpoint(3) != nullptr) { - std::this_thread::sleep_for(std::chrono::milliseconds(10)); - EXPECT_NOTEQUAL(++num_tries, 300); - } + // Verify that cached / DB-recreated checkpoints are identical: + // 1. Get the current checkpoint from the cache + auto cached_cp1000 = dbcp.findCheckpoint(dbcp.getCurrentID()); + EXPECT_TRUE(dbcp.isCheckpointCached(cached_cp1000->getID())); - // Ask the checkpointer to retrieve checkpoint 3 from the database - std::shared_ptr dbchkpt3; - EXPECT_NOTHROW(dbchkpt3 = dbcp.findCheckpoint(3, true)); - EXPECT_NOTEQUAL(dbchkpt3, nullptr); - if (dbchkpt3) { - // Verify that the database checkpoint matches the original clone of 3 - EXPECT_EQUAL(dbchkpt3->getID(), clone3->getID()); - - std::ostringstream clone3_oss, dbchkpt3_oss; - clone3->dumpData(clone3_oss); - dbchkpt3->dumpData(dbchkpt3_oss); - EXPECT_EQUAL(clone3_oss.str(), dbchkpt3_oss.str()); + // 2. Write a lot more checkpoints to force the oldest ones out of the cache + for (uint32_t i = 1001; i <= 1500; ++i) { + step_checkpointer(i); + } + wait_until_evicted(cached_cp1000->getID()); + + // 3. 
Recreate the same checkpoint from the database + EXPECT_FALSE(dbcp.isCheckpointCached(cached_cp1000->getID())); + auto recreated_cp1000 = dbcp.findCheckpoint(cached_cp1000->getID()); + + EXPECT_NOTEQUAL(cached_cp1000, nullptr); + EXPECT_NOTEQUAL(recreated_cp1000, nullptr); + + if (cached_cp1000 && recreated_cp1000) { + std::ostringstream oss1; + std::ostringstream oss2; + + cached_cp1000->dumpData(oss1); + recreated_cp1000->dumpData(oss2); + + EXPECT_EQUAL(oss1.str(), oss2.str()); + EXPECT_EQUAL(cached_cp1000->getTotalMemoryUse(), recreated_cp1000->getTotalMemoryUse()); + EXPECT_EQUAL(cached_cp1000->getContentMemoryUse(), recreated_cp1000->getContentMemoryUse()); + EXPECT_TRUE(cached_cp1000->getHistoryChain() == recreated_cp1000->getHistoryChain()); + EXPECT_TRUE(cached_cp1000->getRestoreChain() == recreated_cp1000->getRestoreChain()); + EXPECT_EQUAL(cached_cp1000->getPrevID(), recreated_cp1000->getPrevID()); + EXPECT_EQUAL(cached_cp1000->getNextIDs(), recreated_cp1000->getNextIDs()); + EXPECT_EQUAL(cached_cp1000->getTick(), recreated_cp1000->getTick()); + EXPECT_EQUAL(cached_cp1000->isSnapshot(), recreated_cp1000->isSnapshot()); + EXPECT_EQUAL(cached_cp1000->getDistanceToPrevSnapshot(), recreated_cp1000->getDistanceToPrevSnapshot()); } - - // Verify history chain for a db-recreated checkpoint - auto hist_chain3 = dbcp.getHistoryChain(3); - for (auto hist_id : {3,2,1,0}) { - EXPECT_FALSE(hist_chain3.empty()); - EXPECT_EQUAL(hist_chain3.top(), hist_id); - hist_chain3.pop(); + // Load very recent checkpoints that are definitely in the cache + for (size_t i = 1500; i > 1475; --i) { + EXPECT_TRUE(dbcp.isCheckpointCached(i)); + verif_load_chkpt(i); } - // Verify restore chain for a db-recreated checkpoint - auto rest_chain3 = dbcp.getRestoreChain(3); - for (auto rest_id : {3,2,1,0}) { - EXPECT_FALSE(rest_chain3.empty()); - EXPECT_EQUAL(rest_chain3.top(), rest_id); - rest_chain3.pop(); + // Load checkpoints that have already been evicted from the cache + for (size_t i = 
250; i > 225; --i) { + wait_until_evicted(i); + } + for (size_t i = 250; i > 225; --i) { + verif_load_chkpt(i); } - // Verify distance to previous snapshot for a db-recreated checkpoint - EXPECT_EQUAL(dbchkpt3->getDistanceToPrevSnapshot(), 3); + // Verify history chain + auto hist_chain13 = dbcp.getHistoryChain(13); + for (auto hist_id : {0,1,2,3,4,5,6,7,8,9,10,11,12,13}) { + EXPECT_FALSE(hist_chain13.empty()); + EXPECT_EQUAL(hist_chain13.top(), hist_id); + hist_chain13.pop(); + } + EXPECT_TRUE(hist_chain13.empty()); + + // Verify restore chain + auto rest_chain13 = dbcp.getRestoreChain(13); + for (auto rest_id : {11,12,13}) { + EXPECT_FALSE(rest_chain13.empty()); + EXPECT_EQUAL(rest_chain13.top(), rest_id); + rest_chain13.pop(); + } + EXPECT_TRUE(rest_chain13.empty()); + + // Verify distance to previous snapshot: + // + // Checkpoint ID Snapshot? + // 0 Yes (head) + // 1-10 No + // 11 Yes + // 12-21 No + // 22 Yes + // 23-32 No + // 33 Yes + std::shared_ptr cp; + EXPECT_NOTHROW(cp = dbcp.findCheckpoint(33, true)); + EXPECT_EQUAL(cp->getDistanceToPrevSnapshot(), 0); + EXPECT_NOTHROW(cp = dbcp.findCheckpoint(32, true)); + EXPECT_EQUAL(cp->getDistanceToPrevSnapshot(), 10); + EXPECT_NOTHROW(cp = dbcp.findCheckpoint(22, true)); + EXPECT_EQUAL(cp->getDistanceToPrevSnapshot(), 0); + EXPECT_NOTHROW(cp = dbcp.findCheckpoint(5, true)); + EXPECT_EQUAL(cp->getDistanceToPrevSnapshot(), 5); // Nothing to test, just call dumpRestoreChain() - dbcp.dumpRestoreChain(std::cout, 3); + dbcp.dumpRestoreChain(std::cout, 32); // Go back to checkpoint 1 verif_load_chkpt(1); @@ -386,59 +428,16 @@ void generalTest() EXPECT_FALSE(c->isSnapshot()); } - // Verify that checkpoint clones are as expected - auto cache73 = dbcp.findCheckpoint(73); - auto clone73 = dbcp.findCheckpoint(73); - - EXPECT_NOTEQUAL(cache73, nullptr); - EXPECT_NOTEQUAL(clone73, nullptr); - - if (cache73 && clone73) { - std::ostringstream cache_oss; - std::ostringstream clone_oss; - - cache73->dumpData(cache_oss); - 
clone73->dumpData(clone_oss); - - EXPECT_EQUAL(cache_oss.str(), clone_oss.str()); - EXPECT_EQUAL(cache73->getTotalMemoryUse(), clone73->getTotalMemoryUse()); - EXPECT_EQUAL(cache73->getContentMemoryUse(), clone73->getContentMemoryUse()); - EXPECT_TRUE(cache73->getHistoryChain() == clone73->getHistoryChain()); - EXPECT_TRUE(cache73->getRestoreChain() == clone73->getRestoreChain()); - EXPECT_EQUAL(cache73->getPrevID(), clone73->getPrevID()); - EXPECT_EQUAL(cache73->getNextIDs(), clone73->getNextIDs()); - EXPECT_EQUAL(cache73->getTick(), clone73->getTick()); - EXPECT_EQUAL(cache73->isSnapshot(), clone73->isSnapshot()); - EXPECT_EQUAL(cache73->getDistanceToPrevSnapshot(), clone73->getDistanceToPrevSnapshot()); - } - - // Nothing to test, just call dumpList/dumpData/dumpAnnotatedData - dbcp.dumpList(std::cout); - std::cout << std::endl; - dbcp.dumpData(std::cout); - std::cout << std::endl; - dbcp.dumpAnnotatedData(std::cout); - std::cout << std::endl; + // Verify that the head checkpoint is in the cache until simulation teardown. 
+ EXPECT_TRUE(dbcp.isCheckpointCached(head_id)); // Finish app_mgr.postSimLoopTeardown(); root.enterTeardown(); clocks.enterTeardown(); -} - -int main() -{ - std::unique_ptr warn_cerr(new sparta::log::Tap( - sparta::TreeNode::getVirtualGlobalNode(), - sparta::log::categories::WARN, - std::cerr)); - - std::unique_ptr warn_file(new sparta::log::Tap( - sparta::TreeNode::getVirtualGlobalNode(), - sparta::log::categories::WARN, - "warnings.log")); - generalTest(); + // Ensure that the head checkpoint is no longer in the cache + EXPECT_FALSE(dbcp.isCheckpointCached(head_id)); REPORT_ERROR; return ERROR_CODE; From 4203aaefced711a3491e6cbc573d74646ee34222 Mon Sep 17 00:00:00 2001 From: Colby Nyce Date: Tue, 23 Sep 2025 12:43:47 -0500 Subject: [PATCH 13/30] Update all doxygen and add more testing --- .../serialization/checkpoint/Checkpoint.hpp | 3 +- .../checkpoint/CheckpointBase.hpp | 3 +- .../serialization/checkpoint/Checkpointer.hpp | 12 +- .../checkpoint/DatabaseCheckpoint.hpp | 159 ++++++-------- .../checkpoint/DatabaseCheckpointer.hpp | 204 +++++++++--------- .../checkpoint/DeltaCheckpoint.hpp | 2 +- .../checkpoint/FastCheckpointer.hpp | 32 +-- sparta/src/DatabaseCheckpoint.cpp | 45 +--- sparta/src/DatabaseCheckpointer.cpp | 138 +++++------- .../DatabaseCheckpoint_test.cpp | 103 +++++++-- 10 files changed, 333 insertions(+), 368 deletions(-) diff --git a/sparta/sparta/serialization/checkpoint/Checkpoint.hpp b/sparta/sparta/serialization/checkpoint/Checkpoint.hpp index 86e3a1b86c..d4a771d39f 100644 --- a/sparta/sparta/serialization/checkpoint/Checkpoint.hpp +++ b/sparta/sparta/serialization/checkpoint/Checkpoint.hpp @@ -92,8 +92,7 @@ namespace sparta::serialization::checkpoint /*! * \brief Get the ID of our previous checkpoint. Returns UNIDENTIFIED_CHECKPOINT - * if we have no previous checkpoint, as is the case with the head checkpoint - * and snapshots. + * only for the head checkpoint. */ chkpt_id_t getPrevID() const override { return prev_ ? 
prev_->getID() : UNIDENTIFIED_CHECKPOINT; diff --git a/sparta/sparta/serialization/checkpoint/CheckpointBase.hpp b/sparta/sparta/serialization/checkpoint/CheckpointBase.hpp index 9a44e75920..e83065bcf7 100644 --- a/sparta/sparta/serialization/checkpoint/CheckpointBase.hpp +++ b/sparta/sparta/serialization/checkpoint/CheckpointBase.hpp @@ -149,8 +149,7 @@ namespace sparta::serialization::checkpoint /*! * \brief Get the ID of our previous checkpoint. Returns UNIDENTIFIED_CHECKPOINT - * if we have no previous checkpoint, as is the case with the head checkpoint - * and snapshots. + * only for the head checkpoint. */ virtual chkpt_id_t getPrevID() const = 0; diff --git a/sparta/sparta/serialization/checkpoint/Checkpointer.hpp b/sparta/sparta/serialization/checkpoint/Checkpointer.hpp index 4a53d0a3e9..a15e7b964d 100644 --- a/sparta/sparta/serialization/checkpoint/Checkpointer.hpp +++ b/sparta/sparta/serialization/checkpoint/Checkpointer.hpp @@ -440,6 +440,11 @@ namespace sparta::serialization::checkpoint return 0; } + /*! + * \brief Returns IDs of the checkpoints immediately following the given checkpoint. + */ + virtual std::vector getNextIDs(chkpt_id_t id) = 0; + //////////////////////////////////////////////////////////////////////// //! @} @@ -542,7 +547,7 @@ namespace sparta::serialization::checkpoint } } - auto nexts = getNextIDs_(chkpt); + auto nexts = getNextIDs(chkpt); std::stringstream ss; // Draw separator between prev checkpoint and this @@ -675,11 +680,6 @@ namespace sparta::serialization::checkpoint current_ = current; } - /*! - * \brief Returns IDs of the checkpoints immediately following the given checkpoint. - */ - virtual std::vector getNextIDs_(chkpt_id_t id) = 0; - /*! * \brief Scheduler whose tick count will be set and read. Cannnot be * updated after first checkpoint without bad side effects. 
Keeping this diff --git a/sparta/sparta/serialization/checkpoint/DatabaseCheckpoint.hpp b/sparta/sparta/serialization/checkpoint/DatabaseCheckpoint.hpp index 112fd219b4..113156a49f 100644 --- a/sparta/sparta/serialization/checkpoint/DatabaseCheckpoint.hpp +++ b/sparta/sparta/serialization/checkpoint/DatabaseCheckpoint.hpp @@ -11,26 +11,36 @@ namespace sparta::serialization::checkpoint class DatabaseCheckpointer; class DatabaseCheckpoint; - struct ChkptWindowBytes { + /*! + * \brief A window of checkpoints to be sent to/from the database as a unit. + * \note A "window" is defined as a group of (snap_thresh_ + 1) checkpoints, + * where the first checkpoint in the window is a snapshot and the remaining + * checkpoints in the window are deltas. Checkpoints are processed this way + * to enable various performance optimizations. + */ + struct ChkptWindow { using chkpt_id_t = CheckpointBase::chkpt_id_t; - std::vector chkpt_bytes; + std::vector> chkpts; chkpt_id_t start_chkpt_id = CheckpointBase::UNIDENTIFIED_CHECKPOINT; chkpt_id_t end_chkpt_id = CheckpointBase::UNIDENTIFIED_CHECKPOINT; uint64_t start_tick = 0; uint64_t end_tick = 0; + + //! \brief Support boost::serialization + template + void serialize(Archive& ar, const unsigned int /*version*/); }; - struct ChkptWindow { + /*! + * \brief Compressed version of ChkptWindow to be stored in the database. + */ + struct ChkptWindowBytes { using chkpt_id_t = CheckpointBase::chkpt_id_t; - std::vector> chkpts; + std::vector chkpt_bytes; chkpt_id_t start_chkpt_id = CheckpointBase::UNIDENTIFIED_CHECKPOINT; chkpt_id_t end_chkpt_id = CheckpointBase::UNIDENTIFIED_CHECKPOINT; uint64_t start_tick = 0; uint64_t end_tick = 0; - - //! \brief Support boost::serialization - template - void serialize(Archive& ar, const unsigned int /*version*/); }; /*! 
@@ -76,7 +86,6 @@ namespace sparta::serialization::checkpoint tick_t tick, chkpt_id_t prev_id, const std::vector& next_ids, - chkpt_id_t deleted_id, bool is_snapshot, const storage::VectorStorage& storage, DatabaseCheckpointer* checkpointer); @@ -88,6 +97,9 @@ namespace sparta::serialization::checkpoint public: + /* + * \brief Support boost::serialization + */ template void serialize(Archive& ar, const unsigned int version) { CheckpointBase::serialize(ar, version); @@ -98,33 +110,35 @@ namespace sparta::serialization::checkpoint } /*! - * \brief Returns a string describing this object + * \brief Returns a string describing this object. */ std::string stringize() const override; /*! - * \brief Writes all checkpoint raw data to an ostream - * \param o ostream to which raw data will be written - * \note No newlines or other extra characters will be appended + * \brief Writes all checkpoint raw data to an ostream. + * + * \param o ostream to which raw data will be written. + * + * \note No newlines or other extra characters will be appended. */ void dumpData(std::ostream& o) const override; /*! - * \brief Returns memory usage by this checkpoint + * \brief Returns memory usage by this checkpoint. */ uint64_t getTotalMemoryUse() const noexcept override; /*! - * \brief Returns memory usage by the content of this checkpoint + * \brief Returns memory usage by the content of this checkpoint. */ uint64_t getContentMemoryUse() const noexcept override; /*! * \brief Returns a stack of checkpoints from this checkpoint as far - * back as possible until no previous link is found. This is a superset - * of getRestoreChain and contains checkpoints that do not actually need - * to be inspected for restoring this checkpoint's data. This may reach - * the head checkpoint if no gaps are encountered. + * back as possible until no previous link is found. + * + * \note Since this checkpointer enforces a linear chain of checkpoints + * with no gaps, this always reaches the head checkpoint. 
*/ std::stack getHistoryChain() const; @@ -137,14 +151,16 @@ namespace sparta::serialization::checkpoint /*! * \brief Get the ID of our previous checkpoint. Returns UNIDENTIFIED_CHECKPOINT - * if we have no previous checkpoint, as is the case with the head checkpoint - * and snapshots. + * only for the head checkpoint. */ chkpt_id_t getPrevID() const override; /*! - * \brief Returns next checkpoint following *this. May be an empty + * \brief Returns next checkpoints following this one. May be an empty * vector if there are no later checkpoints. + * + * \note Since this checkpointer enforces a linear chain of checkpoints + * with no gaps, this vector will always have 0 or 1 elements. */ std::vector getNextIDs() const override; @@ -152,123 +168,82 @@ namespace sparta::serialization::checkpoint * \brief Attempts to restore this checkpoint including any previous * deltas (dependencies). * - * Uses loadState to restore state from each checkpoint in the + * \note Uses loadState to restore state from each checkpoint in the * restore chain. */ void load(const std::vector& dats) override; /*! - * \brief Indicates whether this checkpoint has been flagged deleted. - * \note Does not imply that the checkpoint can safely be deleted; - * only that it was flagged for deletion. - * \note If false, Checkpoint ID will also be UNIDENTIFIED_CHECKPOINT - * \see flagDeleted() - */ - bool isFlaggedDeleted() const noexcept; - - /*! - * \brief Return the ID had by this checkpoint before it was deleted - * If this checkpoint has not been flagged for deletion, this will be - * UNIDENTIFIED_CHECKPOINT - */ - chkpt_id_t getDeletedID() const noexcept; - - /*! - * \brief Gets the representation of this deleted checkpoint as part of - * a checkpoint chain (if that checkpointer supports deletion) - * \return "D-" concatenate with ID copied when being deleted. Returns - * the ID if not yet deleted - */ - std::string getDeletedRepr() const override; - - /*! 
- * \brief Is this checkpoint a snapshot (contains ALL simulator state) + * \brief Is this checkpoint a snapshot? If true, this checkpoint has + * no dependencies and contains all simulator state. */ bool isSnapshot() const noexcept; /*! * \brief Determines how many checkpoints away the closest, earlier * snapshot is. + * * \return distance to closest snapshot. If this node is a snapshot, * returns 0; if immediate getPrev() is a snapshot, returns 1; and * so on. - * - * \note This is a noexcept function, which means that the exception if - * no snapshot is encountered is uncatchable. This is intentional. */ uint32_t getDistanceToPrevSnapshot() const noexcept; /*! * \brief Loads delta state of this checkpoint to root. - * Does not look at any other checkpoints checkpoints. - * \see load + * + * \note Does not look at any other checkpoints. + * + * \see DatabaseCheckpointer::load */ void loadState(const std::vector& dats); - /*! - * \brief Create a deep copy of this checkpoint. - */ - std::unique_ptr clone() const; - private: /*! * \brief Writes checkpoint data starting from current root to - * checkpoint storage - * \pre Must not have already stored data for this checkpoint - * This should only be called at construction + * checkpoint storage. + * + * \pre Must not have already stored data for this checkpoint. + * + * \note This should only be called at construction */ void storeSnapshot_(const std::vector& dats); /*! * \brief Writes checkpoint data starting from current root to - * checkpoint storage - * \pre Must not have already stored data for this checkpoint - * This should only be called at construction + * checkpoint storage. + * + * \pre Must not have already stored data for this checkpoint. + * + * \note This should only be called at construction */ void storeDelta_(const std::vector& dats); - /*! - * \brief Allows this checkpoint to be deleted if it is no longer a - * previous delta of some other delta (i.e. getNexts() returns an - * empty vector). 
Sets the checkpoint ID to invalid. Calling multiple - * times has no effect - * \pre Must not already be flagged deleted - * \post isFlaggedDeleted() will return true - * \post getDeletedID() will return the current ID (if any) - * \see canDelete - * \see isFlaggedDeleted - */ - void flagDeleted_(); - - /*! - * \brief ID of the previous checkpoint. - */ + //! \brief ID of the previous checkpoint. chkpt_id_t prev_id_; /*! - * \brief IDs of the next checkpoints. + * \brief IDs of the next checkpoints. Since this checkpointer + * enforces a linear chain of checkpoints with no gaps, this vector + * will always have 0 or 1 elements. */ std::vector next_ids_; - /*! - * \brief ID of the checkpoint before it was deleted. This is invalid - * until deletion. Prevents misuse of checkpoint ID or any confusion - * about whether it is deleted or not. - */ - chkpt_id_t deleted_id_; - //! \brief Is this node a snapshot? bool is_snapshot_; - //! \brief Storage implementation + //! \brief Storage implementation. storage::VectorStorage data_; - //! \brief Checkpointer who created us + //! \brief Checkpointer who created us. DatabaseCheckpointer* checkpointer_ = nullptr; }; - //! Defined down here for "new DatabaseCheckpoint" + /*! + * \brief Support boost::serialization for ChkptWindow. + * \note Defined down here for "new DatabaseCheckpoint". 
+ */ template inline void ChkptWindow::serialize(Archive& ar, const unsigned int /*version*/) { ar & start_chkpt_id; @@ -286,8 +261,6 @@ namespace sparta::serialization::checkpoint } else { // We are saving a checkpoint window to disk for (auto& chkpt : chkpts) { - sparta_assert(!chkpt->isFlaggedDeleted(), - "Cannot serialize a ChkptWindow that contains a deleted checkpoint"); ar & *chkpt; } } diff --git a/sparta/sparta/serialization/checkpoint/DatabaseCheckpointer.hpp b/sparta/sparta/serialization/checkpoint/DatabaseCheckpointer.hpp index 3558af77d1..9b65234dfb 100644 --- a/sparta/sparta/serialization/checkpoint/DatabaseCheckpointer.hpp +++ b/sparta/sparta/serialization/checkpoint/DatabaseCheckpointer.hpp @@ -8,11 +8,6 @@ #include "simdb/pipeline/Pipeline.hpp" #include -//! Default threshold for creating snapshots -#ifndef DEFAULT_SNAPSHOT_THRESH -#define DEFAULT_SNAPSHOT_THRESH 20 -#endif - namespace sparta::serialization::checkpoint { @@ -20,8 +15,9 @@ class DatabaseCheckpointer; /*! * \brief Implementation of the FastCheckpointer which only holds - * a "window" of checkpoints in memory at any given time, and sends - * checkpoints outside this window to/from SimDB. + * a subset of checkpoints in memory at any given time, and sends + * checkpoints outside this window to/from SimDB as needed using + * an LRU cache. */ class DatabaseCheckpointer : public simdb::App, public Checkpointer { @@ -34,7 +30,7 @@ class DatabaseCheckpointer : public simdb::App, public Checkpointer using window_id_t = uint64_t; /*! - * \brief FastCheckpointer Constructor + * \brief DatabaseCheckpointer constructor * * \param db_mgr SimDB instance to use as a backing store for all checkpoints. * @@ -42,9 +38,9 @@ class DatabaseCheckpointer : public simdb::App, public Checkpointer * This cannot be changed later. This does not * necessarily need to be a RootTreeNode. Before * the first checkpoint is taken, this node must - * be finalized (see - * sparta::TreeNode::isFinalized). 
At this point, - * the node does not need to be finalized + * be finalized (see sparta::TreeNode::isFinalized). + * At the point of construction, the node does not + * need to be finalized. * * \param sched Scheduler to read and restart on checkpoint restore (if * not nullptr) @@ -63,7 +59,7 @@ class DatabaseCheckpointer : public simdb::App, public Checkpointer simdb::pipeline::AsyncDatabaseAccessor* db_accessor) override; /*! - * \brief Flush all windows down the pipeline before threads are shut down. + * \brief Flush all cached windows down the pipeline before threads are shut down. */ void preTeardown() override; @@ -74,40 +70,56 @@ class DatabaseCheckpointer : public simdb::App, public Checkpointer * checkpointer to automatically place a snapshot checkpoint instead of * a delta. A threshold of 0 or 1 results in all checkpoints being * snapshots. A value of 10 results in every 10th checkpoint being a - * snapshot. Explicit snapshot creation using createCheckpoint can interrupt - * and restart this pattern. + * snapshot. + * + * \note Unlike FastCheckpointer, this threshold is always enforced and + * snapshots cannot be created using createCheckpoint(force_snapshot=true). * - * This value is a performance/space tradeoff knob. + * \note This value is a performance/space tradeoff knob. */ uint32_t getSnapshotThreshold() const; /*! - * \brief Sets the snapshot threshold - * \see getSnapshotThreshold + * \brief Sets the snapshot threshold. + * + * \note This must be called before any checkpoints are taken, and + * cannot be changed later. */ void setSnapshotThreshold(uint32_t thresh); /*! - * \brief Sets the max number of cached windows (LRU) + * \brief Sets the max number of cached windows (LRU). + * + * \note This must be called before any checkpoints are taken, and + * cannot be changed later. */ void setMaxCachedWindows(uint32_t max_windows); /*! 
* \brief Computes and returns the memory usage by this checkpointer at - * this moment including any framework overhead + * this moment including any framework overhead. + * * \note This is an approxiation and does not include some of * minimal dynamic overhead from stl containers. + * + * \note This only includes memory used by checkpoints currently + * in the cache, not checkpoints stored in the database. */ uint64_t getTotalMemoryUse() const noexcept override; /*! * \brief Computes and returns the memory usage by this checkpointer at - * this moment purely for the checkpoint state being held + * this moment purely for the checkpoint state being held. + * + * \note This only includes memory used by checkpoints currently + * in the cache, not checkpoints stored in the database. */ uint64_t getContentMemoryUse() const noexcept override; /*! * \brief Explicit checkpoint deletion is NOT supported by this checkpointer. + * + * \throw CheckpointError if called */ void deleteCheckpoint(chkpt_id_t) override final; @@ -115,83 +127,77 @@ class DatabaseCheckpointer : public simdb::App, public Checkpointer * \brief Loads state from a specific checkpoint by ID * \note This implicitly deletes all future checkpoints since this checkpointer * does not allow more than one branch. - * \throw CheckpointError if id does not refer to checkpoint that exists - * or if checkpoint could not be load. + * \throw CheckpointError if id does not refer to a checkpoint that exists + * or if checkpoint could not be loaded. 
* \warning If checkpoint fails during loading for reasons other than an * invalid ID, the simulation state could be corrupt - * \post current checkpoint is now the checkpoint specified by id - * \post Sets scheduler current tick to the checkpoint's tick using - * Scheduler::restartAt + * \post Current checkpoint is now the checkpoint specified by id + * \post Next checkpoint created will have ID = +1 since gaps are not allowed + * \post Sets scheduler current tick to the checkpoint's tick using Scheduler::restartAt */ void loadCheckpoint(chkpt_id_t id) override; /*! - * \brief Gets all checkpoints taken at tick t on any timeline. + * \brief Gets all checkpoints taken at tick t. * \param t Tick number at which checkpoints should found. * \return vector of valid checkpoint IDs (never * checkpoint_type::UNIDENTIFIED_CHECKPOINT) - * \note Makes a new vector of results. This should not be called in the - * critical path. + * \note Walks all checkpoints in cache and on disk. This should + * not be called in the critical path. */ std::vector getCheckpointsAt(tick_t t) override; /*! - * \brief Gets all checkpoint IDs available on any timeline sorted by - * tick (or equivalently checkpoint ID). - * \return vector of valid checkpoint IDs (never - * checkpoint_type::UNIDENTIFIED_CHECKPOINT) - * \note Makes a new vector of results. This should not be called in the - * critical path. + * \brief Gets all checkpoint IDs sorted by tick (or equivalently checkpoint ID). + * \return vector of valid checkpoint IDs (never checkpoint_type::UNIDENTIFIED_CHECKPOINT) + * \note Walks all checkpoints in cache and on disk. This should + * not be called in the critical path. */ std::vector getCheckpoints() override; /*! - * \brief Gets the current number of checkpoints having valid IDs + * \brief Gets the current number of checkpoints with valid IDs. */ uint32_t getNumCheckpoints() noexcept override; /*! 
- * \brief Gets the current number of snapshots with valid IDs + * \brief Gets the current number of snapshots with valid IDs. */ uint32_t getNumSnapshots() noexcept; /*! - * \brief Gets the current number of delta checkpoints with valid IDs + * \brief Gets the current number of delta checkpoints with valid IDs. */ uint32_t getNumDeltas() noexcept; /*! * \brief Debugging utility which gets a deque of checkpoints * representing a chain starting at the checkpoint head and ending at - * the checkpoint specified by \a id. Ths results can contain - * Checkpoint::UNIDENTIFIED_CHECKPOINT to represent temporary - * deleted checkpoints in the chain. + * the checkpoint specified by \a id. * \param id ID of checkpoint that terminates the chain * \return dequeue of checkpoint IDs where the front is always the head * and the back is always the checkpoint described by \a id. If there is * no checkpoint head, returns an empty result - * \throw CheckpointError if \a id does not refer to a valid - * checkpoint. - * \note Makes a new vector of results. This should not be called in the - * critical path. + * \throw CheckpointError if \a id does not refer to a valid checkpoint. + * \note The results never contain Checkpoint::UNIDENTIFIED_CHECKPOINT */ std::deque getCheckpointChain(chkpt_id_t id) override; /*! * \brief Finds a checkpoint by its ID. - * \param id ID of checkpoint to find. Guaranteed not to be flagged as - * deleted - * \note ONLY SEARCHES CHECKPOINT CACHE. - * \return Checkpoint with ID of \a id if found or nullptr if not found + * \param id ID of checkpoint to find. + * \param must_exist Whether to enforce that the checkpoint be found. + * \return Checkpoint with ID of \a id if found or nullptr if not found. + * \throw CheckpointError if \a must_exist is true and \a id does not + * refer to a valid checkpoint. */ std::shared_ptr findCheckpoint(chkpt_id_t id, bool must_exist=false); /*! 
* \brief Tests whether this checkpoint manager has a checkpoint with - * the given id. + * the given id in the cache or in the database. * \return True if id refers to a checkpoint held by this checkpointer - * and false if not. If id == Checkpoint::UNIDENTIFIED_CHECKPOINT, - * always returns false + * and false if not. */ bool hasCheckpoint(chkpt_id_t id) noexcept override; @@ -205,10 +211,9 @@ class DatabaseCheckpointer : public simdb::App, public Checkpointer /*! * \brief Returns a stack of checkpoints from this checkpoint as far - * back as possible until no previous link is found. This is a superset - * of getRestoreChain and contains checkpoints that do not actually need - * to be inspected for restoring this checkpoint's data. This may reach - * the head checkpoint if no gaps are encountered. + * back as possible until no previous link is found. + * \note Since this checkpointer enforces a linear chain of checkpoints + * with no gaps, this always reaches the head checkpoint. */ std::stack getHistoryChain(chkpt_id_t id); @@ -220,10 +225,11 @@ class DatabaseCheckpointer : public simdb::App, public Checkpointer std::stack getRestoreChain(chkpt_id_t id); /*! - * \brief Returns next checkpoint following *this. May be an empty - * vector if there are no later checkpoints. + * \brief Returns IDs of the checkpoints immediately following the given checkpoint. + * \note Since this checkpointer does not support checkpoint gaps, + * this will always be a vector of size 0 or 1. */ - std::vector getNextIDs(chkpt_id_t id); + std::vector getNextIDs(chkpt_id_t id) override; /*! * \brief Determines how many checkpoints away the closest, earlier @@ -231,9 +237,6 @@ class DatabaseCheckpointer : public simdb::App, public Checkpointer * \return distance to closest snapshot. If this node is a snapshot, * returns 0; if immediate getPrev() is a snapshot, returns 1; and * so on. 
- * - * \note This is a noexcept function, which means that the exception if - * no snapshot is encountered is uncatchable. This is intentional. */ uint32_t getDistanceToPrevSnapshot(chkpt_id_t id) noexcept; @@ -273,20 +276,27 @@ class DatabaseCheckpointer : public simdb::App, public Checkpointer /*! * \brief Debugging utility which dumps values in some bytes across a * chain of checkpoints. The intent is to show the values loaded when - * attempting to restore needed to restore the given value in the - * selected checkpoint - * \param o ostream with each value and checkpoint ID will be printed - * \param id ID of checkpoint to "restore" value from + * attempting to restore the given value in the selected checkpoint + * \param o ostream where each value and checkpoint ID will be printed + * \param id ID of checkpoint to restore value from * \param container ArchData in which the data being traced lives * \param offset Offset into \a container * \param size Bytes to read at \a offset * \warning This may change checkpoint data read/write state and should * only be done between completed checkpoints saves/restores in order to * not interfere. + * \note NOT CURRENTLY IMPLEMENTED + * \throw CheckpointError */ void traceValue(std::ostream& o, chkpt_id_t id, const ArchData* container, uint32_t offset, uint32_t size) override; - //! \brief Check if the given checkpoint is currently cached in memory. + /*! + * \brief Check if the given checkpoint is currently cached in memory. + * \note Used for testing and debugging only. + * \note Even though this might return true, if you wait a bit and call + * again with the same ID, it might return false since the checkpoint + * might have been evicted from the LRU cache. 
+ */ bool isCheckpointCached(chkpt_id_t id) const noexcept; private: @@ -309,7 +319,9 @@ class DatabaseCheckpointer : public simdb::App, public Checkpointer * (getHead() != nullptr) * \post Must create a checkpoint * \return Must return a checkpoint ID not currently in use - * \note invoked by createHead + * \note invoked by createCheckpoint + * \note This checkpointer does not support force_snapshot=true + * since it always enforces the snapshot threshold. */ chkpt_id_t createCheckpoint_(bool force_snapshot=false) override; @@ -318,19 +330,10 @@ class DatabaseCheckpointer : public simdb::App, public Checkpointer * \param id ID of checkpoint to delete. Must not be * Checkpoint::UNIDENTIFIED_CHECKPOINT and must not be equal to the * ID of the head checkpoint. - * \throw CheckpointError if this manager has no checkpoint with given - * id. Test with hasCheckpoint first. If id == - * Checkpoint::UNIDENTIFIED_CHECKPOINT, always throws. - * Throws if id == getHeadID(). Head cannot be deleted - * - * Internally, this deletion may be effective-only and actual data may - * still exist in an incaccessible form as part of the checkpoint - * tree implementation. - * - * If the current checkpoint is deleted, current will be updated back - * along the current checkpoints previous checkpoint chain until a non - * deleted checkpoint is found. This will become the new current - * checkpoint + * \throw CheckpointError if id == getHeadID(). Head cannot be deleted. + * \note Since this checkpointer does not support checkpoint gaps, + * this will delete all checkpoints from the cache and database + * starting at the given ID and going to the most recent checkpoint. */ void deleteCheckpoint_(chkpt_id_t id); @@ -339,11 +342,6 @@ class DatabaseCheckpointer : public simdb::App, public Checkpointer */ void dumpCheckpointNode_(const chkpt_id_t id, std::ostream& o) override; - /*! - * \brief Returns IDs of the checkpoints immediately following the given checkpoint. 
- */ - std::vector getNextIDs_(chkpt_id_t id) override; - /*! * \brief Intercept calls to Checkpointer::setHead_() and ensure we do not delete it. */ @@ -351,7 +349,6 @@ class DatabaseCheckpointer : public simdb::App, public Checkpointer /*! * \brief Intercept calls to Checkpointer::setCurrent_() and ensure we do not delete it. - * Also take this time to "unbless" the previous current node. */ void setCurrent_(CheckpointBase* current) override; @@ -366,19 +363,20 @@ class DatabaseCheckpointer : public simdb::App, public Checkpointer void setCurrentID_(chkpt_id_t id); /*! - * \brief Add the given checkpoint to the cache and start processing it. + * \brief Add the given checkpoint to the cache and bump its window to the + * front of the LRU list. */ void addToCache_(std::shared_ptr chkpt); /*! - * \brief Get the window ID for the given checkpoint ID + * \brief Get the window ID for the given checkpoint ID. */ window_id_t getWindowID_(chkpt_id_t id) const { return id / (snap_thresh_ + 1); } /*! - * \brief Get the window ID for the given checkpoint + * \brief Get the window ID for the given checkpoint. */ template window_id_t getWindowID_(const CheckpointPtrT& chkpt) const { @@ -386,33 +384,38 @@ class DatabaseCheckpointer : public simdb::App, public Checkpointer } /*! - * \brief Bump the given window ID to the front of the LRU cache + * \brief Bump the given window ID to the front of the LRU cache. */ void touchWindow_(window_id_t id); /*! - * \brief Evict the least recently used window from the cache if needed + * \brief Evict the least recently used window(s) from the cache if needed + * and send them down the pipeline. */ void evictWindowsIfNeeded_(bool force_flush=false); /*! - * \brief Ensure this checkpoint's window is loaded in the LRU cache + * \brief Ensure this checkpoint's window is loaded in the LRU cache. + * \throw CheckpointError if must_succeed is true and the window + * could not be loaded. 
*/ bool ensureWindowLoaded_(chkpt_id_t id, bool must_succeed=true); /*! - * \brief Retrieve a checkpoint window from the database + * \brief Retrieve a checkpoint window from the database. */ checkpoint_ptrs getWindowFromDatabase_(window_id_t win_id); /*! * \brief "Undo" the pipeline for a ChkptWindows.WindowBytes blob - * into the original vector of checkpoints + * into the original ChkptWindow structure. */ std::unique_ptr deserializeWindow_(const std::vector& window_bytes) const; /*! * \brief Apply the given callback to every checkpoint (cached and database). + * \note Do not call in the critical path. Used for debugging and for the + * various dump* apis. */ void forEachCheckpoint_(const std::function& cb); @@ -422,7 +425,7 @@ class DatabaseCheckpointer : public simdb::App, public Checkpointer //! \brief Checkpointer current ID. Used to prevent the current node from being deleted from the cache. chkpt_id_t current_id_ = checkpoint_type::UNIDENTIFIED_CHECKPOINT; - //! \brief Pipeline input queue from which new checkpoints to be processed are read. + //! \brief Pipeline input queue which accepts the oldest checkpoint window from the cache. simdb::ConcurrentQueue* pipeline_head_ = nullptr; //! \brief Subset (or all of) our checkpoints that we currently are holding in memory. @@ -440,17 +443,17 @@ class DatabaseCheckpointer : public simdb::App, public Checkpointer //! \brief Mutex to protect our checkpoints cache. mutable std::recursive_mutex cache_mutex_; - //! \brief SimDB instance + //! \brief SimDB instance. simdb::DatabaseManager* db_mgr_ = nullptr; - //! \brief Checkpoint pipeline flusher + //! \brief Checkpoint pipeline flusher. std::unique_ptr pipeline_flusher_; //! \brief Snapshot generation threshold. Every n checkpoints in a chain //! are taken as snapshots instead of deltas. utils::ValidValue snap_thresh_; - //! \brief Next checkpoint ID value + //! \brief Next checkpoint ID value. chkpt_id_t next_chkpt_id_; }; @@ -461,7 +464,7 @@ namespace simdb /*! 
* \brief This AppFactory specialization is provided since we have an app that inherits - * from FastCheckpointer, and thus cannot have the default app subclass ctor signature + * from Checkpointer, and thus cannot have the default app subclass ctor signature * that only takes the DatabaseManager like most other apps. */ template <> @@ -479,7 +482,8 @@ class AppFactory : publ AppT* createApp(DatabaseManager* db_mgr) override { if (!root_) { - throw sparta::SpartaException("Must set root (and maybe scheduler) before instantiating apps!"); + throw sparta::SpartaException( + "Must set root (and maybe scheduler) before instantiating DatabaseCheckpointer app"); } // Make the ctor call that the default AppFactory cannot make. diff --git a/sparta/sparta/serialization/checkpoint/DeltaCheckpoint.hpp b/sparta/sparta/serialization/checkpoint/DeltaCheckpoint.hpp index fa95d9d722..ec30d64c7c 100644 --- a/sparta/sparta/serialization/checkpoint/DeltaCheckpoint.hpp +++ b/sparta/sparta/serialization/checkpoint/DeltaCheckpoint.hpp @@ -352,7 +352,7 @@ namespace sparta::serialization::checkpoint /*! * \brief Get the ID of our previous checkpoint. Returns UNIDENTIFIED_CHECKPOINT * if we have no previous checkpoint, as is the case with the head checkpoint - * and snapshots. + * and those flagged for deletion. */ chkpt_id_t getPrevID() const override { if (auto prev = static_cast(getPrev())) { diff --git a/sparta/sparta/serialization/checkpoint/FastCheckpointer.hpp b/sparta/sparta/serialization/checkpoint/FastCheckpointer.hpp index b917e40583..eed35c25ef 100644 --- a/sparta/sparta/serialization/checkpoint/FastCheckpointer.hpp +++ b/sparta/sparta/serialization/checkpoint/FastCheckpointer.hpp @@ -418,6 +418,22 @@ namespace sparta::serialization::checkpoint return chkpts_.find(id) != chkpts_.end(); } + /*! + * \brief Returns IDs of the checkpoints immediately following the given checkpoint. 
+ */ + std::vector getNextIDs(chkpt_id_t id) override final { + std::vector next_ids; + if (const auto chkpt = findCheckpoint_(id)) { + for (const auto next : chkpt->getNexts()) { + const auto dcp = static_cast(next); + if (!dcp->isFlaggedDeleted()) { + next_ids.push_back(next->getID()); + } + } + } + return next_ids; + } + //////////////////////////////////////////////////////////////////////// //! @} @@ -760,22 +776,6 @@ namespace sparta::serialization::checkpoint return dcp->getID(); } - /*! - * \brief Returns IDs of the checkpoints immediately following the given checkpoint. - */ - std::vector getNextIDs_(chkpt_id_t id) override final { - std::vector next_ids; - if (const auto chkpt = findCheckpoint_(id)) { - for (const auto next : chkpt->getNexts()) { - const auto dcp = static_cast(next); - if (!dcp->isFlaggedDeleted()) { - next_ids.push_back(next->getID()); - } - } - } - return next_ids; - } - /*! * \brief All checkpoints sorted by ascending tick number (or * equivalently ascending checkpoint ID since both are monotonically diff --git a/sparta/src/DatabaseCheckpoint.cpp b/sparta/src/DatabaseCheckpoint.cpp index 201c5c2a30..ae1cdea521 100644 --- a/sparta/src/DatabaseCheckpoint.cpp +++ b/sparta/src/DatabaseCheckpoint.cpp @@ -20,7 +20,6 @@ DatabaseCheckpoint::DatabaseCheckpoint(TreeNode& root, DatabaseCheckpointer* checkpointer) : CheckpointBase(id, tick) , prev_id_(prev ? 
prev->getID() : UNIDENTIFIED_CHECKPOINT) - , deleted_id_(UNIDENTIFIED_CHECKPOINT) , is_snapshot_(is_snapshot) , checkpointer_(checkpointer) { @@ -51,14 +50,12 @@ DatabaseCheckpoint::DatabaseCheckpoint(chkpt_id_t id, tick_t tick, chkpt_id_t prev_id, const std::vector& next_ids, - chkpt_id_t deleted_id, bool is_snapshot, const storage::VectorStorage& storage, DatabaseCheckpointer* checkpointer) : CheckpointBase(id, tick) , prev_id_(prev_id) , next_ids_(next_ids) - , deleted_id_(deleted_id) , is_snapshot_(is_snapshot) , data_(storage) , checkpointer_(checkpointer) @@ -68,12 +65,7 @@ DatabaseCheckpoint::DatabaseCheckpoint(chkpt_id_t id, std::string DatabaseCheckpoint::stringize() const { std::stringstream ss; - ss << "& dats) } } -void DatabaseCheckpoint::flagDeleted_() -{ - sparta_assert(!isFlaggedDeleted(), - "Cannot delete a checkpoint when it is already deleted: " << this); - deleted_id_ = getID(); - setID_(UNIDENTIFIED_CHECKPOINT); -} - -bool DatabaseCheckpoint::isFlaggedDeleted() const noexcept -{ - return getID() == UNIDENTIFIED_CHECKPOINT; -} - -chkpt_id_t DatabaseCheckpoint::getDeletedID() const noexcept -{ - return deleted_id_; -} - -std::string DatabaseCheckpoint::getDeletedRepr() const -{ - std::stringstream ss; - if (isFlaggedDeleted()) { - ss << "*" << getDeletedID(); - } else { - ss << getID(); - } - return ss.str(); -} - bool DatabaseCheckpoint::isSnapshot() const noexcept { return is_snapshot_; @@ -188,12 +151,6 @@ void DatabaseCheckpoint::loadState(const std::vector& dats) } } -std::unique_ptr DatabaseCheckpoint::clone() const -{ - auto clone = new DatabaseCheckpoint(getID(), getTick(), prev_id_, next_ids_, deleted_id_, is_snapshot_, data_, checkpointer_); - return std::unique_ptr(clone); -} - void DatabaseCheckpoint::storeSnapshot_(const std::vector& dats) { sparta_assert(data_.good(), diff --git a/sparta/src/DatabaseCheckpointer.cpp b/sparta/src/DatabaseCheckpointer.cpp index f3f326df77..f2d587a56e 100644 --- a/sparta/src/DatabaseCheckpointer.cpp 
+++ b/sparta/src/DatabaseCheckpointer.cpp @@ -51,40 +51,7 @@ std::unique_ptr DatabaseCheckpointer::createPipeline( { auto pipeline = std::make_unique(db_mgr_, NAME); - // Task 1: Chop off "flag deleted" checkpoints from the given window. This is needed - // since the checkpointer many times will grab a window from the database, mark some - // checkpoints as deleted, and leave the window in the cache. When the window gets - // old enough to be evicted from the LRU cache and re-sent down the pipeline, we do - // not want to propagate the deleted checkpoints back into the database. - auto purge_deleted = simdb::pipeline::createTask>( - [](checkpoint_ptrs&& chkpts_in, - simdb::ConcurrentQueue& chkpts_out, - bool /*force_flush*/) - { - checkpoint_ptrs alive_chkpts; - bool ensure_rest_deleted = false; - - // Rule: Once we see a deleted checkpoint, all following checkpoints - // in the window must also be deleted. This is to ensure that - // checkpoints are always contiguous in the database. - - for (const auto& chkpt : chkpts_in) { - if (chkpt->isFlaggedDeleted()) { - ensure_rest_deleted = true; - } else if (ensure_rest_deleted && !chkpt->isFlaggedDeleted()) { - throw CheckpointError("Checkpoint window has non-contiguous deleted checkpoints"); - } else { - alive_chkpts.push_back(chkpt); - } - } - - if (!alive_chkpts.empty()) { - chkpts_out.emplace(std::move(alive_chkpts)); - } - } - ); - - // Task 2: Package up checkpoints into a checkpoint window + // Task 1: Package up checkpoints into a checkpoint window auto create_window = simdb::pipeline::createTask>( [](checkpoint_ptrs&& chkpts, simdb::ConcurrentQueue& windows, @@ -100,7 +67,7 @@ std::unique_ptr DatabaseCheckpointer::createPipeline( } ); - // Task 3: Serialize a checkpoint window into a char buffer + // Task 2: Serialize a checkpoint window into a char buffer auto window_to_bytes = simdb::pipeline::createTask>( [](ChkptWindow&& window, simdb::ConcurrentQueue& window_bytes, @@ -121,7 +88,7 @@ std::unique_ptr 
DatabaseCheckpointer::createPipeline( } ); - // Task 4: Perform zlib compression on the checkpoint window bytes + // Task 3: Perform zlib compression on the checkpoint window bytes auto zlib_bytes = simdb::pipeline::createTask>( [](ChkptWindowBytes&& bytes_in, simdb::ConcurrentQueue& bytes_out, @@ -134,7 +101,7 @@ std::unique_ptr DatabaseCheckpointer::createPipeline( } ); - // Task 5: Write to the database + // Task 4: Write to the database auto write_to_db = db_accessor->createAsyncWriter( [this](ChkptWindowBytes&& bytes_in, simdb::pipeline::AppPreparedINSERTs* tables, @@ -162,15 +129,14 @@ std::unique_ptr DatabaseCheckpointer::createPipeline( } ); - *purge_deleted >> *create_window >> *window_to_bytes >> *zlib_bytes >> *write_to_db; + *create_window >> *window_to_bytes >> *zlib_bytes >> *write_to_db; - pipeline_head_ = purge_deleted->getTypedInputQueue(); + pipeline_head_ = create_window->getTypedInputQueue(); pipeline_flusher_ = std::make_unique( - *db_mgr_, purge_deleted, create_window, window_to_bytes, zlib_bytes, write_to_db); + *db_mgr_, create_window, window_to_bytes, zlib_bytes, write_to_db); pipeline->createTaskGroup("CheckpointPipeline") - ->addTask(std::move(purge_deleted)) ->addTask(std::move(create_window)) ->addTask(std::move(window_to_bytes)) ->addTask(std::move(zlib_bytes)); @@ -278,7 +244,7 @@ std::vector DatabaseCheckpointer::getCheckpointsAt(tick_t t) std::unordered_set ids; forEachCheckpoint_([t, &ids](const DatabaseCheckpoint* chkpt) { - if (!chkpt->isFlaggedDeleted() && chkpt->getTick() == t) { + if (chkpt->getTick() == t) { ids.insert(chkpt->getID()); } }); @@ -293,9 +259,7 @@ std::vector DatabaseCheckpointer::getCheckpoints() std::unordered_set ids; forEachCheckpoint_([&ids](const DatabaseCheckpoint* chkpt) { - if (!chkpt->isFlaggedDeleted()) { - ids.insert(chkpt->getID()); - } + ids.insert(chkpt->getID()); }); std::vector ret(ids.begin(), ids.end()); @@ -379,11 +343,7 @@ void DatabaseCheckpointer::dumpRestoreChain(std::ostream& o, 
chkpt_id_t id) if (chkpt->isSnapshot()) { o << "("; } - if (chkpt->getID() == checkpoint_type::UNIDENTIFIED_CHECKPOINT) { - o << "*" << chkpt->getDeletedID(); - } else { - o << chkpt->getID(); - } + o << chkpt->getID(); if (chkpt->isSnapshot()) { o << ")"; } @@ -509,7 +469,7 @@ void DatabaseCheckpointer::traceValue( (void)offset; (void)size; - sparta_assert(false, "Not implemented"); + throw CheckpointError("DatabaseCheckpointer::traceValue() not implemented"); } bool DatabaseCheckpointer::isCheckpointCached(chkpt_id_t id) const noexcept @@ -555,7 +515,9 @@ void DatabaseCheckpointer::createHead_() chkpt_id_t DatabaseCheckpointer::createCheckpoint_(bool force_snapshot) { - sparta_assert(!force_snapshot, "Forced snapshots are not supported by DatabaseCheckpointer"); + if (force_snapshot) { + throw CheckpointError("DatabaseCheckpointer does not support forced snapshots"); + } std::lock_guard lock(cache_mutex_); @@ -616,6 +578,11 @@ chkpt_id_t DatabaseCheckpointer::createCheckpoint_(bool force_snapshot) void DatabaseCheckpointer::deleteCheckpoint_(chkpt_id_t id) { + // Throw if trying to delete head checkpoint + if (id == getHeadID()) { + throw CheckpointError("Cannot delete head checkpoint with ID ") << id; + } + window_id_t start_win_id = getWindowID_(id); window_id_t end_win_id = getWindowID_(next_chkpt_id_ - 1); @@ -644,42 +611,40 @@ void DatabaseCheckpointer::deleteCheckpoint_(chkpt_id_t id) // Now delete from the database pipeline_flusher_->flush(); - db_mgr_->safeTransaction( - [&]() - { - // DELETE FROM ChkptWindows WHERE WindowID > start_win_id - auto query = db_mgr_->createQuery("ChkptWindows"); - query->addConstraintForUInt64("WindowID", simdb::Constraints::GREATER, start_win_id); - query->deleteResultSet(); + db_mgr_->safeTransaction([&]() { + // DELETE FROM ChkptWindows WHERE WindowID > start_win_id + auto query = db_mgr_->createQuery("ChkptWindows"); + query->addConstraintForUInt64("WindowID", simdb::Constraints::GREATER, start_win_id); + 
query->deleteResultSet(); - // Now update the window containing start_win_id to remove checkpoints >= id - query->resetConstraints(); - query->addConstraintForUInt64("WindowID", simdb::Constraints::EQUAL, start_win_id); + // Now update the window containing start_win_id to remove checkpoints >= id + query->resetConstraints(); + query->addConstraintForUInt64("WindowID", simdb::Constraints::EQUAL, start_win_id); - std::vector compressed_window_bytes; - query->select("WindowBytes", compressed_window_bytes); + std::vector compressed_window_bytes; + query->select("WindowBytes", compressed_window_bytes); - auto results = query->getResultSet(); - if (results.getNextRecord()) { - // DELETE FROM ChkptWindows WHERE WindowID = start_win_id - query->deleteResultSet(); + auto results = query->getResultSet(); + if (results.getNextRecord()) { + // DELETE FROM ChkptWindows WHERE WindowID = start_win_id + query->deleteResultSet(); - // Deserialize the window - auto window = deserializeWindow_(compressed_window_bytes); + // Deserialize the window + auto window = deserializeWindow_(compressed_window_bytes); - // Remove checkpoints >= id - auto new_end = std::remove_if(window->chkpts.begin(), window->chkpts.end(), - [id](const std::shared_ptr& chkpt) { - return chkpt->getID() >= id; - }); - window->chkpts.erase(new_end, window->chkpts.end()); + // Remove checkpoints >= id + auto new_end = std::remove_if(window->chkpts.begin(), window->chkpts.end(), + [id](const std::shared_ptr& chkpt) { + return chkpt->getID() >= id; + }); + window->chkpts.erase(new_end, window->chkpts.end()); - // Send down the pipeline - if (!window->chkpts.empty()) { - pipeline_head_->emplace(std::move(window->chkpts)); - } + // Send down the pipeline + if (!window->chkpts.empty()) { + pipeline_head_->emplace(std::move(window->chkpts)); } - }); + } + }); } void DatabaseCheckpointer::dumpCheckpointNode_(const chkpt_id_t id, std::ostream& o) @@ -693,11 +658,6 @@ void DatabaseCheckpointer::dumpCheckpointNode_(const 
chkpt_id_t id, std::ostream } } -std::vector DatabaseCheckpointer::getNextIDs_(chkpt_id_t id) -{ - return getNextIDs(id); -} - void DatabaseCheckpointer::setHead_(CheckpointBase* head) { std::lock_guard lock(cache_mutex_); @@ -795,7 +755,9 @@ void DatabaseCheckpointer::evictWindowsIfNeeded_(bool force_flush) // Send the window down the pipeline for writing to the database auto& window = chkpts_cache_[win_id]; - pipeline_head_->emplace(std::move(window)); + if (!window.empty()) { + pipeline_head_->emplace(std::move(window)); + } // Cleanup chkpts_cache_.erase(win_id); @@ -905,8 +867,6 @@ void DatabaseCheckpointer::forEachCheckpoint_(const std::functionchkpts) { - sparta_assert(!chkpt->isFlaggedDeleted(), - "Deleted checkpoints should never make it to the database"); cb(chkpt.get()); } } diff --git a/sparta/test/FastCheckpoint/DatabaseCheckpoint/DatabaseCheckpoint_test.cpp b/sparta/test/FastCheckpoint/DatabaseCheckpoint/DatabaseCheckpoint_test.cpp index d001fbc723..a3891c9f0d 100644 --- a/sparta/test/FastCheckpoint/DatabaseCheckpoint/DatabaseCheckpoint_test.cpp +++ b/sparta/test/FastCheckpoint/DatabaseCheckpoint/DatabaseCheckpoint_test.cpp @@ -116,6 +116,11 @@ int main() root.enterFinalized(); sched.finalize(); EXPECT_EQUAL(sched.getCurrentTick(), 0); + EXPECT_TRUE(dbcp.getCheckpointsAt(0).empty()); + EXPECT_EQUAL(dbcp.getNumCheckpoints(), 0); + EXPECT_EQUAL(dbcp.getNumSnapshots(), 0); + EXPECT_EQUAL(dbcp.getNumDeltas(), 0); + EXPECT_TRUE(dbcp.getCheckpointChain(0).empty()); // CHECKPOINT: Head DatabaseCheckpointer::chkpt_id_t head_id; @@ -125,16 +130,27 @@ int main() EXPECT_EQUAL(head_id, dbcp.getHead()->getID()); EXPECT_EQUAL(dbcp.getCurrentID(), head_id); EXPECT_EQUAL(dbcp.getCurrentTick(), 0); + EXPECT_TRUE(dbcp.isSnapshot(head_id)); - auto step_checkpointer = [&](DatabaseCheckpointer::chkpt_id_t expected_id) { + std::cout << dbcp.stringize() << std::endl; + + auto step_checkpointer = [&](DatabaseCheckpointer::chkpt_id_t expected_id, bool step_sched = true) { 
r1->write(expected_id * 5ul); r2->write(expected_id % 5ul); - sched.run(1, true, false); + if (step_sched) { + sched.run(1, true, false); + } DatabaseCheckpointer::chkpt_id_t actual_id; EXPECT_NOTHROW(actual_id = dbcp.createCheckpoint()); EXPECT_EQUAL(actual_id, expected_id); EXPECT_EQUAL(actual_id, dbcp.getCurrentID()); + EXPECT_EQUAL(dbcp.getNumCheckpoints(), expected_id + 1); + + // Should always have the head and current checkpoints in the cache + EXPECT_TRUE(dbcp.isCheckpointCached(dbcp.getHeadID())); + EXPECT_TRUE(dbcp.isCheckpointCached(dbcp.getCurrentID())); + return actual_id; }; @@ -144,10 +160,30 @@ int main() if (cp) { EXPECT_EQUAL(cp->getID(), id); EXPECT_EQUAL(cp->getPrevID(), (id > 0) ? (id - 1) : DatabaseCheckpoint::UNIDENTIFIED_CHECKPOINT); + EXPECT_EQUAL(cp->isSnapshot(), (id % (dbcp.getSnapshotThreshold() + 1)) == 0); + + if (cp->isSnapshot()) { + EXPECT_EQUAL(cp->getDistanceToPrevSnapshot(), 0); + } else { + EXPECT_EQUAL(cp->getDistanceToPrevSnapshot(), id % (dbcp.getSnapshotThreshold() + 1)); + } } return cp; }; + auto verif_load_chkpt = [&](DatabaseCheckpointer::chkpt_id_t id) { + EXPECT_NOTHROW(dbcp.loadCheckpoint(id)); + EXPECT_EQUAL(dbcp.getCurrentID(), id); + EXPECT_EQUAL(dbcp.getNumCheckpoints(), id + 1); + EXPECT_FALSE(dbcp.hasCheckpoint(id + 1)); + EXPECT_EQUAL(sched.getCurrentTick(), id); + + auto r1_val = r1->read(); + auto r2_val = r2->read(); + EXPECT_EQUAL(r1_val, id * 5ul); + EXPECT_EQUAL(r2_val, id % 5ul); + }; + auto wait_until_evicted = [&](DatabaseCheckpointer::chkpt_id_t id) { size_t num_tries = 0; while (dbcp.isCheckpointCached(id) && num_tries < 3) { @@ -158,6 +194,12 @@ int main() EXPECT_FALSE(dbcp.isCheckpointCached(id)); }; + // Ensure force_snapshot=true always throws. Not supported. + EXPECT_THROW(dbcp.createCheckpoint(true)); + + // Ensure traceValue() throws. Not supported. + EXPECT_THROW(dbcp.traceValue(std::cout, dbcp.getCurrentID(), nullptr, 0, 4)); + // Create 1000 checkpoints, and periodically access an old one. 
Also // go to sleep sometimes to increase the chances we have to go to the // database to retrieve a checkpoint. @@ -176,19 +218,6 @@ int main() } } - auto verif_load_chkpt = [&](DatabaseCheckpointer::chkpt_id_t id) { - EXPECT_NOTHROW(dbcp.loadCheckpoint(id)); - EXPECT_EQUAL(dbcp.getCurrentID(), id); - EXPECT_EQUAL(dbcp.getNumCheckpoints(), id + 1); - EXPECT_FALSE(dbcp.hasCheckpoint(id + 1)); - EXPECT_EQUAL(sched.getCurrentTick(), id); - - auto r1_val = r1->read(); - auto r2_val = r2->read(); - EXPECT_EQUAL(r1_val, id * 5ul); - EXPECT_EQUAL(r2_val, id % 5ul); - }; - // Nothing to test, just call dumpList/dumpData/dumpAnnotatedData. // Do this while we have a lot of checkpoints in the cache and // the database for max code coverage. @@ -312,6 +341,8 @@ int main() // Ensure exception is thrown when loading a non-existent checkpoint EXPECT_THROW(dbcp.loadCheckpoint(9999)); + + // Ensure findCheckpoint() throws when must_exist=true and checkpoint does not exist EXPECT_THROW(dbcp.findCheckpoint(9999, true)); EXPECT_NOTHROW(dbcp.findCheckpoint(9999, false)); @@ -428,6 +459,48 @@ int main() EXPECT_FALSE(c->isSnapshot()); } + // To check the getCheckpointsAt() method, go back to the head + // checkpoint. Then take a bunch of checkpoints at tick 1, 2, and 3. 
+ verif_load_chkpt(head_id); + EXPECT_EQUAL(sched.getCurrentTick(), 0); + + std::vector chkpts_at_1; + for (uint32_t i = 1; i <= 300; ++i) { + const bool step_sched = (i == 1); + auto id = step_checkpointer(i, step_sched); + EXPECT_EQUAL(sched.getCurrentTick(), 1); + chkpts_at_1.push_back(id); + } + + std::vector chkpts_at_2; + for (uint32_t i = 301; i <= 500; ++i) { + const bool step_sched = (i == 301); + auto id = step_checkpointer(i, step_sched); + EXPECT_EQUAL(sched.getCurrentTick(), 2); + chkpts_at_2.push_back(id); + } + + std::vector chkpts_at_3; + for (uint32_t i = 501; i <= 700; ++i) { + const bool step_sched = (i == 501); + auto id = step_checkpointer(i, step_sched); + EXPECT_EQUAL(sched.getCurrentTick(), 3); + chkpts_at_3.push_back(id); + } + + EXPECT_EQUAL(dbcp.getCheckpointsAt(1), chkpts_at_1); + EXPECT_EQUAL(dbcp.getCheckpointsAt(2), chkpts_at_2); + EXPECT_EQUAL(dbcp.getCheckpointsAt(3), chkpts_at_3); + + // Wait for the older checkpoints to be evicted and + // verify getCheckpointsAt() again. + wait_until_evicted(chkpts_at_1.back()); + wait_until_evicted(chkpts_at_2.back()); + + EXPECT_EQUAL(dbcp.getCheckpointsAt(1), chkpts_at_1); + EXPECT_EQUAL(dbcp.getCheckpointsAt(2), chkpts_at_2); + EXPECT_EQUAL(dbcp.getCheckpointsAt(3), chkpts_at_3); + // Verify that the head checkpoint is in the cache until simulation teardown. 
EXPECT_TRUE(dbcp.isCheckpointCached(head_id)); From 0db5b77cf2c9a24a65bb04fbbda0efedae5fd710 Mon Sep 17 00:00:00 2001 From: Colby Nyce Date: Tue, 23 Sep 2025 12:54:36 -0500 Subject: [PATCH 14/30] Fix clang build/regress failures --- sparta/sparta/serialization/checkpoint/Checkpointer.hpp | 2 +- .../serialization/checkpoint/DatabaseCheckpointer.hpp | 6 +++--- sparta/sparta/serialization/checkpoint/FastCheckpointer.hpp | 2 +- sparta/src/DatabaseCheckpointer.cpp | 6 +++--- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/sparta/sparta/serialization/checkpoint/Checkpointer.hpp b/sparta/sparta/serialization/checkpoint/Checkpointer.hpp index a15e7b964d..8a8d688ef1 100644 --- a/sparta/sparta/serialization/checkpoint/Checkpointer.hpp +++ b/sparta/sparta/serialization/checkpoint/Checkpointer.hpp @@ -330,7 +330,7 @@ namespace sparta::serialization::checkpoint * Ignores any internal temporary or deleted checkpoints without * visible IDs */ - virtual uint32_t getNumCheckpoints() noexcept = 0; + virtual uint32_t getNumCheckpoints() const noexcept = 0; /*! * \brief Debugging utility which gets a deque of checkpoints diff --git a/sparta/sparta/serialization/checkpoint/DatabaseCheckpointer.hpp b/sparta/sparta/serialization/checkpoint/DatabaseCheckpointer.hpp index 9b65234dfb..af7c4b694f 100644 --- a/sparta/sparta/serialization/checkpoint/DatabaseCheckpointer.hpp +++ b/sparta/sparta/serialization/checkpoint/DatabaseCheckpointer.hpp @@ -158,17 +158,17 @@ class DatabaseCheckpointer : public simdb::App, public Checkpointer /*! * \brief Gets the current number of checkpoints with valid IDs. */ - uint32_t getNumCheckpoints() noexcept override; + uint32_t getNumCheckpoints() const noexcept override; /*! * \brief Gets the current number of snapshots with valid IDs. */ - uint32_t getNumSnapshots() noexcept; + uint32_t getNumSnapshots() const noexcept; /*! * \brief Gets the current number of delta checkpoints with valid IDs. 
*/ - uint32_t getNumDeltas() noexcept; + uint32_t getNumDeltas() const noexcept; /*! * \brief Debugging utility which gets a deque of checkpoints diff --git a/sparta/sparta/serialization/checkpoint/FastCheckpointer.hpp b/sparta/sparta/serialization/checkpoint/FastCheckpointer.hpp index eed35c25ef..e2f0121cc6 100644 --- a/sparta/sparta/serialization/checkpoint/FastCheckpointer.hpp +++ b/sparta/sparta/serialization/checkpoint/FastCheckpointer.hpp @@ -301,7 +301,7 @@ namespace sparta::serialization::checkpoint /*! * \brief Gets the current number of checkpoints having valid IDs */ - uint32_t getNumCheckpoints() noexcept override { + uint32_t getNumCheckpoints() const noexcept override { return num_alive_checkpoints_; } diff --git a/sparta/src/DatabaseCheckpointer.cpp b/sparta/src/DatabaseCheckpointer.cpp index f2d587a56e..1c330ff4c2 100644 --- a/sparta/src/DatabaseCheckpointer.cpp +++ b/sparta/src/DatabaseCheckpointer.cpp @@ -267,17 +267,17 @@ std::vector DatabaseCheckpointer::getCheckpoints() return ret; } -uint32_t DatabaseCheckpointer::getNumCheckpoints() noexcept +uint32_t DatabaseCheckpointer::getNumCheckpoints() const noexcept { return next_chkpt_id_; } -uint32_t DatabaseCheckpointer::getNumSnapshots() noexcept +uint32_t DatabaseCheckpointer::getNumSnapshots() const noexcept { return next_chkpt_id_ ? 
getWindowID_(next_chkpt_id_) + 1 : 0; } -uint32_t DatabaseCheckpointer::getNumDeltas() noexcept +uint32_t DatabaseCheckpointer::getNumDeltas() const noexcept { return getNumCheckpoints() - getNumSnapshots(); } From ce66f1748442010c8d589cd11b36118a0c3af8d6 Mon Sep 17 00:00:00 2001 From: Colby Nyce Date: Tue, 23 Sep 2025 13:03:53 -0500 Subject: [PATCH 15/30] Fix clang build/regress failures --- .../DatabaseCheckpoint/DatabaseCheckpoint_test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sparta/test/FastCheckpoint/DatabaseCheckpoint/DatabaseCheckpoint_test.cpp b/sparta/test/FastCheckpoint/DatabaseCheckpoint/DatabaseCheckpoint_test.cpp index a3891c9f0d..c425090661 100644 --- a/sparta/test/FastCheckpoint/DatabaseCheckpoint/DatabaseCheckpoint_test.cpp +++ b/sparta/test/FastCheckpoint/DatabaseCheckpoint/DatabaseCheckpoint_test.cpp @@ -141,7 +141,7 @@ int main() sched.run(1, true, false); } - DatabaseCheckpointer::chkpt_id_t actual_id; + DatabaseCheckpointer::chkpt_id_t actual_id = DatabaseCheckpoint::UNIDENTIFIED_CHECKPOINT; EXPECT_NOTHROW(actual_id = dbcp.createCheckpoint()); EXPECT_EQUAL(actual_id, expected_id); EXPECT_EQUAL(actual_id, dbcp.getCurrentID()); From 220001478531a1e4ce2c6eab1a11fc023e83c9c0 Mon Sep 17 00:00:00 2001 From: Colby Nyce Date: Tue, 23 Sep 2025 13:29:13 -0500 Subject: [PATCH 16/30] Attempt to fix -LE issue with ctest (macos) --- sparta/example/CMakeLists.txt | 4 ++-- sparta/example/CoreModel/CMakeLists.txt | 4 ++-- sparta/test/CMakeLists.txt | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/sparta/example/CMakeLists.txt b/sparta/example/CMakeLists.txt index c86b13a8ab..a6cb2999bc 100644 --- a/sparta/example/CMakeLists.txt +++ b/sparta/example/CMakeLists.txt @@ -38,8 +38,8 @@ add_custom_target (example_regress_valgrind) # NOTE: # running ctest with --test-action test creates Testing//Test.xml # that can be loaded into the CI test result tracker -add_custom_command (TARGET example_regress POST_BUILD COMMAND 
ctest -LE ${VALGRIND_TEST_LABEL} -j${NUM_CORES} --test-action test) -add_custom_command (TARGET example_regress_valgrind POST_BUILD COMMAND ctest -L ${VALGRIND_TEST_LABEL} -j${NUM_CORES} --test-action test) +add_custom_command (TARGET example_regress POST_BUILD COMMAND ctest -LE "${VALGRIND_TEST_LABEL}" -j${NUM_CORES} --test-action test) +add_custom_command (TARGET example_regress_valgrind POST_BUILD COMMAND ctest -L "${VALGRIND_TEST_LABEL}" -j${NUM_CORES} --test-action test) add_subdirectory(CoreModel) add_subdirectory(DynamicModelPipeline) diff --git a/sparta/example/CoreModel/CMakeLists.txt b/sparta/example/CoreModel/CMakeLists.txt index 3014c95270..c5526d8ed9 100644 --- a/sparta/example/CoreModel/CMakeLists.txt +++ b/sparta/example/CoreModel/CMakeLists.txt @@ -43,8 +43,8 @@ message(STATUS "Found " ${NUM_CORES} " cores in machine (for ctest)") # NOTE: # running ctest with --test-action test creates Testing//Test.xml # that can be loaded into the CI test result tracker -add_custom_command(TARGET core_example_regress POST_BUILD COMMAND ctest -LE ${VALGRIND_TEST_LABEL} -j${NUM_CORES} --test-action test) -add_custom_command(TARGET core_example_regress_valgrind POST_BUILD COMMAND ctest -L ${VALGRIND_TEST_LABEL} -j${NUM_CORES} --test-action test) +add_custom_command(TARGET core_example_regress POST_BUILD COMMAND ctest -LE "${VALGRIND_TEST_LABEL}" -j${NUM_CORES} --test-action test) +add_custom_command(TARGET core_example_regress_valgrind POST_BUILD COMMAND ctest -L "${VALGRIND_TEST_LABEL}" -j${NUM_CORES} --test-action test) # Ensure the simulator is built first. 
add_dependencies(core_example_regress sparta_core_example) diff --git a/sparta/test/CMakeLists.txt b/sparta/test/CMakeLists.txt index 9b4af819f6..a1fdca998a 100644 --- a/sparta/test/CMakeLists.txt +++ b/sparta/test/CMakeLists.txt @@ -41,8 +41,8 @@ add_custom_target (regress_valgrind) # NOTE: # running ctest with --test-action test creates Testing//Test.xml # that can be loaded into the CI test result tracker -add_custom_command (TARGET regress POST_BUILD COMMAND ctest -LE ${VALGRIND_TEST_LABEL} -j${NUM_CORES} --test-action test) -add_custom_command (TARGET regress_valgrind POST_BUILD COMMAND ctest -L ${VALGRIND_TEST_LABEL} -j${NUM_CORES} --test-action test) +add_custom_command (TARGET regress POST_BUILD COMMAND ctest -LE "${VALGRIND_TEST_LABEL}" -j${NUM_CORES} --test-action test) +add_custom_command (TARGET regress_valgrind POST_BUILD COMMAND ctest -L "${VALGRIND_TEST_LABEL}" -j${NUM_CORES} --test-action test) #add_subdirectory (pipeViewer) add_subdirectory (Array) From 6a5c9f73ee3d2623e6c818739cdfe247395531ae Mon Sep 17 00:00:00 2001 From: Colby Nyce Date: Tue, 23 Sep 2025 15:01:24 -0500 Subject: [PATCH 17/30] Align the apple/valgrind/ctest code in the test dir and example dir --- sparta/example/CMakeLists.txt | 2 +- sparta/test/CMakeLists.txt | 31 ++++++++++++++++--------------- 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/sparta/example/CMakeLists.txt b/sparta/example/CMakeLists.txt index a6cb2999bc..3962a76139 100644 --- a/sparta/example/CMakeLists.txt +++ b/sparta/example/CMakeLists.txt @@ -4,10 +4,10 @@ include (CTest) # Setup options for valgrind testing. 
if (NOT APPLE) - message(STATUS "Valgrind regression enabled") find_program (VALGRIND_TOOL valgrind) if (VALGRIND_TOOL) set(VALGRIND_REGRESS_ENABLED TRUE) + message(STATUS "Valgrind regression enabled") endif() endif() diff --git a/sparta/test/CMakeLists.txt b/sparta/test/CMakeLists.txt index a1fdca998a..f3eb0cc7f2 100644 --- a/sparta/test/CMakeLists.txt +++ b/sparta/test/CMakeLists.txt @@ -7,23 +7,24 @@ include (CTest) include(${SPARTA_CMAKE_MACRO_PATH}/SpartaTestingMacros.cmake) # Setup options for valgrind testing. -find_program (VALGRIND_TOOL valgrind) -if (VALGRIND_TOOL) - message(STATUS "Valgrind enabled: ${VALGRIND_TOOL})") - set(VALGRIND_REGRESS_ENABLED TRUE) - set (VALGRIND_OPTS - "--error-exitcode=5" - "--leak-check=full" - "--show-reachable=yes" - "--undef-value-errors=yes" - "--suppressions=${SPARTA_BASE}/test/valgrind_leakcheck.supp" - "--soname-synonyms=somalloc=NONE" - ) - set (VALGRIND_TEST_LABEL valgrind_test) -else() - message(STATUS "valgrind testing NOT enabled") +if (NOT APPLE) + find_program (VALGRIND_TOOL valgrind) + if (VALGRIND_TOOL) + set(VALGRIND_REGRESS_ENABLED TRUE) + message(STATUS "Valgrind regression enabled") + endif() endif() +set (VALGRIND_OPTS + "--error-exitcode=5" + "--leak-check=full" + "--show-reachable=yes" + "--undef-value-errors=yes" + "--suppressions=${SPARTA_BASE}/test/valgrind_leakcheck.supp" + "--soname-synonyms=somalloc=NONE" + ) +set (VALGRIND_TEST_LABEL valgrind_test) + # Since make does not pass the parallel jobs flag to ctest from the user, # a fixed count will be set based on core count w/ a max 8 include(ProcessorCount) From a34944109bfbb36761cd9ec0a4fcc411533b38d2 Mon Sep 17 00:00:00 2001 From: Colby Nyce Date: Tue, 23 Sep 2025 16:10:43 -0500 Subject: [PATCH 18/30] Redirect std::cerr to a logfile for the Buffer_test --- sparta/test/Buffer/Buffer_test.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/sparta/test/Buffer/Buffer_test.cpp b/sparta/test/Buffer/Buffer_test.cpp index 66d24b61a7..3ea8825182 
100644 --- a/sparta/test/Buffer/Buffer_test.cpp +++ b/sparta/test/Buffer/Buffer_test.cpp @@ -863,9 +863,16 @@ void testInvalidates() // testEraseSupport::const_reverse_iterator, sparta::Buffer>(); } +void redirectCerrToFile(const std::string& filename) +{ + static std::ofstream error_log(filename); // Ensure it stays open + std::cerr.rdbuf(error_log.rdbuf()); // Redirect cerr +} int main() { + redirectCerrToFile("Buffer_test_errors.log"); + testPointerTypes>(); testPointerTypes>(); generalTest(); From ce62a4d5fe69817605808582f6069a8a541693e6 Mon Sep 17 00:00:00 2001 From: Colby Nyce Date: Wed, 24 Sep 2025 19:34:42 -0500 Subject: [PATCH 19/30] Address PR feedback --- sparta/example/CMakeLists.txt | 4 +- sparta/example/CoreModel/CMakeLists.txt | 4 +- .../checkpoint/CheckpointBase.hpp | 10 ++--- .../serialization/checkpoint/Checkpointer.hpp | 4 +- .../checkpoint/DatabaseCheckpoint.hpp | 26 ++++--------- .../checkpoint/DatabaseCheckpointer.hpp | 39 +++++++------------ .../checkpoint/FastCheckpointer.hpp | 2 +- sparta/src/DatabaseCheckpoint.cpp | 16 -------- sparta/src/DatabaseCheckpointer.cpp | 32 +++++++-------- sparta/test/CMakeLists.txt | 4 +- 10 files changed, 49 insertions(+), 92 deletions(-) diff --git a/sparta/example/CMakeLists.txt b/sparta/example/CMakeLists.txt index acfdae5d8b..8b5e3dd9ea 100644 --- a/sparta/example/CMakeLists.txt +++ b/sparta/example/CMakeLists.txt @@ -19,8 +19,8 @@ add_custom_target (example_regress_valgrind) # NOTE: # running ctest with --test-action test creates Testing//Test.xml # that can be loaded into the CI test result tracker -add_custom_command (TARGET example_regress POST_BUILD COMMAND ctest -LE "${VALGRIND_TEST_LABEL}" -j${NUM_CORES} --test-action test) -add_custom_command (TARGET example_regress_valgrind POST_BUILD COMMAND ctest -L "${VALGRIND_TEST_LABEL}" -j${NUM_CORES} --test-action test) +add_custom_command (TARGET example_regress POST_BUILD COMMAND ctest -LE ${VALGRIND_TEST_LABEL} -j${NUM_CORES} --test-action test) 
+add_custom_command (TARGET example_regress_valgrind POST_BUILD COMMAND ctest -L ${VALGRIND_TEST_LABEL} -j${NUM_CORES} --test-action test) add_subdirectory(CoreModel) add_subdirectory(DynamicModelPipeline) diff --git a/sparta/example/CoreModel/CMakeLists.txt b/sparta/example/CoreModel/CMakeLists.txt index c5526d8ed9..3014c95270 100644 --- a/sparta/example/CoreModel/CMakeLists.txt +++ b/sparta/example/CoreModel/CMakeLists.txt @@ -43,8 +43,8 @@ message(STATUS "Found " ${NUM_CORES} " cores in machine (for ctest)") # NOTE: # running ctest with --test-action test creates Testing//Test.xml # that can be loaded into the CI test result tracker -add_custom_command(TARGET core_example_regress POST_BUILD COMMAND ctest -LE "${VALGRIND_TEST_LABEL}" -j${NUM_CORES} --test-action test) -add_custom_command(TARGET core_example_regress_valgrind POST_BUILD COMMAND ctest -L "${VALGRIND_TEST_LABEL}" -j${NUM_CORES} --test-action test) +add_custom_command(TARGET core_example_regress POST_BUILD COMMAND ctest -LE ${VALGRIND_TEST_LABEL} -j${NUM_CORES} --test-action test) +add_custom_command(TARGET core_example_regress_valgrind POST_BUILD COMMAND ctest -L ${VALGRIND_TEST_LABEL} -j${NUM_CORES} --test-action test) # Ensure the simulator is built first. add_dependencies(core_example_regress sparta_core_example) diff --git a/sparta/sparta/serialization/checkpoint/CheckpointBase.hpp b/sparta/sparta/serialization/checkpoint/CheckpointBase.hpp index e83065bcf7..4c67bcdca0 100644 --- a/sparta/sparta/serialization/checkpoint/CheckpointBase.hpp +++ b/sparta/sparta/serialization/checkpoint/CheckpointBase.hpp @@ -59,13 +59,13 @@ namespace sparta::serialization::checkpoint CheckpointBase(const CheckpointBase&) = delete; //! \brief Non-assignable - const CheckpointBase& operator=(const CheckpointBase&) = delete; + CheckpointBase& operator=(const CheckpointBase&) = delete; - //! \brief Default move construction - CheckpointBase(CheckpointBase&&) = default; + //! 
\brief Not move constructable + CheckpointBase(CheckpointBase&&) = delete; - //! \brief Default move assignment - CheckpointBase& operator=(CheckpointBase&&) = default; + //! \brief Not move assignable + CheckpointBase& operator=(CheckpointBase&&) = delete; protected: diff --git a/sparta/sparta/serialization/checkpoint/Checkpointer.hpp b/sparta/sparta/serialization/checkpoint/Checkpointer.hpp index 8a8d688ef1..7f17aea363 100644 --- a/sparta/sparta/serialization/checkpoint/Checkpointer.hpp +++ b/sparta/sparta/serialization/checkpoint/Checkpointer.hpp @@ -654,7 +654,7 @@ namespace sparta::serialization::checkpoint * \pre Internal head pointer must be nullptr. * \note This can only be done once */ - virtual void setHead_(CheckpointBase* head) { + void setHead_(CheckpointBase* head) { sparta_assert(head != nullptr, "head argument in setHead_ cannot be nullptr"); sparta_assert(head_ == nullptr, "Cannot setHead_ again on a Checkpointer once heas is already set"); head_ = head; @@ -674,7 +674,7 @@ namespace sparta::serialization::checkpoint * checkpoint created will follow the current checkpoint set here. * Cannot be nullptr */ - virtual void setCurrent_(CheckpointBase* current) { + void setCurrent_(CheckpointBase* current) { sparta_assert(current != nullptr, "Can never setCurrent_ to nullptr except. A null current is a valid state at initialization only") current_ = current; diff --git a/sparta/sparta/serialization/checkpoint/DatabaseCheckpoint.hpp b/sparta/sparta/serialization/checkpoint/DatabaseCheckpoint.hpp index 113156a49f..14fd2949d1 100644 --- a/sparta/sparta/serialization/checkpoint/DatabaseCheckpoint.hpp +++ b/sparta/sparta/serialization/checkpoint/DatabaseCheckpoint.hpp @@ -47,7 +47,7 @@ namespace sparta::serialization::checkpoint * \brief Checkpoint class optimized for use with database-backed * checkpointers. 
*/ - class DatabaseCheckpoint : public CheckpointBase + class DatabaseCheckpoint final : public CheckpointBase { public: @@ -55,19 +55,16 @@ namespace sparta::serialization::checkpoint //! @{ //////////////////////////////////////////////////////////////////////// - //! \brief Default constructable required for boost::serialization - DatabaseCheckpoint() = default; - //! \brief Not copy constructable DatabaseCheckpoint(const DatabaseCheckpoint&) = delete; //! \brief Non-assignable DatabaseCheckpoint& operator=(const DatabaseCheckpoint&) = delete; - //! \brief Move constructor - DatabaseCheckpoint(DatabaseCheckpoint&&) = default; + //! \brief Not move constructable + DatabaseCheckpoint(DatabaseCheckpoint&&) = delete; - //! \brief Not move-assignable + //! \brief Not move assignable DatabaseCheckpoint& operator=(DatabaseCheckpoint&&) = delete; private: @@ -81,18 +78,13 @@ namespace sparta::serialization::checkpoint bool is_snapshot, DatabaseCheckpointer* checkpointer); - //! \brief This constructor is called during checkpoint cloning - DatabaseCheckpoint(chkpt_id_t id, - tick_t tick, - chkpt_id_t prev_id, - const std::vector& next_ids, - bool is_snapshot, - const storage::VectorStorage& storage, - DatabaseCheckpointer* checkpointer); + //! \brief Default constructable required for boost::serialization of ChkptWindow + DatabaseCheckpoint() = default; //////////////////////////////////////////////////////////////////////// //! @} + friend class ChkptWindow; friend class DatabaseCheckpointer; public: @@ -116,9 +108,7 @@ namespace sparta::serialization::checkpoint /*! * \brief Writes all checkpoint raw data to an ostream. - * * \param o ostream to which raw data will be written. - * * \note No newlines or other extra characters will be appended. */ void dumpData(std::ostream& o) const override; @@ -191,9 +181,7 @@ namespace sparta::serialization::checkpoint /*! * \brief Loads delta state of this checkpoint to root. - * * \note Does not look at any other checkpoints. 
- * * \see DatabaseCheckpointer::load */ void loadState(const std::vector& dats); diff --git a/sparta/sparta/serialization/checkpoint/DatabaseCheckpointer.hpp b/sparta/sparta/serialization/checkpoint/DatabaseCheckpointer.hpp index af7c4b694f..2a9278c971 100644 --- a/sparta/sparta/serialization/checkpoint/DatabaseCheckpointer.hpp +++ b/sparta/sparta/serialization/checkpoint/DatabaseCheckpointer.hpp @@ -19,7 +19,7 @@ class DatabaseCheckpointer; * checkpoints outside this window to/from SimDB as needed using * an LRU cache. */ -class DatabaseCheckpointer : public simdb::App, public Checkpointer +class DatabaseCheckpointer final : public simdb::App, public Checkpointer { public: static constexpr auto NAME = "db-checkpointer"; @@ -343,24 +343,21 @@ class DatabaseCheckpointer : public simdb::App, public Checkpointer void dumpCheckpointNode_(const chkpt_id_t id, std::ostream& o) override; /*! - * \brief Intercept calls to Checkpointer::setHead_() and ensure we do not delete it. + * \brief Sets the head checkpointer pointer to \a head for the first + * time + * \param head New head checkpoint pointer. Must not be nullptr + * \pre Internal head pointer must be nullptr. + * \note This can only be done once */ - void setHead_(CheckpointBase* head) override; + void setHead_(DatabaseCheckpoint* head); /*! - * \brief Intercept calls to Checkpointer::setCurrent_() and ensure we do not delete it. + * \brief Sets the current checkpoint pointer. + * \param current Pointer to set as current checkpoint. The next + * checkpoint created will follow the current checkpoint set here. + * Cannot be nullptr */ - void setCurrent_(CheckpointBase* current) override; - - /*! - * \brief Set ID of head checkpoint. - */ - void setHeadID_(chkpt_id_t id); - - /*! - * \brief Set ID of current checkpoint. - */ - void setCurrentID_(chkpt_id_t id); + void setCurrent_(DatabaseCheckpoint* current); /*! 
* \brief Add the given checkpoint to the cache and bump its window to the @@ -371,17 +368,7 @@ class DatabaseCheckpointer : public simdb::App, public Checkpointer /*! * \brief Get the window ID for the given checkpoint ID. */ - window_id_t getWindowID_(chkpt_id_t id) const { - return id / (snap_thresh_ + 1); - } - - /*! - * \brief Get the window ID for the given checkpoint. - */ - template - window_id_t getWindowID_(const CheckpointPtrT& chkpt) const { - return getWindowID_(chkpt->getID()); - } + window_id_t getWindowID_(chkpt_id_t id) const; /*! * \brief Bump the given window ID to the front of the LRU cache. diff --git a/sparta/sparta/serialization/checkpoint/FastCheckpointer.hpp b/sparta/sparta/serialization/checkpoint/FastCheckpointer.hpp index e2f0121cc6..ce92e3a33b 100644 --- a/sparta/sparta/serialization/checkpoint/FastCheckpointer.hpp +++ b/sparta/sparta/serialization/checkpoint/FastCheckpointer.hpp @@ -399,7 +399,7 @@ namespace sparta::serialization::checkpoint * deleted * \return Checkpoint with ID of \a id if found or nullptr if not found */ - checkpoint_type* findCheckpoint(chkpt_id_t id) noexcept { + const checkpoint_type* findCheckpoint(chkpt_id_t id) noexcept { auto it = chkpts_.find(id); if (it != chkpts_.end()) { return static_cast(it->second.get()); diff --git a/sparta/src/DatabaseCheckpoint.cpp b/sparta/src/DatabaseCheckpoint.cpp index ae1cdea521..931381bf47 100644 --- a/sparta/src/DatabaseCheckpoint.cpp +++ b/sparta/src/DatabaseCheckpoint.cpp @@ -46,22 +46,6 @@ DatabaseCheckpoint::DatabaseCheckpoint(TreeNode& root, } } -DatabaseCheckpoint::DatabaseCheckpoint(chkpt_id_t id, - tick_t tick, - chkpt_id_t prev_id, - const std::vector& next_ids, - bool is_snapshot, - const storage::VectorStorage& storage, - DatabaseCheckpointer* checkpointer) - : CheckpointBase(id, tick) - , prev_id_(prev_id) - , next_ids_(next_ids) - , is_snapshot_(is_snapshot) - , data_(storage) - , checkpointer_(checkpointer) -{ -} - std::string DatabaseCheckpoint::stringize() 
const { std::stringstream ss; diff --git a/sparta/src/DatabaseCheckpointer.cpp b/sparta/src/DatabaseCheckpointer.cpp index 1c330ff4c2..833ce4fba0 100644 --- a/sparta/src/DatabaseCheckpointer.cpp +++ b/sparta/src/DatabaseCheckpointer.cpp @@ -21,6 +21,7 @@ namespace sparta::serialization::checkpoint using tick_t = typename CheckpointBase::tick_t; using chkpt_id_t = typename CheckpointBase::chkpt_id_t; +using window_id_t = typename DatabaseCheckpointer::window_id_t; DatabaseCheckpointer::DatabaseCheckpointer(simdb::DatabaseManager* db_mgr, TreeNode& root, Scheduler* sched) : Checkpointer(root, sched), @@ -658,32 +659,24 @@ void DatabaseCheckpointer::dumpCheckpointNode_(const chkpt_id_t id, std::ostream } } -void DatabaseCheckpointer::setHead_(CheckpointBase* head) +void DatabaseCheckpointer::setHead_(DatabaseCheckpoint* head) { - std::lock_guard lock(cache_mutex_); - setHeadID_(head->getID()); - Checkpointer::setHead_(head); -} + const auto id = head->getID(); + sparta_assert(id != checkpoint_type::UNIDENTIFIED_CHECKPOINT); + sparta_assert(head_id_ == checkpoint_type::UNIDENTIFIED_CHECKPOINT); -void DatabaseCheckpointer::setCurrent_(CheckpointBase* current) -{ std::lock_guard lock(cache_mutex_); - setCurrentID_(current->getID()); - Checkpointer::setCurrent_(current); + Checkpointer::setHead_(head); + head_id_ = id; } -void DatabaseCheckpointer::setHeadID_(chkpt_id_t id) +void DatabaseCheckpointer::setCurrent_(DatabaseCheckpoint* current) { - std::lock_guard lock(cache_mutex_); + const auto id = current->getID(); sparta_assert(id != checkpoint_type::UNIDENTIFIED_CHECKPOINT); - sparta_assert(head_id_ == checkpoint_type::UNIDENTIFIED_CHECKPOINT || head_id_ == id); - head_id_ = id; -} -void DatabaseCheckpointer::setCurrentID_(chkpt_id_t id) -{ std::lock_guard lock(cache_mutex_); - sparta_assert(id != checkpoint_type::UNIDENTIFIED_CHECKPOINT); + Checkpointer::setCurrent_(current); current_id_ = id; } @@ -700,6 +693,11 @@ void 
DatabaseCheckpointer::addToCache_(std::shared_ptr chkpt) evictWindowsIfNeeded_(); } +window_id_t DatabaseCheckpointer::getWindowID_(chkpt_id_t id) const +{ + return id / (snap_thresh_ + 1); +} + void DatabaseCheckpointer::touchWindow_(window_id_t id) { std::lock_guard lock(cache_mutex_); diff --git a/sparta/test/CMakeLists.txt b/sparta/test/CMakeLists.txt index 635f33d1bb..1f854afc28 100644 --- a/sparta/test/CMakeLists.txt +++ b/sparta/test/CMakeLists.txt @@ -23,8 +23,8 @@ add_custom_target (regress_valgrind) # NOTE: # running ctest with --test-action test creates Testing//Test.xml # that can be loaded into the CI test result tracker -add_custom_command (TARGET regress POST_BUILD COMMAND ctest -LE "${VALGRIND_TEST_LABEL}" -j${NUM_CORES} --test-action test) -add_custom_command (TARGET regress_valgrind POST_BUILD COMMAND ctest -L "${VALGRIND_TEST_LABEL}" -j${NUM_CORES} --test-action test) +add_custom_command (TARGET regress POST_BUILD COMMAND ctest -LE ${VALGRIND_TEST_LABEL} -j${NUM_CORES} --test-action test) +add_custom_command (TARGET regress_valgrind POST_BUILD COMMAND ctest -L ${VALGRIND_TEST_LABEL} -j${NUM_CORES} --test-action test) #add_subdirectory (pipeViewer) add_subdirectory (Array) From 0ae3bffd80b6eb6e2f8dcfcb393ddc8bcd8f4d91 Mon Sep 17 00:00:00 2001 From: Colby Nyce Date: Wed, 24 Sep 2025 19:42:07 -0500 Subject: [PATCH 20/30] Address PR feedback --- sparta/sparta/serialization/checkpoint/DatabaseCheckpoint.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sparta/sparta/serialization/checkpoint/DatabaseCheckpoint.hpp b/sparta/sparta/serialization/checkpoint/DatabaseCheckpoint.hpp index 14fd2949d1..8509fa4cf7 100644 --- a/sparta/sparta/serialization/checkpoint/DatabaseCheckpoint.hpp +++ b/sparta/sparta/serialization/checkpoint/DatabaseCheckpoint.hpp @@ -84,7 +84,7 @@ namespace sparta::serialization::checkpoint //////////////////////////////////////////////////////////////////////// //! 
@} - friend class ChkptWindow; + friend struct ChkptWindow; friend class DatabaseCheckpointer; public: From 81b874d30845dd7cd6ebd7f8553249f0c3529218 Mon Sep 17 00:00:00 2001 From: Colby Nyce Date: Wed, 24 Sep 2025 20:13:21 -0500 Subject: [PATCH 21/30] Remove old debug code --- sparta/test/Buffer/Buffer_test.cpp | 7 ------- 1 file changed, 7 deletions(-) diff --git a/sparta/test/Buffer/Buffer_test.cpp b/sparta/test/Buffer/Buffer_test.cpp index 3ea8825182..66d24b61a7 100644 --- a/sparta/test/Buffer/Buffer_test.cpp +++ b/sparta/test/Buffer/Buffer_test.cpp @@ -863,16 +863,9 @@ void testInvalidates() // testEraseSupport::const_reverse_iterator, sparta::Buffer>(); } -void redirectCerrToFile(const std::string& filename) -{ - static std::ofstream error_log(filename); // Ensure it stays open - std::cerr.rdbuf(error_log.rdbuf()); // Redirect cerr -} int main() { - redirectCerrToFile("Buffer_test_errors.log"); - testPointerTypes>(); testPointerTypes>(); generalTest(); From 6efd3a57cf330c073607abcea97ef42eade15c37 Mon Sep 17 00:00:00 2001 From: Colby Nyce Date: Thu, 25 Sep 2025 09:58:40 -0500 Subject: [PATCH 22/30] Add DatabaseCheckpointer::findLatestCheckpointAtOrBefore() --- .../checkpoint/DatabaseCheckpointer.hpp | 19 +++++ sparta/src/DatabaseCheckpointer.cpp | 15 ++++ .../DatabaseCheckpoint_test.cpp | 83 ++++++++++++++----- 3 files changed, 94 insertions(+), 23 deletions(-) diff --git a/sparta/sparta/serialization/checkpoint/DatabaseCheckpointer.hpp b/sparta/sparta/serialization/checkpoint/DatabaseCheckpointer.hpp index 2a9278c971..b871d8c384 100644 --- a/sparta/sparta/serialization/checkpoint/DatabaseCheckpointer.hpp +++ b/sparta/sparta/serialization/checkpoint/DatabaseCheckpointer.hpp @@ -193,6 +193,25 @@ class DatabaseCheckpointer final : public simdb::App, public Checkpointer */ std::shared_ptr findCheckpoint(chkpt_id_t id, bool must_exist=false); + /*! 
+ * \brief Finds the latest checkpoint at or before the given tick + * starting at the \a from checkpoint and working backward. + * If no checkpoints before or at tick are found, returns nullptr. + * \param tick Tick to search for + * \param from Checkpoint at which to begin searching for a tick. + * Must be a valid checkpoint known by this checkpointer. + * See hasCheckpoint. + * \return The latest checkpoint with a tick number less than or equal + * to the \a tick argument. Returns nullptr if no checkpoints at or before \a + * tick were found. It is possible for the checkpoint identified by \a + * from to be returned. + * \warning This is not a high-performance method. Generally, + * a client of this interface knows a particular ID. + * \throw CheckpointError if \a from does not refer to a valid + * checkpoint. + */ + std::shared_ptr findLatestCheckpointAtOrBefore(tick_t tick, chkpt_id_t from); + /*! * \brief Tests whether this checkpoint manager has a checkpoint with * the given id in the cache or in the database. diff --git a/sparta/src/DatabaseCheckpointer.cpp b/sparta/src/DatabaseCheckpointer.cpp index 833ce4fba0..68475588c0 100644 --- a/sparta/src/DatabaseCheckpointer.cpp +++ b/sparta/src/DatabaseCheckpointer.cpp @@ -329,6 +329,21 @@ std::shared_ptr DatabaseCheckpointer::findCheckpoint(chkpt_i return chkpt; } +std::shared_ptr DatabaseCheckpointer::findLatestCheckpointAtOrBefore(tick_t tick, chkpt_id_t from) +{ + std::lock_guard lock(cache_mutex_); + + auto chkpt = findCheckpoint(from, true); + do { + if (chkpt->getTick() <= tick) { + break; + } + chkpt = findCheckpoint(chkpt->getPrevID()); + } while (chkpt); + + return (chkpt && chkpt->getTick() <= tick) ? 
chkpt : nullptr; +} + bool DatabaseCheckpointer::hasCheckpoint(chkpt_id_t id) noexcept { return findCheckpoint(id) != nullptr; diff --git a/sparta/test/FastCheckpoint/DatabaseCheckpoint/DatabaseCheckpoint_test.cpp b/sparta/test/FastCheckpoint/DatabaseCheckpoint/DatabaseCheckpoint_test.cpp index c425090661..75bc2da122 100644 --- a/sparta/test/FastCheckpoint/DatabaseCheckpoint/DatabaseCheckpoint_test.cpp +++ b/sparta/test/FastCheckpoint/DatabaseCheckpoint/DatabaseCheckpoint_test.cpp @@ -66,18 +66,8 @@ class DummyDevice : public sparta::TreeNode {} }; -int main() +void RunCheckpointerTest(uint64_t initial_sched_tick = 0) { - auto warn_cerr = std::make_unique( - sparta::TreeNode::getVirtualGlobalNode(), - sparta::log::categories::WARN, - std::cerr); - - auto warn_file = std::make_unique( - sparta::TreeNode::getVirtualGlobalNode(), - sparta::log::categories::WARN, - "warnings.log"); - sparta::Scheduler sched; RootTreeNode clocks("clocks"); sparta::Clock clk(&clocks, "clock", &sched); @@ -122,6 +112,13 @@ int main() EXPECT_EQUAL(dbcp.getNumDeltas(), 0); EXPECT_TRUE(dbcp.getCheckpointChain(0).empty()); + // Advance the scheduler before taking the head checkpoint + if (initial_sched_tick > 0) { + sched.run(initial_sched_tick, true, false); + } + auto initial_tick = sched.getCurrentTick(); + EXPECT_EQUAL(initial_tick, initial_sched_tick); + // CHECKPOINT: Head DatabaseCheckpointer::chkpt_id_t head_id; EXPECT_NOTHROW(dbcp.createHead()); @@ -129,7 +126,7 @@ int main() EXPECT_NOTEQUAL(dbcp.getHead(), nullptr); EXPECT_EQUAL(head_id, dbcp.getHead()->getID()); EXPECT_EQUAL(dbcp.getCurrentID(), head_id); - EXPECT_EQUAL(dbcp.getCurrentTick(), 0); + EXPECT_EQUAL(dbcp.getCurrentTick(), initial_tick); EXPECT_TRUE(dbcp.isSnapshot(head_id)); std::cout << dbcp.stringize() << std::endl; @@ -176,7 +173,7 @@ int main() EXPECT_EQUAL(dbcp.getCurrentID(), id); EXPECT_EQUAL(dbcp.getNumCheckpoints(), id + 1); EXPECT_FALSE(dbcp.hasCheckpoint(id + 1)); - EXPECT_EQUAL(sched.getCurrentTick(), id); + 
EXPECT_EQUAL(sched.getCurrentTick(), id + initial_tick); auto r1_val = r1->read(); auto r2_val = r2->read(); @@ -462,13 +459,13 @@ int main() // To check the getCheckpointsAt() method, go back to the head // checkpoint. Then take a bunch of checkpoints at tick 1, 2, and 3. verif_load_chkpt(head_id); - EXPECT_EQUAL(sched.getCurrentTick(), 0); + EXPECT_EQUAL(sched.getCurrentTick(), initial_tick); std::vector chkpts_at_1; for (uint32_t i = 1; i <= 300; ++i) { const bool step_sched = (i == 1); auto id = step_checkpointer(i, step_sched); - EXPECT_EQUAL(sched.getCurrentTick(), 1); + EXPECT_EQUAL(sched.getCurrentTick(), 1 + initial_tick); chkpts_at_1.push_back(id); } @@ -476,7 +473,7 @@ int main() for (uint32_t i = 301; i <= 500; ++i) { const bool step_sched = (i == 301); auto id = step_checkpointer(i, step_sched); - EXPECT_EQUAL(sched.getCurrentTick(), 2); + EXPECT_EQUAL(sched.getCurrentTick(), 2 + initial_tick); chkpts_at_2.push_back(id); } @@ -484,22 +481,41 @@ int main() for (uint32_t i = 501; i <= 700; ++i) { const bool step_sched = (i == 501); auto id = step_checkpointer(i, step_sched); - EXPECT_EQUAL(sched.getCurrentTick(), 3); + EXPECT_EQUAL(sched.getCurrentTick(), 3 + initial_tick); chkpts_at_3.push_back(id); } - EXPECT_EQUAL(dbcp.getCheckpointsAt(1), chkpts_at_1); - EXPECT_EQUAL(dbcp.getCheckpointsAt(2), chkpts_at_2); - EXPECT_EQUAL(dbcp.getCheckpointsAt(3), chkpts_at_3); + EXPECT_EQUAL(dbcp.getCheckpointsAt(1 + initial_tick), chkpts_at_1); + EXPECT_EQUAL(dbcp.getCheckpointsAt(2 + initial_tick), chkpts_at_2); + EXPECT_EQUAL(dbcp.getCheckpointsAt(3 + initial_tick), chkpts_at_3); // Wait for the older checkpoints to be evicted and // verify getCheckpointsAt() again. 
wait_until_evicted(chkpts_at_1.back()); wait_until_evicted(chkpts_at_2.back()); - EXPECT_EQUAL(dbcp.getCheckpointsAt(1), chkpts_at_1); - EXPECT_EQUAL(dbcp.getCheckpointsAt(2), chkpts_at_2); - EXPECT_EQUAL(dbcp.getCheckpointsAt(3), chkpts_at_3); + EXPECT_EQUAL(dbcp.getCheckpointsAt(1 + initial_tick), chkpts_at_1); + EXPECT_EQUAL(dbcp.getCheckpointsAt(2 + initial_tick), chkpts_at_2); + EXPECT_EQUAL(dbcp.getCheckpointsAt(3 + initial_tick), chkpts_at_3); + + // Verify the findLatestCheckpointAtOrBefore() method. + // Valid tick (2), invalid ID (9999) + EXPECT_THROW(dbcp.findLatestCheckpointAtOrBefore(2, 9999)); + + // Valid ID (1), but tick is before the head checkpoint + if (initial_sched_tick > 0) { + EXPECT_EQUAL(dbcp.findLatestCheckpointAtOrBefore(initial_sched_tick - 1, 1), nullptr); + } + + // Valid tick (2), valid ID + EXPECT_NOTHROW(chkpt = dbcp.findLatestCheckpointAtOrBefore(2 + initial_sched_tick, chkpts_at_2.back())); + EXPECT_EQUAL(chkpt->getID(), chkpts_at_2.back()); + EXPECT_EQUAL(chkpt->getTick(), 2 + initial_sched_tick); + + // Valid tick (2), valid ID + EXPECT_NOTHROW(chkpt = dbcp.findLatestCheckpointAtOrBefore(2 + initial_sched_tick, chkpts_at_3.back())); + EXPECT_EQUAL(chkpt->getID(), chkpts_at_2.back()); + EXPECT_EQUAL(chkpt->getTick(), 2 + initial_sched_tick); // Verify that the head checkpoint is in the cache until simulation teardown. EXPECT_TRUE(dbcp.isCheckpointCached(head_id)); @@ -511,6 +527,27 @@ int main() // Ensure that the head checkpoint is no longer in the cache EXPECT_FALSE(dbcp.isCheckpointCached(head_id)); +} + +int main() +{ + auto warn_cerr = std::make_unique( + sparta::TreeNode::getVirtualGlobalNode(), + sparta::log::categories::WARN, + std::cerr); + + auto warn_file = std::make_unique( + sparta::TreeNode::getVirtualGlobalNode(), + sparta::log::categories::WARN, + "warnings.log"); + + // Run the test with initial scheduler tick = 0, + // i.e. 
head checkpoint at tick 0 + RunCheckpointerTest(0); + + // Run the test with initial scheduler tick = 10, + // i.e. head checkpoint at tick 10 + RunCheckpointerTest(10); REPORT_ERROR; return ERROR_CODE; From 4a4ac84eaf1c0f4a269f1038507c9bb93af55377 Mon Sep 17 00:00:00 2001 From: Colby Nyce Date: Thu, 25 Sep 2025 10:19:28 -0500 Subject: [PATCH 23/30] Remove O(n) code from ensureWindowLoaded_ --- sparta/src/DatabaseCheckpointer.cpp | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/sparta/src/DatabaseCheckpointer.cpp b/sparta/src/DatabaseCheckpointer.cpp index 68475588c0..61b6a46964 100644 --- a/sparta/src/DatabaseCheckpointer.cpp +++ b/sparta/src/DatabaseCheckpointer.cpp @@ -787,25 +787,38 @@ bool DatabaseCheckpointer::ensureWindowLoaded_(chkpt_id_t chkpt_id, bool must_su checkpoint_ptrs window_chkpts = getWindowFromDatabase_(win_id); if (window_chkpts.empty() && must_succeed) { throw CheckpointError("Could not find checkpoint window with ID ") << win_id; + } else if (window_chkpts.empty()) { + return false; } chkpts_cache_[win_id] = std::move(window_chkpts); } - bool success = false; - for (const auto& chkpt : chkpts_cache_[win_id]) { - if (chkpt->getID() == chkpt_id) { - success = true; - break; + auto& window = chkpts_cache_[win_id]; + if (window.empty()) { + if (must_succeed) { + throw CheckpointError("Checkpoint window with ID ") << win_id << " is empty"; } + chkpts_cache_.erase(win_id); + return false; } - if (!success && must_succeed) { - throw CheckpointError("Could not find checkpoint with ID ") << chkpt_id; + auto snapshot_id = window.front()->getID(); + if (chkpt_id < snapshot_id || chkpt_id > window.back()->getID()) { + if (must_succeed) { + throw CheckpointError("Checkpoint ID ") << chkpt_id + << " is not in the loaded checkpoint window with ID " << win_id + << " which contains checkpoints from " << snapshot_id + << " to " << window.back()->getID(); + } + return false; } + auto& chkpt = window.at(chkpt_id - 
snapshot_id); + sparta_assert(chkpt->getID() == chkpt_id); + touchWindow_(win_id); evictWindowsIfNeeded_(); - return success; + return true; } std::vector> DatabaseCheckpointer::getWindowFromDatabase_(window_id_t win_id) From a8ec19d63597e4905767ca53afb07421d717b45c Mon Sep 17 00:00:00 2001 From: Colby Nyce Date: Thu, 25 Sep 2025 10:26:37 -0500 Subject: [PATCH 24/30] Re-enable Buffer_test --- sparta/sparta/resources/Buffer.hpp | 3 ++- sparta/test/CMakeLists.txt | 4 +--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/sparta/sparta/resources/Buffer.hpp b/sparta/sparta/resources/Buffer.hpp index 9f23785ea5..0a385c3e96 100644 --- a/sparta/sparta/resources/Buffer.hpp +++ b/sparta/sparta/resources/Buffer.hpp @@ -99,8 +99,9 @@ namespace sparta public: DataPointer() { } + ~DataPointer() { } - DataPointer(DataPointer &&orig) { + DataPointer(DataPointer &&orig) noexcept { ::memcpy(&object_memory_, &orig.object_memory_, sizeof(object_memory_)); data = reinterpret_cast(&object_memory_); } diff --git a/sparta/test/CMakeLists.txt b/sparta/test/CMakeLists.txt index 1f854afc28..f32dfb0894 100644 --- a/sparta/test/CMakeLists.txt +++ b/sparta/test/CMakeLists.txt @@ -30,9 +30,7 @@ add_custom_command (TARGET regress_valgrind POST_BUILD COMMAND ctest -L ${VALGR add_subdirectory (Array) add_subdirectory (Audience) add_subdirectory (BasicHistogram) -if (NOT APPLE) - add_subdirectory (Buffer) -endif () +add_subdirectory (Buffer) add_subdirectory (Bus) add_subdirectory (cache) add_subdirectory (CachedMemory) From 713c8aab27d4d10f7bd8e2e11d3f228842d5a4c9 Mon Sep 17 00:00:00 2001 From: Colby Nyce Date: Thu, 25 Sep 2025 16:18:05 -0500 Subject: [PATCH 25/30] Move DatabaseCheckpoint test dir --- sparta/test/CMakeLists.txt | 1 + .../test/{FastCheckpoint => }/DatabaseCheckpoint/CMakeLists.txt | 0 .../DatabaseCheckpoint/DatabaseCheckpoint_test.cpp | 0 sparta/test/FastCheckpoint/CMakeLists.txt | 1 - 4 files changed, 1 insertion(+), 1 deletion(-) rename sparta/test/{FastCheckpoint => 
}/DatabaseCheckpoint/CMakeLists.txt (100%) rename sparta/test/{FastCheckpoint => }/DatabaseCheckpoint/DatabaseCheckpoint_test.cpp (100%) diff --git a/sparta/test/CMakeLists.txt b/sparta/test/CMakeLists.txt index f32dfb0894..1a80c840b2 100644 --- a/sparta/test/CMakeLists.txt +++ b/sparta/test/CMakeLists.txt @@ -44,6 +44,7 @@ add_subdirectory (ContextCounter) add_subdirectory (CycleHistogram) add_subdirectory (DAG) add_subdirectory (DAG_Ordering) +add_subdirectory (DatabaseCheckpoint) add_subdirectory (DataView) add_subdirectory (Enum) add_subdirectory (Events) diff --git a/sparta/test/FastCheckpoint/DatabaseCheckpoint/CMakeLists.txt b/sparta/test/DatabaseCheckpoint/CMakeLists.txt similarity index 100% rename from sparta/test/FastCheckpoint/DatabaseCheckpoint/CMakeLists.txt rename to sparta/test/DatabaseCheckpoint/CMakeLists.txt diff --git a/sparta/test/FastCheckpoint/DatabaseCheckpoint/DatabaseCheckpoint_test.cpp b/sparta/test/DatabaseCheckpoint/DatabaseCheckpoint_test.cpp similarity index 100% rename from sparta/test/FastCheckpoint/DatabaseCheckpoint/DatabaseCheckpoint_test.cpp rename to sparta/test/DatabaseCheckpoint/DatabaseCheckpoint_test.cpp diff --git a/sparta/test/FastCheckpoint/CMakeLists.txt b/sparta/test/FastCheckpoint/CMakeLists.txt index 5a94bdd494..2bc9fbd9ac 100644 --- a/sparta/test/FastCheckpoint/CMakeLists.txt +++ b/sparta/test/FastCheckpoint/CMakeLists.txt @@ -6,4 +6,3 @@ sparta_test(FastCheckpoint_test FastCheckpoint_test_RUN) add_subdirectory(FILEStream) add_subdirectory(PersistentFastCheckpoint) -add_subdirectory(DatabaseCheckpoint) From 967a50ff15848ecd545e530b494d5cec86ee6cb6 Mon Sep 17 00:00:00 2001 From: Colby Nyce Date: Thu, 25 Sep 2025 16:22:56 -0500 Subject: [PATCH 26/30] Change variable name in unit test --- .../DatabaseCheckpoint_test.cpp | 21 +++++++++---------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/sparta/test/DatabaseCheckpoint/DatabaseCheckpoint_test.cpp 
b/sparta/test/DatabaseCheckpoint/DatabaseCheckpoint_test.cpp index 75bc2da122..47b93d9f35 100644 --- a/sparta/test/DatabaseCheckpoint/DatabaseCheckpoint_test.cpp +++ b/sparta/test/DatabaseCheckpoint/DatabaseCheckpoint_test.cpp @@ -66,7 +66,7 @@ class DummyDevice : public sparta::TreeNode {} }; -void RunCheckpointerTest(uint64_t initial_sched_tick = 0) +void RunCheckpointerTest(uint64_t initial_tick = 0) { sparta::Scheduler sched; RootTreeNode clocks("clocks"); @@ -113,11 +113,10 @@ void RunCheckpointerTest(uint64_t initial_sched_tick = 0) EXPECT_TRUE(dbcp.getCheckpointChain(0).empty()); // Advance the scheduler before taking the head checkpoint - if (initial_sched_tick > 0) { - sched.run(initial_sched_tick, true, false); + if (initial_tick > 0) { + sched.run(initial_tick, true, false); } - auto initial_tick = sched.getCurrentTick(); - EXPECT_EQUAL(initial_tick, initial_sched_tick); + EXPECT_EQUAL(sched.getCurrentTick(), initial_tick); // CHECKPOINT: Head DatabaseCheckpointer::chkpt_id_t head_id; @@ -503,19 +502,19 @@ void RunCheckpointerTest(uint64_t initial_sched_tick = 0) EXPECT_THROW(dbcp.findLatestCheckpointAtOrBefore(2, 9999)); // Valid ID (1), but tick is before the head checkpoint - if (initial_sched_tick > 0) { - EXPECT_EQUAL(dbcp.findLatestCheckpointAtOrBefore(initial_sched_tick - 1, 1), nullptr); + if (initial_tick > 0) { + EXPECT_EQUAL(dbcp.findLatestCheckpointAtOrBefore(initial_tick - 1, 1), nullptr); } // Valid tick (2), valid ID - EXPECT_NOTHROW(chkpt = dbcp.findLatestCheckpointAtOrBefore(2 + initial_sched_tick, chkpts_at_2.back())); + EXPECT_NOTHROW(chkpt = dbcp.findLatestCheckpointAtOrBefore(2 + initial_tick, chkpts_at_2.back())); EXPECT_EQUAL(chkpt->getID(), chkpts_at_2.back()); - EXPECT_EQUAL(chkpt->getTick(), 2 + initial_sched_tick); + EXPECT_EQUAL(chkpt->getTick(), 2 + initial_tick); // Valid tick (2), valid ID - EXPECT_NOTHROW(chkpt = dbcp.findLatestCheckpointAtOrBefore(2 + initial_sched_tick, chkpts_at_3.back())); + EXPECT_NOTHROW(chkpt = 
dbcp.findLatestCheckpointAtOrBefore(2 + initial_tick, chkpts_at_3.back())); EXPECT_EQUAL(chkpt->getID(), chkpts_at_2.back()); - EXPECT_EQUAL(chkpt->getTick(), 2 + initial_sched_tick); + EXPECT_EQUAL(chkpt->getTick(), 2 + initial_tick); // Verify that the head checkpoint is in the cache until simulation teardown. EXPECT_TRUE(dbcp.isCheckpointCached(head_id)); From 4e43e97c9452d626ea8e6e9849fa782e32a19fbe Mon Sep 17 00:00:00 2001 From: Colby Nyce Date: Mon, 29 Sep 2025 11:18:57 -0500 Subject: [PATCH 27/30] Bump SimDB --- sparta/simdb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sparta/simdb b/sparta/simdb index f94fd60cc5..40a80072ed 160000 --- a/sparta/simdb +++ b/sparta/simdb @@ -1 +1 @@ -Subproject commit f94fd60cc595f0ded277131093c2106f25a41e82 +Subproject commit 40a80072ed438ccc143723239b6017b5df626a44 From ce464b9da45ecf8bf588139cf3abc2479d9f333c Mon Sep 17 00:00:00 2001 From: Colby Nyce Date: Tue, 30 Sep 2025 09:34:38 -0500 Subject: [PATCH 28/30] Bump SimDB --- sparta/simdb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sparta/simdb b/sparta/simdb index 40a80072ed..64408cd505 160000 --- a/sparta/simdb +++ b/sparta/simdb @@ -1 +1 @@ -Subproject commit 40a80072ed438ccc143723239b6017b5df626a44 +Subproject commit 64408cd5059b0b4c3236c2ca8b2f7af24ea9fcdf From a7c722852729188a582fde8939803c3cd2c04ad0 Mon Sep 17 00:00:00 2001 From: Colby Nyce Date: Tue, 30 Sep 2025 14:36:09 -0500 Subject: [PATCH 29/30] PR feedback --- .../sparta/serialization/checkpoint/Checkpointer.hpp | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/sparta/sparta/serialization/checkpoint/Checkpointer.hpp b/sparta/sparta/serialization/checkpoint/Checkpointer.hpp index 7f17aea363..1730e756a9 100644 --- a/sparta/sparta/serialization/checkpoint/Checkpointer.hpp +++ b/sparta/sparta/serialization/checkpoint/Checkpointer.hpp @@ -87,10 +87,7 @@ namespace sparta::serialization::checkpoint */ Checkpointer(TreeNode& root, sparta::Scheduler* 
sched=nullptr) : sched_(sched), - root_(root), - head_(nullptr), - current_(nullptr), - total_chkpts_created_(0) + root_(root) { } /*! @@ -742,7 +739,7 @@ namespace sparta::serialization::checkpoint * \brief Head checkpoint. This is the first checkpoint taken but cannot * be deleted. Head checkpoint memory is owned by checkpointer subclass. */ - CheckpointBase* head_; + CheckpointBase* head_ = nullptr; /*! * \brief ArchDatas required to checkpoint for this checkpointiner based @@ -753,13 +750,13 @@ namespace sparta::serialization::checkpoint /*! * \brief Most recent checkpoint created or loaded */ - CheckpointBase* current_; + CheckpointBase* current_ = nullptr; /*! * \brief Total checkpoint ever created by this instance. Monotonically * increasing. Includes the head checkpoint */ - uint64_t total_chkpts_created_; + uint64_t total_chkpts_created_ = 0; }; } // namespace sparta::serialization::checkpoint From 656e8bf94264991124397cbd34e065c2644548b5 Mon Sep 17 00:00:00 2001 From: Colby Nyce Date: Tue, 30 Sep 2025 14:55:21 -0500 Subject: [PATCH 30/30] Bump SimDB --- sparta/simdb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sparta/simdb b/sparta/simdb index 64408cd505..d35c99035b 160000 --- a/sparta/simdb +++ b/sparta/simdb @@ -1 +1 @@ -Subproject commit 64408cd5059b0b4c3236c2ca8b2f7af24ea9fcdf +Subproject commit d35c99035b8d870ed99000f44a957cad597f79c5