From a05202ab87271aab2a177f9defb6f44b7af5d49d Mon Sep 17 00:00:00 2001 From: Nick Smith Date: Fri, 18 Mar 2022 18:28:13 -0500 Subject: [PATCH 01/43] Starting S3Outputer --- CMakeLists.txt | 14 ++++++ S3Outputer.cc | 18 +++++++ S3Outputer.h | 133 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 165 insertions(+) create mode 100644 S3Outputer.cc create mode 100644 S3Outputer.h diff --git a/CMakeLists.txt b/CMakeLists.txt index a2dde77..b9da154 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -185,3 +185,17 @@ if(ENABLE_HDF5) add_test(NAME TestProductsHDFEvent COMMAND bash -c "${CMAKE_CURRENT_BINARY_DIR}/threaded_io_test -s TestProductsSource -t 1 -n 10 -o HDFEventOutputer=test_prod_e.h5") #; ${CMAKE_CURRENT_BINARY_DIR}/threaded_io_test -s HDFSource=test_prodi_e.h5 -t 1 -n 10 -o TestProductsOutputer") endif() + +option(ENABLE_S3 "Build S3 Sources and Outputers" OFF) # default OFF +if(ENABLE_S3) + if(NOT DEFINED LIBS3_DIR) + message(FATAL_ERROR "You must provide LIBS3_DIR variable") + endif() + target_sources(threaded_io_test PRIVATE + S3Outputer.cc + ) + target_include_directories(threaded_io_test PRIVATE ${LIBS3_DIR}/include) + target_link_directories(threaded_io_test PRIVATE ${LIBS3_DIR}/lib) + target_link_libraries(threaded_io_test PRIVATE s3) + # add_test(NAME S3OutputerEmptyTest COMMAND threaded_io_test EmptySource 1 1 0 10 S3Outputer) +endif() diff --git a/S3Outputer.cc b/S3Outputer.cc new file mode 100644 index 0000000..3c5dae4 --- /dev/null +++ b/S3Outputer.cc @@ -0,0 +1,18 @@ +#include "S3Outputer.h" +#include "OutputerFactory.h" +#include + +namespace cce::tf { +namespace { + class Maker : public OutputerMakerBase { + public: + Maker(): OutputerMakerBase("S3Outputer") {} + std::unique_ptr create(unsigned int iNLanes, ConfigurationParameters const& params) const final { + bool verbose = params.get("verbose",false); + return std::make_unique(iNLanes, verbose); + } + }; + + Maker s_maker; +} +} diff --git a/S3Outputer.h b/S3Outputer.h new 
file mode 100644 index 0000000..9928aed --- /dev/null +++ b/S3Outputer.h @@ -0,0 +1,133 @@ +#if !defined(S3Outputer_h) +#define S3Outputer_h + +#include +#include +#include +#include + +#include "OutputerBase.h" +#include "EventIdentifier.h" +#include "SerializerWrapper.h" +#include "DataProductRetriever.h" +#include "summarize_serializers.h" +#include "SerialTaskQueue.h" + +#include "libs3.h" + +namespace cce::tf { +class S3Outputer :public OutputerBase { + public: + S3Outputer(unsigned int iNLanes, bool iVerbose): serializers_(iNLanes), verbose_(iVerbose) {} + + void setupForLane(unsigned int iLaneIndex, std::vector const& iDPs) final { + auto& s = serializers_[iLaneIndex]; + s.reserve(iDPs.size()); + for(auto const& dp: iDPs) { + s.emplace_back(dp.name(), dp.classType()); + } + if (outputProductBuffer_.size() == 0) { + outputProductBuffer_.resize(iDPs.size()); + for (auto& p : outputProductBuffer_) { + // initialize offsets + std::get<1>(p).push_back(0); + } + } + // all lanes see same products? 
if not we'll need a map + assert(outputProductBuffer_.size() == iDPs.size()); + } + + void productReadyAsync(unsigned int iLaneIndex, DataProductRetriever const& iDataProduct, TaskHolder iCallback) const final { + assert(iLaneIndex < serializers_.size()); + auto& laneSerializers = serializers_[iLaneIndex]; + auto group = iCallback.group(); + assert(iDataProduct.index() < laneSerializers.size() ); + laneSerializers[iDataProduct.index()].doWorkAsync(*group, iDataProduct.address(), std::move(iCallback)); + } + + bool usesProductReadyAsync() const final {return true; } + + void outputAsync(unsigned int iLaneIndex, EventIdentifier const& iEventID, TaskHolder iCallback) const final { + auto start = std::chrono::high_resolution_clock::now(); + // all products + queue_.push(*iCallback.group(), [this, iEventID, iLaneIndex, callback=std::move(iCallback)]() mutable { + auto start = std::chrono::high_resolution_clock::now(); + output(iEventID, serializers_[iLaneIndex]); + serialTime_ += std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start); + callback.doneWaiting(); + }); + auto time = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start); + parallelTime_ += time.count(); + } + + void printSummary() const final { + summarize_serializers(serializers_); + std::cout <<"S3Outputer\n total serial time at end event: "< const& iSerializers) const { + using namespace std::string_literals; + if(verbose_) { + std::cout <<" run:"s+std::to_string(iEventID.run)+" lumi:"s+std::to_string(iEventID.lumi)+" event:"s+std::to_string(iEventID.event)+"\n"<blob().size()); + std::copy(s->blob().begin(), s->blob().end(), buffer.begin()+offset); + + if ( buffer.size() > productBufferFlushMinSize_ ) { + size_t bufferNevents = offsets.size() - 1; + assert(eventIDs_.size() - global_offset == bufferNevents); + if(verbose_) { + std::cout << "product buffer for "s + std::string(s->name()) + " is full ("s + std::to_string(buffer.size()) + + " bytes, "s + 
std::to_string(bufferNevents) + " events), flushing\n" << std::flush; + } + // if ( goodDivisor(bufferNevents) ) ... + // must remember chosen divisor? + std::vector offsetsOut {0}; + std::vector bufferOut; + // use current size as hint + offsetsOut.reserve(offsets.size()); + bufferOut.reserve(buffer.size()); + + global_offset += bufferNevents; + std::swap(offsets, offsetsOut); + std::swap(buffer, bufferOut); + // writeAsync(offsetsOut, bufferOut); + } + } + + // if ( eventIDs_.size() > eventFlushSize_ ) + // any buffers with global_offset > 0 should be empty + // because the sizes all evenly divide eventFlushSize_ + // the rest never got big enough, write them out now + // merge some together to respect productBufferFlushMinSize_? + } +private: + mutable std::vector> serializers_; + mutable SerialTaskQueue queue_; + + // configuration options + bool verbose_; + size_t productBufferFlushMinSize_{1024*512}; + size_t eventFlushSize_{24}; + + // starting event index (into eventIDs_), byte offset for each event, contiguous serialized product data + using ProductInfo = std::tuple, std::vector>; + // data product order matches serializers_ inner vector + mutable std::vector outputProductBuffer_; + mutable std::vector eventIDs_; + + mutable std::chrono::microseconds serialTime_; + mutable std::atomic parallelTime_; +}; +} +#endif From d6268c9033df7391bc94de010c77381b2d72f324 Mon Sep 17 00:00:00 2001 From: Nick Smith Date: Tue, 22 Mar 2022 16:51:00 -0500 Subject: [PATCH 02/43] Event flush --- S3Outputer.h | 79 +++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 60 insertions(+), 19 deletions(-) diff --git a/S3Outputer.h b/S3Outputer.h index 9928aed..65953d0 100644 --- a/S3Outputer.h +++ b/S3Outputer.h @@ -18,7 +18,14 @@ namespace cce::tf { class S3Outputer :public OutputerBase { public: - S3Outputer(unsigned int iNLanes, bool iVerbose): serializers_(iNLanes), verbose_(iVerbose) {} + S3Outputer(unsigned int iNLanes, bool iVerbose): + serializers_(iNLanes), 
+ verbose_(iVerbose), + serialTime_{std::chrono::microseconds::zero()}, + parallelTime_{0} + { + eventIDs_.reserve(eventFlushSize_); + } void setupForLane(unsigned int iLaneIndex, std::vector const& iDPs) final { auto& s = serializers_[iLaneIndex]; @@ -28,10 +35,6 @@ class S3Outputer :public OutputerBase { } if (outputProductBuffer_.size() == 0) { outputProductBuffer_.resize(iDPs.size()); - for (auto& p : outputProductBuffer_) { - // initialize offsets - std::get<1>(p).push_back(0); - } } // all lanes see same products? if not we'll need a map assert(outputProductBuffer_.size() == iDPs.size()); @@ -77,40 +80,75 @@ class S3Outputer :public OutputerBase { auto s = std::begin(iSerializers); auto p = std::begin(outputProductBuffer_); for(; s != std::end(iSerializers); ++s, ++p) { - auto& [global_offset, offsets, buffer] = *p; + auto& [global_offset, last_flush, offsets, buffer] = *p; size_t offset = buffer.size(); offsets.push_back(offset); buffer.resize(offset + s->blob().size()); std::copy(s->blob().begin(), s->blob().end(), buffer.begin()+offset); - - if ( buffer.size() > productBufferFlushMinSize_ ) { - size_t bufferNevents = offsets.size() - 1; + size_t bufferNevents = offsets.size(); + + // first flush when we exceed min size and have an even divisor of eventFlushSize_ + // subsequent flush when we reach last_flush + if ( + ((last_flush == 0) && (buffer.size() > productBufferFlushMinSize_) && (eventFlushSize_ % bufferNevents == 0)) + || (bufferNevents == last_flush) + ) + { assert(eventIDs_.size() - global_offset == bufferNevents); if(verbose_) { std::cout << "product buffer for "s + std::string(s->name()) + " is full ("s + std::to_string(buffer.size()) + " bytes, "s + std::to_string(bufferNevents) + " events), flushing\n" << std::flush; } - // if ( goodDivisor(bufferNevents) ) ... - // must remember chosen divisor? 
- std::vector offsetsOut {0}; + std::vector offsetsOut; std::vector bufferOut; // use current size as hint offsetsOut.reserve(offsets.size()); bufferOut.reserve(buffer.size()); global_offset += bufferNevents; + last_flush = bufferNevents; std::swap(offsets, offsetsOut); std::swap(buffer, bufferOut); // writeAsync(offsetsOut, bufferOut); } } - // if ( eventIDs_.size() > eventFlushSize_ ) - // any buffers with global_offset > 0 should be empty - // because the sizes all evenly divide eventFlushSize_ - // the rest never got big enough, write them out now - // merge some together to respect productBufferFlushMinSize_? + if ( eventIDs_.size() == eventFlushSize_ ) { + if(verbose_) { + std::cout << "reached event flush size "s + std::to_string(eventFlushSize_) + ", flushing\n" << std::flush; + } + // any buffers with global_offset > 0 should be empty + // because the sizes all evenly divide eventFlushSize_ + // the rest never got big enough, write them out now + // merge some together to respect productBufferFlushMinSize_? 
+ for(auto& p : outputProductBuffer_) { + auto& [global_offset, last_flush, offsets, buffer] = p; + size_t bufferNevents = offsets.size(); + assert((global_offset == 0) ^ (bufferNevents == 0)); + if (bufferNevents > 0) { + if(verbose_) { + std::cout << "product buffer for X is full ("s + std::to_string(buffer.size()) + + " bytes, "s + std::to_string(bufferNevents) + " events), flushing\n" << std::flush; + } + std::vector offsetsOut; + std::vector bufferOut; + // use current size as hint + offsetsOut.reserve(offsets.size()); + bufferOut.reserve(buffer.size()); + + global_offset += bufferNevents; + last_flush = bufferNevents; + std::swap(offsets, offsetsOut); + std::swap(buffer, bufferOut); + // writeAsync(offsetsOut, bufferOut); + } + assert(global_offset == eventFlushSize_); + global_offset = 0; + } + eventIDs_.clear(); + } } + private: mutable std::vector> serializers_; mutable SerialTaskQueue queue_; @@ -120,8 +158,11 @@ class S3Outputer :public OutputerBase { size_t productBufferFlushMinSize_{1024*512}; size_t eventFlushSize_{24}; - // starting event index (into eventIDs_), byte offset for each event, contiguous serialized product data - using ProductInfo = std::tuple, std::vector>; + // 0: starting event index (into eventIDs_) + // 1: last buffer flush size + // 2: byte offset for each event + // 3: contiguous serialized product data + using ProductInfo = std::tuple, std::vector>; // data product order matches serializers_ inner vector mutable std::vector outputProductBuffer_; mutable std::vector eventIDs_; From f3dab1aa83ddcab07f29f2d709017faed81b7adc Mon Sep 17 00:00:00 2001 From: Nick Smith Date: Wed, 23 Mar 2022 10:38:09 -0500 Subject: [PATCH 03/43] Simplify small buffer output --- S3Outputer.h | 29 +++++++---------------------- 1 file changed, 7 insertions(+), 22 deletions(-) diff --git a/S3Outputer.h b/S3Outputer.h index 65953d0..e6b3f7b 100644 --- a/S3Outputer.h +++ b/S3Outputer.h @@ -89,9 +89,11 @@ class S3Outputer :public OutputerBase { // first flush 
when we exceed min size and have an even divisor of eventFlushSize_ // subsequent flush when we reach last_flush + // always flush when we reach eventFlushSize_ (for buffers that never get big enough) if ( ((last_flush == 0) && (buffer.size() > productBufferFlushMinSize_) && (eventFlushSize_ % bufferNevents == 0)) || (bufferNevents == last_flush) + || (bufferNevents == eventFlushSize_) ) { assert(eventIDs_.size() - global_offset == bufferNevents); @@ -119,33 +121,16 @@ class S3Outputer :public OutputerBase { } // any buffers with global_offset > 0 should be empty // because the sizes all evenly divide eventFlushSize_ - // the rest never got big enough, write them out now - // merge some together to respect productBufferFlushMinSize_? for(auto& p : outputProductBuffer_) { auto& [global_offset, last_flush, offsets, buffer] = p; - size_t bufferNevents = offsets.size(); - assert((global_offset == 0) ^ (bufferNevents == 0)); - if (bufferNevents > 0) { - if(verbose_) { - std::cout << "product buffer for X is full ("s + std::to_string(buffer.size()) - + " bytes, "s + std::to_string(bufferNevents) + " events), flushing\n" << std::flush; - } - std::vector offsetsOut; - std::vector bufferOut; - // use current size as hint - offsetsOut.reserve(offsets.size()); - bufferOut.reserve(buffer.size()); - - global_offset += bufferNevents; - last_flush = bufferNevents; - std::swap(offsets, offsetsOut); - std::swap(buffer, bufferOut); - // writeAsync(offsetsOut, bufferOut); - } + assert(bufferNevents == 0); assert(global_offset == eventFlushSize_); global_offset = 0; } - eventIDs_.clear(); + std::vector eventIDsOut; + eventIDsOut.reserve(eventFlushSize_); + std::swap(eventIDs_, eventIDsOut); + // writeAsync(eventIDsOut); } } From 1de023e691eb54a1f69ff10fa97a27902b328c35 Mon Sep 17 00:00:00 2001 From: Nick Smith Date: Wed, 23 Mar 2022 23:29:53 -0500 Subject: [PATCH 04/43] Implement get and put --- CMakeLists.txt | 1 + ConfigurationParameters.cc | 5 + S3Common.cc | 286 
+++++++++++++++++++++++++++++++++++++ S3Common.h | 59 ++++++++ S3Outputer.cc | 32 ++++- S3Outputer.h | 26 ++-- 6 files changed, 397 insertions(+), 12 deletions(-) create mode 100644 S3Common.cc create mode 100644 S3Common.h diff --git a/CMakeLists.txt b/CMakeLists.txt index b9da154..1d545f1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -193,6 +193,7 @@ if(ENABLE_S3) endif() target_sources(threaded_io_test PRIVATE S3Outputer.cc + S3Common.cc ) target_include_directories(threaded_io_test PRIVATE ${LIBS3_DIR}/include) target_link_directories(threaded_io_test PRIVATE ${LIBS3_DIR}/lib) diff --git a/ConfigurationParameters.cc b/ConfigurationParameters.cc index 995ecfc..999bc0d 100644 --- a/ConfigurationParameters.cc +++ b/ConfigurationParameters.cc @@ -21,6 +21,11 @@ namespace cce::tf { return std::stof(iValue); } + template<> + unsigned long ConfigurationParameters::convert(std::string const& iValue) { + return std::stoul(iValue); + } + template<> bool ConfigurationParameters::convert(std::string const& iValue) { return (iValue.empty()) or ( diff --git a/S3Common.cc b/S3Common.cc new file mode 100644 index 0000000..c843e1d --- /dev/null +++ b/S3Common.cc @@ -0,0 +1,286 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "libs3.h" +#include "tbb/concurrent_queue.h" +#include "S3Common.h" + + +namespace cce::tf { + +class S3LibWrapper { + public: + S3LibWrapper(bool async=false) : async_(async), running_(false) { + initStatus_ = S3_initialize("s3", S3_INIT_ALL, ""); + if ( initStatus_ != S3StatusOK ) { + std::cerr << "Failed to initialize libs3, error: " << S3_get_status_name(initStatus_) << "\n"; + return; + } + running_ = true; + if ( async_ ) { + loop_ = std::thread(&S3LibWrapper::loop_body, this); + } + } + + ~S3LibWrapper() { + running_ = false; + if ( loop_.joinable() ) loop_.join(); + S3_deinitialize(); + } + + bool isAsync() { return async_; } + bool running() { return running_; } + + void get(const S3BucketContext* 
bucketCtx, const std::string key, S3Request::Callback&& cb) { + auto req = new S3Request{ + .type = S3Request::Type::get, + .bucketCtx = bucketCtx, + .key = key, + .callback = std::move(cb), + .owner = this, + }; + if ( async_ ) { + requests_.push(req); + } else { + submit(req); + } + } + + void put(const S3BucketContext* bucketCtx, const std::string key, std::vector&& value, S3Request::Callback&& cb) { + auto req = new S3Request{ + .type = S3Request::Type::put, + .bucketCtx = bucketCtx, + .key = key, + .callback = std::move(cb), + .buffer = std::move(value), + .owner = this, + }; + if ( async_ ) { + requests_.push(req); + } else { + submit(req); + } + } + + private: + void loop_body() { + S3_create_request_context(&requestContext_); + while(running_) { + using namespace std::chrono_literals; + std::this_thread::sleep_for(1s); + + // S3Status S3_get_request_context_fdsets(S3RequestContext *requestContext, fd_set *readFdSet, fd_set *writeFdSet, fd_set *exceptFdSet, int *maxFd); + // int64_t S3_get_request_context_timeout(S3RequestContext *requestContext); // milliseconds + // select() + // S3Status S3_runonce_request_context(S3RequestContext *requestContext, int *requestsRemainingReturn); + + // S3Request* req; + // concurrency limit? 
+ // while ( requests_.try_pop(req) ) submit(req); + } + S3_destroy_request_context(requestContext_); + } + + void submit(S3Request* req) { + // this function will not block if requestContext_ is not null + // which should only be the case if async_ is true + switch ( req->type ) { + case S3Request::Type::undef: + assert(false); // logic error + break; + case S3Request::Type::get: + S3_get_object( + req->bucketCtx, + req->key.c_str(), + nullptr, // S3GetConditions + 0, // startByte + 0, // byteCount + requestContext_, + req->timeout, + &S3LibWrapper::getObjectHandler, + static_cast(req)); + break; + case S3Request::Type::put: + S3_put_object( + req->bucketCtx, + req->key.c_str(), + req->buffer.size(), + nullptr, // S3PutProperties (TODO probably want .md5) + requestContext_, + req->timeout, + &S3LibWrapper::putObjectHandler, + static_cast(req)); + break; + } + } + + static S3Status responsePropertiesCallback(const S3ResponseProperties *properties, void *callbackData) { + auto req = static_cast(callbackData); + if ( req->type == S3Request::Type::get ) { + if ( properties->contentLength > 0 ) { + req->buffer.reserve(properties->contentLength); + } + // else what? + // TODO: save headers? + } + return S3StatusOK; + // perhaps S3StatusAbortedByCallback + } + + static void responseCompleteCallback(S3Status status, const S3ErrorDetails *error, void *callbackData) { + auto req = static_cast(callbackData); + if ( S3_status_is_retryable(status) && req->retriesRemaining > 0 ) { + req->retriesRemaining--; + if ( req->owner->async_ ) { + req->owner->requests_.push(req); + } else { + // can libs3 callbacks recurse? probably... + req->owner->submit(req); + } + return; // no delete! 
+ } + switch ( status ) { + case S3StatusOK: + req->status = S3Request::Status::ok; + break; + default: + req->status = S3Request::Status::error; + } + if ( req->callback ) req->callback(req); + // end of S3Request lifecycle (s3lib will always call responseCompleteCallback) + delete req; + } + + static int putObjectDataCallback(int bufferSize, char *buffer, void *callbackData) { + auto req = static_cast(callbackData); + size_t toWrite = std::min(bufferSize, (int) (req->buffer.size() - req->put_offset)); + if ( toWrite > 0 ) { + std::copy_n(req->buffer.begin() + req->put_offset, toWrite, buffer); + req->put_offset += toWrite; + } + // return > 0 = bytes written, 0 = done, -1 = S3StatusAbortedByCallback + return toWrite; + } + + static S3Status getObjectDataCallback(int bufferSize, const char *buffer, void *callbackData) { + auto req = static_cast(callbackData); + auto offset = req->buffer.size(); + req->buffer.resize(offset + bufferSize); // out of memory exception? + std::copy_n(buffer, bufferSize, req->buffer.begin() + offset); + return S3StatusOK; // can also return S3StatusAbortedByCallback + } + + constexpr static S3ResponseHandler responseHandler{ + &S3LibWrapper::responsePropertiesCallback, + &S3LibWrapper::responseCompleteCallback + }; + + constexpr static S3PutObjectHandler putObjectHandler{ + responseHandler, + &S3LibWrapper::putObjectDataCallback + }; + + constexpr static S3GetObjectHandler getObjectHandler{ + responseHandler, + &S3LibWrapper::getObjectDataCallback + }; + + private: + S3Status initStatus_; + bool async_; + std::thread loop_; + std::atomic running_; + S3RequestContext* requestContext_{nullptr}; + // all callbackData pointers are to S3Request objects + tbb::concurrent_queue requests_; +}; + +// libs3 asks us to initialize and de-initialize once per process +// optional TODO: make it a singleton and only initialize when needed +S3LibWrapper s3lib; + + +S3ConnectionRef S3Connection::from_config(std::string filename) { + std::ifstream 
fin(filename); + if (not fin.is_open()) { + std::cerr << "S3Connection config file " << filename << " could not be opened\n"; + return {}; + } + std::string hostName; + std::string bucketName; + std::string accessKeyId; + std::string secretAccessKey; + std::string securityToken; + for (std::string line; std::getline(fin, line); ) { + if ( line.empty() || line[0] == '#' ) continue; + auto delim = line.find("="); + auto key = line.substr(0, delim); + auto val = line.substr(delim+1, line.length() - 1); + if ( key == "hostName" ) hostName = val; + else if ( key == "bucketName" ) bucketName = val; + else if ( key == "accessKeyId" ) accessKeyId = val; + else if ( key == "secretAccessKey" ) secretAccessKey = val; + else if ( key == "securityToken" ) securityToken = val; + else { + std::cerr << "unrecognized config file key " << key << " in S3Connection config " << filename << "\n"; + } + } + + if ( hostName.empty() || bucketName.empty() || accessKeyId.empty() || secretAccessKey.empty() ) { + std::cerr << "S3Connection config file missing required keys\n"; + return {}; + } + + if ( not s3lib.running() ) { + return {}; + } + + S3Status status = S3_validate_bucket_name(bucketName.c_str(), S3UriStyleVirtualHost); + if ( status != S3StatusOK ) { + std::cerr << "S3 bucket name invalid: " << bucketName << "\n"; + return {}; + } + + return std::make_shared(hostName, bucketName, accessKeyId, secretAccessKey, securityToken); +}; + +S3Connection::S3Connection( + std::string_view iHostName, + std::string_view iBucketName, + std::string_view iAccessKey, + std::string_view iSecretKey, + std::string_view iSecurityToken + ) : + hostName_(iHostName), + bucketName_(iBucketName), + accessKeyId_(iAccessKey), + secretAccessKey_(iSecretKey), + securityToken_(iSecurityToken) +{ + ctx_.reset(new S3BucketContext{ + .hostName = hostName_.c_str(), + .bucketName = bucketName_.c_str(), + .protocol = S3ProtocolHTTP, + .uriStyle = S3UriStylePath, + .accessKeyId = accessKeyId_.c_str(), + 
.secretAccessKey = secretAccessKey_.c_str(), + .securityToken = securityToken_.empty() ? nullptr : securityToken_.c_str(), + .authRegion = nullptr + }); +}; + +void S3Connection::get(const std::string key, S3Request::Callback&& cb) { + s3lib.get(ctx_.get(), key, std::move(cb)); +}; + +void S3Connection::put(const std::string key, std::vector&& value, S3Request::Callback&& cb) { + s3lib.put(ctx_.get(), key, std::move(value), std::move(cb)); +}; + +} diff --git a/S3Common.h b/S3Common.h new file mode 100644 index 0000000..ff911bb --- /dev/null +++ b/S3Common.h @@ -0,0 +1,59 @@ +#if !defined(S3Common_h) +#define S3Common_h + +#include +#include + +// libs3.h +struct S3BucketContext; + +namespace cce::tf { +class S3LibWrapper; +class S3Connection; +typedef std::shared_ptr S3ConnectionRef; + +struct S3Request { + enum class Type {undef, get, put}; + enum class Status {ok, error}; + typedef std::function Callback; + + const Type type{Type::undef}; + const S3BucketContext* bucketCtx{nullptr}; + const std::string key; + const Callback callback; + std::vector buffer; + int timeout{1000}; // milliseconds + int retriesRemaining{3}; + Status status; + // "private" + S3LibWrapper *const owner{nullptr}; + size_t put_offset{0}; +}; + +class S3Connection { + public: + static S3ConnectionRef from_config(std::string filename); + + S3Connection( + std::string_view iHostName, + std::string_view iBucketName, + std::string_view iAccessKey, + std::string_view iSecretKey, + std::string_view iSecurityToken + ); + + void get(const std::string key, S3Request::Callback&& cb); + void put(const std::string key, std::vector&& value, S3Request::Callback&& cb); + + private: + const std::string hostName_; + const std::string bucketName_; + const std::string accessKeyId_; + const std::string secretAccessKey_; + const std::string securityToken_; + // holds pointers to c_str() of the above + std::unique_ptr ctx_; +}; + +} +#endif diff --git a/S3Outputer.cc b/S3Outputer.cc index 3c5dae4..01fb8d4 100644 
--- a/S3Outputer.cc +++ b/S3Outputer.cc @@ -8,8 +8,36 @@ namespace { public: Maker(): OutputerMakerBase("S3Outputer") {} std::unique_ptr create(unsigned int iNLanes, ConfigurationParameters const& params) const final { - bool verbose = params.get("verbose",false); - return std::make_unique(iNLanes, verbose); + auto verbose = params.get("verbose", 0); + auto productFlush = params.get("productFlush", 1024*512); + auto eventFlush = params.get("eventFlush", 24); + auto connfile = params.get("conn"); + if(not connfile) { + std::cerr <<"no connection configuration file name given for S3Outputer\n"; + return {}; + } + auto conn = S3Connection::from_config(connfile.value()); + if(not conn) { + return {}; + } + + std::vector tmp; + conn->get("testkey", [&tmp](S3Request* req) mutable { + if ( req->status == S3Request::Status::ok ) { + std::swap(req->buffer, tmp); + } + else { std::cout << "no key" << std::endl; } + }); + std::string_view s(tmp.data(), tmp.size()); + std::cout << tmp.size() << ": " << s << std::endl; + + conn->put("testkey2", std::move(tmp), [](S3Request* req) { + if ( req->status == S3Request::Status::ok ) { + std::cout << "wrote something!" 
<< std::endl; + } + }); + + return std::make_unique(iNLanes, verbose, productFlush, eventFlush, conn); } }; diff --git a/S3Outputer.h b/S3Outputer.h index e6b3f7b..816417b 100644 --- a/S3Outputer.h +++ b/S3Outputer.h @@ -13,14 +13,17 @@ #include "summarize_serializers.h" #include "SerialTaskQueue.h" -#include "libs3.h" +#include "S3Common.h" namespace cce::tf { -class S3Outputer :public OutputerBase { +class S3Outputer : public OutputerBase { public: - S3Outputer(unsigned int iNLanes, bool iVerbose): + S3Outputer(unsigned int iNLanes, int iVerbose, size_t iProductBufferFlush, size_t iEventFlushSize, S3ConnectionRef conn): serializers_(iNLanes), verbose_(iVerbose), + productBufferFlushMinSize_(iProductBufferFlush), + eventFlushSize_(iEventFlushSize), + conn_(conn), serialTime_{std::chrono::microseconds::zero()}, parallelTime_{0} { @@ -64,7 +67,9 @@ class S3Outputer :public OutputerBase { } void printSummary() const final { - summarize_serializers(serializers_); + if(verbose_ >= 2) { + summarize_serializers(serializers_); + } std::cout <<"S3Outputer\n total serial time at end event: "< const& iSerializers) const { using namespace std::string_literals; - if(verbose_) { + if(verbose_ >= 2) { std::cout <<" run:"s+std::to_string(iEventID.run)+" lumi:"s+std::to_string(iEventID.lumi)+" event:"s+std::to_string(iEventID.event)+"\n"<= 2) { std::cout << "product buffer for "s + std::string(s->name()) + " is full ("s + std::to_string(buffer.size()) + " bytes, "s + std::to_string(bufferNevents) + " events), flushing\n" << std::flush; } @@ -116,7 +121,7 @@ class S3Outputer :public OutputerBase { } if ( eventIDs_.size() == eventFlushSize_ ) { - if(verbose_) { + if(verbose_ >= 2) { std::cout << "reached event flush size "s + std::to_string(eventFlushSize_) + ", flushing\n" << std::flush; } // any buffers with global_offset > 0 should be empty @@ -139,9 +144,10 @@ class S3Outputer :public OutputerBase { mutable SerialTaskQueue queue_; // configuration options - bool verbose_; - 
size_t productBufferFlushMinSize_{1024*512}; - size_t eventFlushSize_{24}; + int verbose_; + size_t productBufferFlushMinSize_; + size_t eventFlushSize_; + S3ConnectionRef conn_; // 0: starting event index (into eventIDs_) // 1: last buffer flush size From 78b67bdc60fc79aecabf2bc2f4d362f476d6ad53 Mon Sep 17 00:00:00 2001 From: Nick Smith Date: Wed, 1 Jun 2022 11:53:36 -0500 Subject: [PATCH 05/43] Implement basic product and event output Missing tail of events if number to be processed (-n) is not divisible by eventFlush parameter --- S3Outputer.h | 99 ++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 80 insertions(+), 19 deletions(-) diff --git a/S3Outputer.h b/S3Outputer.h index 816417b..38f8309 100644 --- a/S3Outputer.h +++ b/S3Outputer.h @@ -6,13 +6,14 @@ #include #include +#include "tbb/task_group.h" + #include "OutputerBase.h" #include "EventIdentifier.h" #include "SerializerWrapper.h" #include "DataProductRetriever.h" #include "summarize_serializers.h" #include "SerialTaskQueue.h" - #include "S3Common.h" namespace cce::tf { @@ -58,9 +59,8 @@ class S3Outputer : public OutputerBase { // all products queue_.push(*iCallback.group(), [this, iEventID, iLaneIndex, callback=std::move(iCallback)]() mutable { auto start = std::chrono::high_resolution_clock::now(); - output(iEventID, serializers_[iLaneIndex]); + output(iEventID, serializers_[iLaneIndex], std::move(callback)); serialTime_ += std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start); - callback.doneWaiting(); }); auto time = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start); parallelTime_ += time.count(); @@ -70,12 +70,19 @@ class S3Outputer : public OutputerBase { if(verbose_ >= 2) { summarize_serializers(serializers_); } + std::chrono::microseconds serializerTime = std::chrono::microseconds::zero(); + for(const auto& lane : serializers_) { + for(const auto& s : lane) { + serializerTime += s.accumulatedTime(); + } + } std::cout 
<<"S3Outputer\n total serial time at end event: "< const& iSerializers) const { + void output(EventIdentifier const& iEventID, std::vector const& iSerializers, TaskHolder iCallback) const { using namespace std::string_literals; if(verbose_ >= 2) { std::cout <<" run:"s+std::to_string(iEventID.run)+" lumi:"s+std::to_string(iEventID.lumi)+" event:"s+std::to_string(iEventID.event)+"\n"<blob().size()); @@ -93,15 +100,15 @@ class S3Outputer : public OutputerBase { size_t bufferNevents = offsets.size(); // first flush when we exceed min size and have an even divisor of eventFlushSize_ - // subsequent flush when we reach last_flush + // subsequent flush when we reach productFlushSize // always flush when we reach eventFlushSize_ (for buffers that never get big enough) if ( - ((last_flush == 0) && (buffer.size() > productBufferFlushMinSize_) && (eventFlushSize_ % bufferNevents == 0)) - || (bufferNevents == last_flush) + ((productFlushSize == 0) && (buffer.size() > productBufferFlushMinSize_) && (eventFlushSize_ % bufferNevents == 0)) + || (bufferNevents == productFlushSize) || (bufferNevents == eventFlushSize_) ) { - assert(eventIDs_.size() - global_offset == bufferNevents); + assert(eventGlobalOffset_ + eventIDs_.size() - productGlobalOffset == bufferNevents); if(verbose_ >= 2) { std::cout << "product buffer for "s + std::string(s->name()) + " is full ("s + std::to_string(buffer.size()) + " bytes, "s + std::to_string(bufferNevents) + " events), flushing\n" << std::flush; @@ -112,11 +119,36 @@ class S3Outputer : public OutputerBase { offsetsOut.reserve(offsets.size()); bufferOut.reserve(buffer.size()); - global_offset += bufferNevents; - last_flush = bufferNevents; std::swap(offsets, offsetsOut); std::swap(buffer, bufferOut); - // writeAsync(offsetsOut, bufferOut); + iCallback.group()->run( + [this, productName=s->name(), productGlobalOffset, offsets=std::move(offsetsOut), buffer=std::move(bufferOut), callback=iCallback]() { + auto start = 
std::chrono::high_resolution_clock::now(); + // concatenate offsets and buffer + std::vector finalbuf( + sizeof(decltype(productGlobalOffset)) + + offsets.size()*sizeof(decltype(offsets)::value_type) + + buffer.size() + ); + auto it = std::begin(finalbuf); + it = std::copy_n(reinterpret_cast(&productGlobalOffset), sizeof(decltype(productGlobalOffset)), it); + it = std::copy(std::begin(offsets), std::end(offsets), it); + it = std::copy(std::begin(buffer), std::end(buffer), it); + // TODO can we clear offsets and buffer yet? + assert(it == std::end(finalbuf)); + std::string name(productName); + name += std::to_string(productGlobalOffset); + conn_->put(name, std::move(finalbuf), [name, callback=std::move(callback)](S3Request* req) { + if ( req->status != S3Request::Status::ok ) { + std::cerr << "failed to write product buffer " << name << std::endl; + } + }); + auto time = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start); + parallelTime_ += time.count(); + } + ); + productGlobalOffset += bufferNevents; + productFlushSize = bufferNevents; } } @@ -124,18 +156,46 @@ class S3Outputer : public OutputerBase { if(verbose_ >= 2) { std::cout << "reached event flush size "s + std::to_string(eventFlushSize_) + ", flushing\n" << std::flush; } - // any buffers with global_offset > 0 should be empty - // because the sizes all evenly divide eventFlushSize_ + // all buffers should be empty because the sizes all evenly divide eventFlushSize_ for(auto& p : outputProductBuffer_) { - auto& [global_offset, last_flush, offsets, buffer] = p; + auto& [productGlobalOffset, productFlushSize, offsets, buffer] = p; assert(bufferNevents == 0); - assert(global_offset == eventFlushSize_); - global_offset = 0; + assert(productGlobalOffset == eventGlobalOffset_ + eventFlushSize_); } std::vector eventIDsOut; eventIDsOut.reserve(eventFlushSize_); std::swap(eventIDs_, eventIDsOut); - // writeAsync(eventIDsOut); + iCallback.group()->run( + [this, 
events=std::move(eventIDsOut), callback=iCallback]() { + auto start = std::chrono::high_resolution_clock::now(); + + // serialize EventIdentifier + constexpr unsigned int headerBufferSizeInWords = 5; + std::vector finalbuf(headerBufferSizeInWords*4*events.size()); + auto it = begin(finalbuf); + size_t iev = eventGlobalOffset_; + std::array buffer; + for(const auto& ev : events) { + buffer[0] = iev++; + buffer[1] = ev.run; + buffer[2] = ev.lumi; + buffer[3] = (ev.event >> 32) & 0xFFFFFFFF; + buffer[4] = ev.event & 0xFFFFFFFF; + it = std::copy(begin(buffer), end(buffer), it); + } + assert(it == std::end(finalbuf)); + // TODO: can we clear events? + std::string name = std::to_string(eventGlobalOffset_); + conn_->put(name, std::move(finalbuf), [name, callback=std::move(callback)](S3Request* req) { + if ( req->status != S3Request::Status::ok ) { + std::cerr << "failed to write event index buffer " << name << std::endl; + } + }); + auto time = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start); + parallelTime_ += time.count(); + } + ); + eventGlobalOffset_ += eventFlushSize_; } } @@ -157,6 +217,7 @@ class S3Outputer : public OutputerBase { // data product order matches serializers_ inner vector mutable std::vector outputProductBuffer_; mutable std::vector eventIDs_; + mutable size_t eventGlobalOffset_; mutable std::chrono::microseconds serialTime_; mutable std::atomic parallelTime_; From 2787f35f148f271c9e89a9733dd65ea1cfe21878 Mon Sep 17 00:00:00 2001 From: Nick Smith Date: Thu, 16 Jun 2022 10:49:08 -0500 Subject: [PATCH 06/43] Migrate to using protobuf --- CMakeLists.txt | 7 ++ S3Common.cc | 35 ++++++++-- S3Common.h | 4 +- S3Outputer.cc | 162 +++++++++++++++++++++++++++++++++++++------ S3Outputer.h | 169 ++++++++++----------------------------------- objectstripe.proto | 33 +++++++++ 6 files changed, 249 insertions(+), 161 deletions(-) create mode 100644 objectstripe.proto diff --git a/CMakeLists.txt b/CMakeLists.txt index 1d545f1..54153ec 
100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -191,9 +191,16 @@ if(ENABLE_S3) if(NOT DEFINED LIBS3_DIR) message(FATAL_ERROR "You must provide LIBS3_DIR variable") endif() + find_package(Protobuf REQUIRED) + include_directories(${Protobuf_INCLUDE_DIRS}) + include_directories(${CMAKE_CURRENT_BINARY_DIR}) + protobuf_generate_cpp(PROTO_SRCS PROTO_HDRS objectstripe.proto) + target_link_libraries(threaded_io_test PRIVATE ${Protobuf_LIBRARIES}) target_sources(threaded_io_test PRIVATE S3Outputer.cc + # S3Source.cc S3Common.cc + ${PROTO_SRCS} ) target_include_directories(threaded_io_test PRIVATE ${LIBS3_DIR}/include) target_link_directories(threaded_io_test PRIVATE ${LIBS3_DIR}/lib) diff --git a/S3Common.cc b/S3Common.cc index c843e1d..25ccb6f 100644 --- a/S3Common.cc +++ b/S3Common.cc @@ -24,6 +24,7 @@ class S3LibWrapper { } running_ = true; if ( async_ ) { + throw std::runtime_error("Async not supported yet"); loop_ = std::thread(&S3LibWrapper::loop_body, this); } } @@ -38,6 +39,7 @@ class S3LibWrapper { bool running() { return running_; } void get(const S3BucketContext* bucketCtx, const std::string key, S3Request::Callback&& cb) { + // start of S3Request lifecycle (s3lib will always call responseCompleteCallback) auto req = new S3Request{ .type = S3Request::Type::get, .bucketCtx = bucketCtx, @@ -52,7 +54,8 @@ class S3LibWrapper { } } - void put(const S3BucketContext* bucketCtx, const std::string key, std::vector&& value, S3Request::Callback&& cb) { + void put(const S3BucketContext* bucketCtx, const std::string key, std::string&& value, S3Request::Callback&& cb) { + // start of S3Request lifecycle (s3lib will always call responseCompleteCallback) auto req = new S3Request{ .type = S3Request::Type::put, .bucketCtx = bucketCtx, @@ -137,6 +140,8 @@ class S3LibWrapper { auto req = static_cast(callbackData); if ( S3_status_is_retryable(status) && req->retriesRemaining > 0 ) { req->retriesRemaining--; + // TODO: back-off algo? 
+ // https://aws.amazon.com/blogs/architecture/exponential-backoff-and-jitter/ if ( req->owner->async_ ) { req->owner->requests_.push(req); } else { @@ -263,6 +268,10 @@ S3Connection::S3Connection( secretAccessKey_(iSecretKey), securityToken_(iSecurityToken) { + if ( hostName_ == "devnull") { + // magic do-nothing connection + return; + } ctx_.reset(new S3BucketContext{ .hostName = hostName_.c_str(), .bucketName = bucketName_.c_str(), @@ -276,11 +285,29 @@ S3Connection::S3Connection( }; void S3Connection::get(const std::string key, S3Request::Callback&& cb) { - s3lib.get(ctx_.get(), key, std::move(cb)); + if ( ctx_ ) { + s3lib.get(ctx_.get(), key, std::move(cb)); + } else if ( cb ) { + S3Request dummy{ + .type = S3Request::Type::get, + .key = key, + .status = S3Request::Status::error + }; + cb(&dummy); + } }; -void S3Connection::put(const std::string key, std::vector&& value, S3Request::Callback&& cb) { - s3lib.put(ctx_.get(), key, std::move(value), std::move(cb)); +void S3Connection::put(const std::string key, std::string&& value, S3Request::Callback&& cb) { + if ( ctx_ ) { + s3lib.put(ctx_.get(), key, std::move(value), std::move(cb)); + } else if ( cb ) { + S3Request dummy{ + .type = S3Request::Type::put, + .key = key, + .status = S3Request::Status::ok + }; + cb(&dummy); + } }; } diff --git a/S3Common.h b/S3Common.h index ff911bb..67b3384 100644 --- a/S3Common.h +++ b/S3Common.h @@ -21,7 +21,7 @@ struct S3Request { const S3BucketContext* bucketCtx{nullptr}; const std::string key; const Callback callback; - std::vector buffer; + std::string buffer; int timeout{1000}; // milliseconds int retriesRemaining{3}; Status status; @@ -43,7 +43,7 @@ class S3Connection { ); void get(const std::string key, S3Request::Callback&& cb); - void put(const std::string key, std::vector&& value, S3Request::Callback&& cb); + void put(const std::string key, std::string&& value, S3Request::Callback&& cb); private: const std::string hostName_; diff --git a/S3Outputer.cc b/S3Outputer.cc 
index 01fb8d4..1741c01 100644 --- a/S3Outputer.cc +++ b/S3Outputer.cc @@ -1,14 +1,147 @@ +#include #include "S3Outputer.h" #include "OutputerFactory.h" -#include -namespace cce::tf { +using namespace cce::tf; + +void S3Outputer::output( + EventIdentifier const& iEventID, + std::vector const& iSerializers, + TaskHolder iCallback + ) const +{ + using namespace std::string_literals; + if(verbose_ >= 2) { + std::cout <<" run:"s+std::to_string(iEventID.run)+" lumi:"s+std::to_string(iEventID.lumi)+" event:"s+std::to_string(iEventID.event)+"\n"<set_offset(eventGlobalOffset_++); + sev->set_run(iEventID.run); + sev->set_lumi(iEventID.lumi); + sev->set_event(iEventID.event); + + auto s = std::begin(iSerializers); + auto p = std::begin(currentProductStripes_); + auto pi = index_.mutable_products()->begin(); + for(; s != std::end(iSerializers); ++s, ++p, ++pi) { + size_t offset = p->content().size(); + p->add_offsets(offset); + p->mutable_content()->append(s->blob().begin(), s->blob().end()); + } + + flushProductStripes(iCallback); + + if ( currentEventStripe_.events_size() == eventFlushSize_ ) { + if(verbose_ >= 2) { + std::cout << "reached event flush size "s + std::to_string(eventFlushSize_) + ", flushing\n" << std::flush; + } + flushEventStripe(iCallback); + } +} + + +void S3Outputer::flushProductStripes(TaskHolder iCallback, bool last) const { + using namespace std::string_literals; + auto p = currentProductStripes_.begin(); + auto pi = index_.mutable_products()->begin(); + for(; p != std::end(currentProductStripes_); ++p, ++pi) { + size_t offset = p->content().size(); + size_t bufferNevents = p->offsets_size(); + + // first flush when we exceed min size and have an even divisor of eventFlushSize_ + // subsequent flush when we reach productFlushSize + // always flush when we reach eventFlushSize_ (for buffers that never get big enough) + // flush if last call and we have something to write + if ( + ((pi->flushsize() == 0) && (offset > productBufferFlushMinSize_) && 
(eventFlushSize_ % bufferNevents == 0)) + || (bufferNevents == pi->flushsize()) + || (bufferNevents == eventFlushSize_) + || (last && bufferNevents > 0) + ) + { + if(verbose_ >= 2) { + std::cout << "product buffer for "s + std::string(pi->productname()) + " is full ("s + std::to_string(offset) + + " bytes, "s + std::to_string(bufferNevents) + " events), flushing\n" << std::flush; + } + objstripe::ProductStripe pOut; + pOut.mutable_offsets()->Reserve(bufferNevents); + pOut.mutable_content()->reserve(offset); + std::swap(*p, pOut); + std::string name = objPrefix_; + name += pi->productname(); + name += std::to_string(eventGlobalOffset_ - bufferNevents); + iCallback.group()->run( + [this, name=std::move(name), pOut=std::move(pOut), callback=iCallback]() { + auto start = std::chrono::high_resolution_clock::now(); + std::string finalbuf; + pOut.SerializeToString(&finalbuf); + conn_->put(name, std::move(finalbuf), [name=std::move(name), callback=std::move(callback)](S3Request* req) { + if ( req->status != S3Request::Status::ok ) { + std::cerr << "failed to write product buffer " << name << std::endl; + } + }); + auto time = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start); + parallelTime_ += time.count(); + } + ); + if ( pi->flushsize() == 0 ) { + pi->set_flushsize(bufferNevents); + } + } + } +} + +void S3Outputer::flushEventStripe(TaskHolder iCallback, bool last) const { + if ( not last ) { + // all buffers should be empty because the sizes all evenly divide eventFlushSize_ + for(auto& p : currentProductStripes_) { + assert(p->offsets_size() == 0); + } + } + objstripe::EventStripe stripeOut; + stripeOut.mutable_events()->Reserve(eventFlushSize_); + std::swap(currentEventStripe_, stripeOut); + // TODO: are we sure writing to dest is threadsafe? 
+ auto dest = index_.add_packedeventstripes(); + index_.set_totalevents(eventGlobalOffset_); + iCallback.group()->run( + [this, dest, stripeOut=std::move(stripeOut), callback=iCallback]() { + auto start = std::chrono::high_resolution_clock::now(); + // TODO: compression + stripeOut.SerializeToString(dest); + if ( verbose_ >= 2 ) { + std::cout << "length of packed EventStripe: " << dest->size() << "\n"; + std::cout << stripeOut.DebugString() << "\n"; + } + + // TODO: checkpoint only every few event stripes? + if ( verbose_ >= 2 ) { + std::cout << index_.DebugString() << "\n"; + } + std::string indexOut; + index_.SerializeToString(&indexOut); + conn_->put(objPrefix_ + "index", std::move(indexOut), [callback=std::move(callback)](S3Request* req) { + if ( req->status != S3Request::Status::ok ) { + std::cerr << "failed to write product buffer index" << std::endl; + } + }); + auto time = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start); + parallelTime_ += time.count(); + } + ); +} + namespace { - class Maker : public OutputerMakerBase { +class Maker : public OutputerMakerBase { public: Maker(): OutputerMakerBase("S3Outputer") {} std::unique_ptr create(unsigned int iNLanes, ConfigurationParameters const& params) const final { auto verbose = params.get("verbose", 0); + auto objPrefix = params.get("prefix"); + if(not objPrefix) { + std::cerr << "no object prefix given for S3Outputer\n"; + return {}; + } auto productFlush = params.get("productFlush", 1024*512); auto eventFlush = params.get("eventFlush", 24); auto connfile = params.get("conn"); @@ -21,26 +154,9 @@ namespace { return {}; } - std::vector tmp; - conn->get("testkey", [&tmp](S3Request* req) mutable { - if ( req->status == S3Request::Status::ok ) { - std::swap(req->buffer, tmp); - } - else { std::cout << "no key" << std::endl; } - }); - std::string_view s(tmp.data(), tmp.size()); - std::cout << tmp.size() << ": " << s << std::endl; - - conn->put("testkey2", std::move(tmp), [](S3Request* 
req) { - if ( req->status == S3Request::Status::ok ) { - std::cout << "wrote something!" << std::endl; - } - }); - - return std::make_unique(iNLanes, verbose, productFlush, eventFlush, conn); + return std::make_unique(iNLanes, objPrefix.value(), verbose, productFlush, eventFlush, conn); } - }; +}; - Maker s_maker; -} +Maker s_maker; } diff --git a/S3Outputer.h b/S3Outputer.h index 38f8309..ed2fe06 100644 --- a/S3Outputer.h +++ b/S3Outputer.h @@ -15,12 +15,15 @@ #include "summarize_serializers.h" #include "SerialTaskQueue.h" #include "S3Common.h" +#include "FunctorTask.h" +#include "objectstripe.pb.h" namespace cce::tf { class S3Outputer : public OutputerBase { public: - S3Outputer(unsigned int iNLanes, int iVerbose, size_t iProductBufferFlush, size_t iEventFlushSize, S3ConnectionRef conn): + S3Outputer(unsigned int iNLanes, std::string objPrefix, int iVerbose, size_t iProductBufferFlush, size_t iEventFlushSize, S3ConnectionRef conn): serializers_(iNLanes), + objPrefix_(objPrefix), verbose_(iVerbose), productBufferFlushMinSize_(iProductBufferFlush), eventFlushSize_(iEventFlushSize), @@ -28,7 +31,8 @@ class S3Outputer : public OutputerBase { serialTime_{std::chrono::microseconds::zero()}, parallelTime_{0} { - eventIDs_.reserve(eventFlushSize_); + index_.set_eventstripesize(eventFlushSize_); + currentEventStripe_.mutable_events()->Reserve(eventFlushSize_); } void setupForLane(unsigned int iLaneIndex, std::vector const& iDPs) final { @@ -37,11 +41,17 @@ class S3Outputer : public OutputerBase { for(auto const& dp: iDPs) { s.emplace_back(dp.name(), dp.classType()); } - if (outputProductBuffer_.size() == 0) { - outputProductBuffer_.resize(iDPs.size()); + if (currentProductStripes_.size() == 0) { + currentProductStripes_.resize(iDPs.size()); + index_.mutable_products()->Reserve(iDPs.size()); + for(auto const& dp: iDPs) { + auto prod = index_.add_products(); + prod->set_productname(dp.name()); + prod->set_flushsize(0); + } } // all lanes see same products? 
if not we'll need a map - assert(outputProductBuffer_.size() == iDPs.size()); + assert(currentProductStripes_.size() == iDPs.size()); } void productReadyAsync(unsigned int iLaneIndex, DataProductRetriever const& iDataProduct, TaskHolder iCallback) const final { @@ -67,6 +77,18 @@ class S3Outputer : public OutputerBase { } void printSummary() const final { + { + tbb::task_group group; + { + auto start = std::chrono::high_resolution_clock::now(); + TaskHolder th(group, make_functor_task([](){})); + flushProductStripes(th, true); + flushEventStripe(th, true); + serialTime_ += std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start); + } + group.wait(); + } + if(verbose_ >= 2) { summarize_serializers(serializers_); } @@ -81,143 +103,26 @@ class S3Outputer : public OutputerBase { " total serializer parallel time at end event: "< const& iSerializers, TaskHolder iCallback) const { - using namespace std::string_literals; - if(verbose_ >= 2) { - std::cout <<" run:"s+std::to_string(iEventID.run)+" lumi:"s+std::to_string(iEventID.lumi)+" event:"s+std::to_string(iEventID.event)+"\n"<blob().size()); - std::copy(s->blob().begin(), s->blob().end(), buffer.begin()+offset); - size_t bufferNevents = offsets.size(); - - // first flush when we exceed min size and have an even divisor of eventFlushSize_ - // subsequent flush when we reach productFlushSize - // always flush when we reach eventFlushSize_ (for buffers that never get big enough) - if ( - ((productFlushSize == 0) && (buffer.size() > productBufferFlushMinSize_) && (eventFlushSize_ % bufferNevents == 0)) - || (bufferNevents == productFlushSize) - || (bufferNevents == eventFlushSize_) - ) - { - assert(eventGlobalOffset_ + eventIDs_.size() - productGlobalOffset == bufferNevents); - if(verbose_ >= 2) { - std::cout << "product buffer for "s + std::string(s->name()) + " is full ("s + std::to_string(buffer.size()) - + " bytes, "s + std::to_string(bufferNevents) + " events), flushing\n" << std::flush; - } - 
std::vector offsetsOut; - std::vector bufferOut; - // use current size as hint - offsetsOut.reserve(offsets.size()); - bufferOut.reserve(buffer.size()); - - std::swap(offsets, offsetsOut); - std::swap(buffer, bufferOut); - iCallback.group()->run( - [this, productName=s->name(), productGlobalOffset, offsets=std::move(offsetsOut), buffer=std::move(bufferOut), callback=iCallback]() { - auto start = std::chrono::high_resolution_clock::now(); - // concatenate offsets and buffer - std::vector finalbuf( - sizeof(decltype(productGlobalOffset)) - + offsets.size()*sizeof(decltype(offsets)::value_type) - + buffer.size() - ); - auto it = std::begin(finalbuf); - it = std::copy_n(reinterpret_cast(&productGlobalOffset), sizeof(decltype(productGlobalOffset)), it); - it = std::copy(std::begin(offsets), std::end(offsets), it); - it = std::copy(std::begin(buffer), std::end(buffer), it); - // TODO can we clear offsets and buffer yet? - assert(it == std::end(finalbuf)); - std::string name(productName); - name += std::to_string(productGlobalOffset); - conn_->put(name, std::move(finalbuf), [name, callback=std::move(callback)](S3Request* req) { - if ( req->status != S3Request::Status::ok ) { - std::cerr << "failed to write product buffer " << name << std::endl; - } - }); - auto time = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start); - parallelTime_ += time.count(); - } - ); - productGlobalOffset += bufferNevents; - productFlushSize = bufferNevents; - } - } - - if ( eventIDs_.size() == eventFlushSize_ ) { - if(verbose_ >= 2) { - std::cout << "reached event flush size "s + std::to_string(eventFlushSize_) + ", flushing\n" << std::flush; - } - // all buffers should be empty because the sizes all evenly divide eventFlushSize_ - for(auto& p : outputProductBuffer_) { - auto& [productGlobalOffset, productFlushSize, offsets, buffer] = p; - assert(bufferNevents == 0); - assert(productGlobalOffset == eventGlobalOffset_ + eventFlushSize_); - } - std::vector eventIDsOut; 
- eventIDsOut.reserve(eventFlushSize_); - std::swap(eventIDs_, eventIDsOut); - iCallback.group()->run( - [this, events=std::move(eventIDsOut), callback=iCallback]() { - auto start = std::chrono::high_resolution_clock::now(); - - // serialize EventIdentifier - constexpr unsigned int headerBufferSizeInWords = 5; - std::vector finalbuf(headerBufferSizeInWords*4*events.size()); - auto it = begin(finalbuf); - size_t iev = eventGlobalOffset_; - std::array buffer; - for(const auto& ev : events) { - buffer[0] = iev++; - buffer[1] = ev.run; - buffer[2] = ev.lumi; - buffer[3] = (ev.event >> 32) & 0xFFFFFFFF; - buffer[4] = ev.event & 0xFFFFFFFF; - it = std::copy(begin(buffer), end(buffer), it); - } - assert(it == std::end(finalbuf)); - // TODO: can we clear events? - std::string name = std::to_string(eventGlobalOffset_); - conn_->put(name, std::move(finalbuf), [name, callback=std::move(callback)](S3Request* req) { - if ( req->status != S3Request::Status::ok ) { - std::cerr << "failed to write event index buffer " << name << std::endl; - } - }); - auto time = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start); - parallelTime_ += time.count(); - } - ); - eventGlobalOffset_ += eventFlushSize_; - } - } - private: + void output(EventIdentifier const& iEventID, std::vector const& iSerializers, TaskHolder iCallback) const; + void flushProductStripes(TaskHolder iCallback, bool last=false) const; + void flushEventStripe(TaskHolder iCallback, bool last=false) const; + mutable std::vector> serializers_; mutable SerialTaskQueue queue_; // configuration options int verbose_; + std::string objPrefix_; size_t productBufferFlushMinSize_; size_t eventFlushSize_; S3ConnectionRef conn_; - // 0: starting event index (into eventIDs_) - // 1: last buffer flush size - // 2: byte offset for each event - // 3: contiguous serialized product data - using ProductInfo = std::tuple, std::vector>; - // data product order matches serializers_ inner vector - mutable std::vector 
outputProductBuffer_; - mutable std::vector eventIDs_; - mutable size_t eventGlobalOffset_; + // mutated only by methods called in queue_ + mutable objstripe::ObjectStripeIndex index_; + mutable objstripe::EventStripe currentEventStripe_; + mutable std::vector currentProductStripes_; + mutable size_t eventGlobalOffset_{0}; mutable std::chrono::microseconds serialTime_; mutable std::atomic parallelTime_; diff --git a/objectstripe.proto b/objectstripe.proto new file mode 100644 index 0000000..a4d12d1 --- /dev/null +++ b/objectstripe.proto @@ -0,0 +1,33 @@ +syntax = "proto2"; + +package objstripe; + +message ObjectStripeIndex { + optional uint32 eventStripeSize = 1; + optional uint64 totalEvents = 2; + + message Product { + optional string productName = 1; + optional uint32 flushSize = 2; + } + + repeated Product products = 3; + + repeated bytes packedEventStripes = 4; +} + +message EventStripe { + message Event { + optional uint64 offset = 1; + optional uint32 run = 2; + optional uint32 lumi = 3; + optional uint64 event = 4; + } + + repeated Event events = 1; +} + +message ProductStripe { + repeated uint32 offsets = 1 [packed = true]; + optional bytes content = 2; +} From 8582432c4ce430aaa89d0a2b5879277c20d321f1 Mon Sep 17 00:00:00 2001 From: Nick Smith Date: Thu, 23 Jun 2022 22:38:51 -0500 Subject: [PATCH 07/43] Functional S3Source --- CMakeLists.txt | 2 +- S3Common.cc | 92 ++++++++++------- S3Common.h | 61 ++++++++--- S3Outputer.cc | 82 +++++++++++++-- S3Outputer.h | 55 ++-------- S3Source.cc | 248 +++++++++++++++++++++++++++++++++++++++++++++ S3Source.h | 102 +++++++++++++++++++ SharedSourceBase.h | 2 +- objectstripe.proto | 12 ++- 9 files changed, 545 insertions(+), 111 deletions(-) create mode 100644 S3Source.cc create mode 100644 S3Source.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 54153ec..b8e71ce 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -198,7 +198,7 @@ if(ENABLE_S3) target_link_libraries(threaded_io_test PRIVATE ${Protobuf_LIBRARIES}) 
target_sources(threaded_io_test PRIVATE S3Outputer.cc - # S3Source.cc + S3Source.cc S3Common.cc ${PROTO_SRCS} ) diff --git a/S3Common.cc b/S3Common.cc index 25ccb6f..0bfc994 100644 --- a/S3Common.cc +++ b/S3Common.cc @@ -6,6 +6,7 @@ #include #include #include +#include #include "libs3.h" #include "tbb/concurrent_queue.h" @@ -40,13 +41,7 @@ class S3LibWrapper { void get(const S3BucketContext* bucketCtx, const std::string key, S3Request::Callback&& cb) { // start of S3Request lifecycle (s3lib will always call responseCompleteCallback) - auto req = new S3Request{ - .type = S3Request::Type::get, - .bucketCtx = bucketCtx, - .key = key, - .callback = std::move(cb), - .owner = this, - }; + auto req = new S3Request(S3Request::Type::get, bucketCtx, key, std::move(cb), this); if ( async_ ) { requests_.push(req); } else { @@ -56,14 +51,7 @@ class S3LibWrapper { void put(const S3BucketContext* bucketCtx, const std::string key, std::string&& value, S3Request::Callback&& cb) { // start of S3Request lifecycle (s3lib will always call responseCompleteCallback) - auto req = new S3Request{ - .type = S3Request::Type::put, - .bucketCtx = bucketCtx, - .key = key, - .callback = std::move(cb), - .buffer = std::move(value), - .owner = this, - }; + auto req = new S3Request(S3Request::Type::put, bucketCtx, key, std::move(cb), this, std::move(value)); if ( async_ ) { requests_.push(req); } else { @@ -105,7 +93,7 @@ class S3LibWrapper { 0, // startByte 0, // byteCount requestContext_, - req->timeout, + req->_timeout, &S3LibWrapper::getObjectHandler, static_cast(req)); break; @@ -116,7 +104,7 @@ class S3LibWrapper { req->buffer.size(), nullptr, // S3PutProperties (TODO probably want .md5) requestContext_, - req->timeout, + req->_timeout, &S3LibWrapper::putObjectHandler, static_cast(req)); break; @@ -138,15 +126,29 @@ class S3LibWrapper { static void responseCompleteCallback(S3Status status, const S3ErrorDetails *error, void *callbackData) { auto req = static_cast(callbackData); - if ( 
S3_status_is_retryable(status) && req->retriesRemaining > 0 ) { - req->retriesRemaining--; - // TODO: back-off algo? - // https://aws.amazon.com/blogs/architecture/exponential-backoff-and-jitter/ - if ( req->owner->async_ ) { - req->owner->requests_.push(req); + if ( S3_status_is_retryable(status) && req->_retries_executed < req->retries ) { + if ( status == S3Status::S3StatusErrorRequestTimeout ) { + // https://aws.amazon.com/blogs/architecture/exponential-backoff-and-jitter/ + static thread_local std::minstd_rand rng(std::hash{}(std::this_thread::get_id())); + std::uniform_int_distribution dist(0l, std::min(S3Request::max_timeout.count(), req->_timeout)); + auto dt = std::chrono::milliseconds(dist(rng)); + if ( req->_owner->async_ ) { + // TODO: how to async sleep? + } else { + std::cerr << "Got status " << S3_get_status_name(status) << " while running request " << req << ", sleeping for " << dt.count() << "\n"; + // TODO: better option? + std::this_thread::sleep_for(dt); + req->_timeout *= 2; + } + } else { + std::cerr << "Got status " << S3_get_status_name(status) << " while running request " << req << "\n"; + } + req->_retries_executed++; + if ( req->_owner->async_ ) { + req->_owner->requests_.push(req); } else { // can libs3 callbacks recurse? probably... - req->owner->submit(req); + req->_owner->submit(req); } return; // no delete! 
} @@ -164,10 +166,11 @@ class S3LibWrapper { static int putObjectDataCallback(int bufferSize, char *buffer, void *callbackData) { auto req = static_cast(callbackData); - size_t toWrite = std::min(bufferSize, (int) (req->buffer.size() - req->put_offset)); + int toWrite = std::min(bufferSize, (int) (req->buffer.size() - req->_put_offset)); + assert(toWrite >= 0); if ( toWrite > 0 ) { - std::copy_n(req->buffer.begin() + req->put_offset, toWrite, buffer); - req->put_offset += toWrite; + std::copy_n(req->buffer.begin() + req->_put_offset, toWrite, buffer); + req->_put_offset += toWrite; } // return > 0 = bytes written, 0 = done, -1 = S3StatusAbortedByCallback return toWrite; @@ -210,6 +213,29 @@ class S3LibWrapper { // optional TODO: make it a singleton and only initialize when needed S3LibWrapper s3lib; +std::ostream& operator<<(std::ostream& os, const S3Request& req) { + os << "S3Request("; + switch (req.type) { + case S3Request::Type::undef: + os << "undef"; break; + case S3Request::Type::get: + os << "get"; break; + case S3Request::Type::put: + os << "put"; break; + } + os << ", key=" << req.key << ", timeout=" << req.timeout.count() << "ms, retries=" << req.retries; + os << ", buffer length=" << req.buffer.size() << ", "; + switch (req.status) { + case S3Request::Status::waiting: + os << "waiting"; break; + case S3Request::Status::ok: + os << "ok"; break; + case S3Request::Status::error: + os << "error"; break; + } + os << ") (put offset: " << req._put_offset << ", retries executed: " << req._retries_executed << ")"; + return os; +} S3ConnectionRef S3Connection::from_config(std::string filename) { std::ifstream fin(filename); @@ -288,11 +314,7 @@ void S3Connection::get(const std::string key, S3Request::Callback&& cb) { if ( ctx_ ) { s3lib.get(ctx_.get(), key, std::move(cb)); } else if ( cb ) { - S3Request dummy{ - .type = S3Request::Type::get, - .key = key, - .status = S3Request::Status::error - }; + S3Request dummy(S3Request::Type::get, key, 
S3Request::Status::error); cb(&dummy); } }; @@ -301,11 +323,7 @@ void S3Connection::put(const std::string key, std::string&& value, S3Request::Ca if ( ctx_ ) { s3lib.put(ctx_.get(), key, std::move(value), std::move(cb)); } else if ( cb ) { - S3Request dummy{ - .type = S3Request::Type::put, - .key = key, - .status = S3Request::Status::ok - }; + S3Request dummy(S3Request::Type::put, key, S3Request::Status::ok); cb(&dummy); } }; diff --git a/S3Common.h b/S3Common.h index 67b3384..9b8346a 100644 --- a/S3Common.h +++ b/S3Common.h @@ -1,8 +1,10 @@ #if !defined(S3Common_h) #define S3Common_h +#include #include #include +#include // libs3.h struct S3BucketContext; @@ -12,24 +14,51 @@ class S3LibWrapper; class S3Connection; typedef std::shared_ptr S3ConnectionRef; -struct S3Request { - enum class Type {undef, get, put}; - enum class Status {ok, error}; - typedef std::function Callback; - - const Type type{Type::undef}; - const S3BucketContext* bucketCtx{nullptr}; - const std::string key; - const Callback callback; - std::string buffer; - int timeout{1000}; // milliseconds - int retriesRemaining{3}; - Status status; - // "private" - S3LibWrapper *const owner{nullptr}; - size_t put_offset{0}; +class S3Request { + public: + enum class Type {undef, get, put}; + enum class Status {waiting, ok, error}; + typedef std::function Callback; + static constexpr std::chrono::milliseconds max_timeout{60000}; + + const Type type; + const S3BucketContext* bucketCtx; + const std::string key; + const Callback callback; + const std::chrono::milliseconds timeout{1000}; + const int retries{5}; + std::string buffer; + Status status; + + private: + S3Request() = delete; + // constructor for devnull connection + S3Request(Type iType, std::string iKey, Status stat): + type{iType}, key{iKey}, status{stat} {}; + // get constructor + S3Request(Type iType, const S3BucketContext* iCtx, std::string iKey, Callback iCb, S3LibWrapper* iOwner): + type{iType}, bucketCtx{iCtx}, key{iKey}, callback{iCb}, 
_owner{iOwner} + { + _timeout = timeout.count(); + }; + // put constructor + S3Request(Type iType, const S3BucketContext* iCtx, std::string iKey, Callback iCb, S3LibWrapper* iOwner, std::string&& buf): + type{iType}, bucketCtx{iCtx}, key{iKey}, callback{iCb}, _owner{iOwner}, buffer{buf} + { + _timeout = timeout.count(); + }; + + S3LibWrapper *const _owner{nullptr}; + size_t _put_offset{0}; + int _retries_executed{0}; + long _timeout; + + friend class S3LibWrapper; + friend class S3Connection; + friend std::ostream& operator<<(std::ostream& os, const S3Request& req); }; + class S3Connection { public: static S3ConnectionRef from_config(std::string filename); diff --git a/S3Outputer.cc b/S3Outputer.cc index 1741c01..4934703 100644 --- a/S3Outputer.cc +++ b/S3Outputer.cc @@ -1,12 +1,70 @@ #include #include "S3Outputer.h" #include "OutputerFactory.h" +#include "UnrolledSerializerWrapper.h" +#include "FunctorTask.h" using namespace cce::tf; +void S3Outputer::setupForLane(unsigned int iLaneIndex, std::vector const& iDPs) { + auto& s = serializers_[iLaneIndex]; + switch(index_.serializestrategy()) { + case objstripe::SerializeStrategy::kRoot: + s = SerializeStrategy::make>(); + break; + case objstripe::SerializeStrategy::kRootUnrolled: + s = SerializeStrategy::make>(); + break; + default: + throw std::runtime_error("S3Outputer: unrecognized serialization strategy"); + } + s.reserve(iDPs.size()); + for(auto const& dp: iDPs) { + s.emplace_back(dp.name(), dp.classType()); + } + if (currentProductStripes_.size() == 0) { + currentProductStripes_.resize(iDPs.size()); + index_.mutable_products()->Reserve(iDPs.size()); + for(auto const& ss: s) { + auto prod = index_.add_products(); + prod->set_productname(std::string(ss.name())); + prod->set_producttype(ss.className()); + prod->set_flushsize(0); + } + } + // all lanes see same products? 
if not we'll need a map + assert(currentProductStripes_.size() == iDPs.size()); +} + +void S3Outputer::printSummary() const { + { + tbb::task_group group; + { + auto start = std::chrono::high_resolution_clock::now(); + TaskHolder th(group, make_functor_task([](){})); + flushProductStripes(th, true); + flushEventStripe(th, true); + serialTime_ += std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start); + } + group.wait(); + } + + if(verbose_ >= 2) { + summarize_serializers(serializers_); + } + std::chrono::microseconds serializerTime = std::chrono::microseconds::zero(); + for(const auto& lane : serializers_) { + for(const auto& s : lane) { + serializerTime += s.accumulatedTime(); + } + } + std::cout <<"S3Outputer\n total serial time at end event: "< const& iSerializers, + SerializeStrategy const& iSerializers, TaskHolder iCallback ) const { @@ -20,13 +78,20 @@ void S3Outputer::output( sev->set_lumi(iEventID.lumi); sev->set_event(iEventID.event); - auto s = std::begin(iSerializers); auto p = std::begin(currentProductStripes_); auto pi = index_.mutable_products()->begin(); - for(; s != std::end(iSerializers); ++s, ++p, ++pi) { - size_t offset = p->content().size(); - p->add_offsets(offset); - p->mutable_content()->append(s->blob().begin(), s->blob().end()); + for(const auto& s : iSerializers) { + if (verbose_ >= 2) { + std::cout << "adding blob len " << s.blob().size() << " to " << pi->productname() << "\n"; + for (auto c : s.blob()) { + if ( isprint(c) ) std::cout << c; + else std::cout << "\\x" << std::hex << (int) c << std::dec; + } + std::cout << "\n"; + } + p->mutable_content()->append(s.blob().begin(), s.blob().end()); + p->add_offsets(p->content().size()); + p++; pi++; } flushProductStripes(iCallback); @@ -98,12 +163,15 @@ void S3Outputer::flushEventStripe(TaskHolder iCallback, bool last) const { assert(p->offsets_size() == 0); } } + index_.set_totalevents(eventGlobalOffset_); + if ( last and currentEventStripe_.events_size() == 0 ) { + 
return; + } objstripe::EventStripe stripeOut; stripeOut.mutable_events()->Reserve(eventFlushSize_); std::swap(currentEventStripe_, stripeOut); // TODO: are we sure writing to dest is threadsafe? auto dest = index_.add_packedeventstripes(); - index_.set_totalevents(eventGlobalOffset_); iCallback.group()->run( [this, dest, stripeOut=std::move(stripeOut), callback=iCallback]() { auto start = std::chrono::high_resolution_clock::now(); diff --git a/S3Outputer.h b/S3Outputer.h index ed2fe06..acf57fc 100644 --- a/S3Outputer.h +++ b/S3Outputer.h @@ -10,12 +10,11 @@ #include "OutputerBase.h" #include "EventIdentifier.h" -#include "SerializerWrapper.h" +#include "SerializeStrategy.h" #include "DataProductRetriever.h" #include "summarize_serializers.h" #include "SerialTaskQueue.h" #include "S3Common.h" -#include "FunctorTask.h" #include "objectstripe.pb.h" namespace cce::tf { @@ -32,27 +31,12 @@ class S3Outputer : public OutputerBase { parallelTime_{0} { index_.set_eventstripesize(eventFlushSize_); + // TODO: make configurable + index_.set_serializestrategy(objstripe::SerializeStrategy::kRoot); currentEventStripe_.mutable_events()->Reserve(eventFlushSize_); } - void setupForLane(unsigned int iLaneIndex, std::vector const& iDPs) final { - auto& s = serializers_[iLaneIndex]; - s.reserve(iDPs.size()); - for(auto const& dp: iDPs) { - s.emplace_back(dp.name(), dp.classType()); - } - if (currentProductStripes_.size() == 0) { - currentProductStripes_.resize(iDPs.size()); - index_.mutable_products()->Reserve(iDPs.size()); - for(auto const& dp: iDPs) { - auto prod = index_.add_products(); - prod->set_productname(dp.name()); - prod->set_flushsize(0); - } - } - // all lanes see same products? 
if not we'll need a map - assert(currentProductStripes_.size() == iDPs.size()); - } + void setupForLane(unsigned int iLaneIndex, std::vector const& iDPs) final; void productReadyAsync(unsigned int iLaneIndex, DataProductRetriever const& iDataProduct, TaskHolder iCallback) const final { assert(iLaneIndex < serializers_.size()); @@ -76,39 +60,14 @@ class S3Outputer : public OutputerBase { parallelTime_ += time.count(); } - void printSummary() const final { - { - tbb::task_group group; - { - auto start = std::chrono::high_resolution_clock::now(); - TaskHolder th(group, make_functor_task([](){})); - flushProductStripes(th, true); - flushEventStripe(th, true); - serialTime_ += std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start); - } - group.wait(); - } - - if(verbose_ >= 2) { - summarize_serializers(serializers_); - } - std::chrono::microseconds serializerTime = std::chrono::microseconds::zero(); - for(const auto& lane : serializers_) { - for(const auto& s : lane) { - serializerTime += s.accumulatedTime(); - } - } - std::cout <<"S3Outputer\n total serial time at end event: "< const& iSerializers, TaskHolder iCallback) const; + void output(EventIdentifier const& iEventID, SerializeStrategy const& iSerializers, TaskHolder iCallback) const; void flushProductStripes(TaskHolder iCallback, bool last=false) const; void flushEventStripe(TaskHolder iCallback, bool last=false) const; - mutable std::vector> serializers_; + mutable std::vector serializers_; mutable SerialTaskQueue queue_; // configuration options diff --git a/S3Source.cc b/S3Source.cc new file mode 100644 index 0000000..e0d209e --- /dev/null +++ b/S3Source.cc @@ -0,0 +1,248 @@ +#include +#include "S3Source.h" +#include "SourceFactory.h" +#include "Deserializer.h" +#include "UnrolledDeserializer.h" + +using namespace cce::tf; + +S3Source::S3Source(unsigned int iNLanes, std::string iObjPrefix, int iVerbose, unsigned long long iNEvents, S3ConnectionRef conn): + SharedSourceBase(iNEvents), + 
objPrefix_(iObjPrefix), + verbose_(iVerbose), + conn_(conn), + readTime_{std::chrono::microseconds::zero()} +{ + auto start = std::chrono::high_resolution_clock::now(); + + conn->get(objPrefix_ + "index", [this](S3Request* req) mutable { + if ( req->status == S3Request::Status::ok ) { + if ( not index_.ParseFromString(req->buffer) ) { + throw std::runtime_error("Could not deserialize index in S3Source construction"); + } + } + else { throw std::runtime_error("Could not retrieve index in S3Source construction"); } + }); + if ( verbose_ >= 3 ) { + std::cout << index_.DebugString() << "\n"; + } + + if ( index_.totalevents() < iNEvents ) { + std::cout << "WARNING: less events in source than requested: " + << index_.totalevents() << " vs. " << iNEvents << ". Will read all available events instead.\n"; + } + + currentProductStripes_.resize(index_.products_size()); + + laneInfos_.reserve(iNLanes); + for(unsigned int i = 0; i< iNLanes; ++i) { + DeserializeStrategy strategy; + switch(index_.serializestrategy()) { + case objstripe::SerializeStrategy::kRoot: + strategy = DeserializeStrategy::make>(); + break; + case objstripe::SerializeStrategy::kRootUnrolled: + strategy = DeserializeStrategy::make>(); + break; + } + laneInfos_.emplace_back(index_, std::move(strategy)); + } + + readTime_ += std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start); +} + +S3Source::LaneInfo::LaneInfo(objstripe::ObjectStripeIndex const& index, DeserializeStrategy deserialize): + deserializers_{std::move(deserialize)} +{ + dataProducts_.reserve(index.products_size()); + dataBuffers_.resize(index.products_size(), nullptr); + deserializers_.reserve(index.products_size()); + size_t i{0}; + for(auto const& pi : index.products()) { + TClass* cls = TClass::GetClass(pi.producttype().c_str()); + if ( cls == nullptr ) { + throw std::runtime_error("No TClass reflection available for " + pi.productname()); + } + dataBuffers_[i] = cls->New(); + dataProducts_.emplace_back(i, 
&dataBuffers_[i], pi.productname(), cls, &delayedRetriever_); + deserializers_.emplace_back(cls); + ++i; + } +} + +S3Source::LaneInfo::~LaneInfo() { + auto it = dataProducts_.begin(); + for( void * b: dataBuffers_) { + it->classType()->Destructor(b); + ++it; + } +} + +std::pair DelayedProductStripeRetriever::bufferAt(size_t globalEventIndex) const { + std::call_once(flag_, [this](){ + conn_->get(name_, [this](S3Request* req) { + if ( req->status == S3Request::Status::ok ) { + if ( not data_.ParseFromString(req->buffer) ) { + throw std::runtime_error("Could not deserialize ProductStripe for key " + name_); + } + } + else { throw std::runtime_error("Could not retrieve ProductStripe for key " + name_); } + }); + }); + assert(globalOffset_ <= globalEventIndex); + size_t offset = globalEventIndex - globalOffset_; + assert(offset < data_.offsets_size()); + size_t bstart = (offset == 0) ? 0 : data_.offsets(offset-1); + size_t bstop = data_.offsets(offset); + return {&data_.content()[bstart], bstop - bstart}; +} + +size_t S3Source::numberOfDataProducts() const { + return laneInfos_[0].dataProducts_.size(); +} + +std::vector& S3Source::dataProducts(unsigned int iLane, long iEventIndex) { + return laneInfos_[iLane].dataProducts_; +} + +EventIdentifier S3Source::eventIdentifier(unsigned int iLane, long iEventIndex) { + return laneInfos_[iLane].eventID_; +} + +void S3Source::readEventAsync(unsigned int iLane, long iEventIndex, OptionalTaskHolder iTask) { + queue_.push(*iTask.group(), [iLane, optTask = std::move(iTask), this]() mutable { + auto start = std::chrono::high_resolution_clock::now(); + if( + (nextEventStripe_ < index_.packedeventstripes_size()) + or (nextEventInStripe_ < currentEventStripe_.events_size()) + ) + { + // default-constructed currentEventStripe_ will have size zero, so 0, 0 will load first stripe + if(nextEventInStripe_ == currentEventStripe_.events_size()) { + // need to read ahead + // TODO: compression + 
currentEventStripe_.ParseFromString(index_.packedeventstripes(nextEventStripe_++)); + nextEventInStripe_ = 0; + } + const auto event = currentEventStripe_.events(nextEventInStripe_); + if ( verbose_ >= 2 ) std::cout << event.DebugString() << "\n"; + size_t globalEventIndex = event.offset(); + laneInfos_[iLane].eventID_.run = event.run(); + laneInfos_[iLane].eventID_.lumi = event.lumi(); + laneInfos_[iLane].eventID_.event = event.event(); + auto stripes = std::vector>(); + stripes.reserve(currentProductStripes_.size()); + size_t i{0}; + for (auto& ps : currentProductStripes_) { + const auto& productinfo = index_.products(i++); + if ( nextEventInStripe_ % productinfo.flushsize() == 0 ) { + auto new_ps = std::make_shared( + conn_, + objPrefix_ + productinfo.productname() + std::to_string(globalEventIndex), + globalEventIndex + ); + std::swap(ps, new_ps); + } + stripes.push_back(ps); + } + + ++nextEventInStripe_; + auto group = optTask.group(); + group->run([this, task=optTask.releaseToTaskHolder(), iLane, globalEventIndex, stripes=std::move(stripes)]() { + auto& laneInfo = this->laneInfos_[iLane]; + + auto it_stripe = stripes.cbegin(); + auto it_deserialize = laneInfo.deserializers_.begin(); + auto it_product = laneInfo.dataProducts_.begin(); + while ( it_stripe != stripes.cend() ) { + auto start = std::chrono::high_resolution_clock::now(); + auto [buf, len] = (*it_stripe)->bufferAt(globalEventIndex); + if ( verbose_ >= 3 ) { + std::cout << "got buffer " << uint64_t(buf) << " len " << len << "\n"; + for (size_t i=0; i(split - start); + auto readSize = (*it_deserialize).deserialize(buf, len, *it_product->address()); + if ( verbose_ >= 3 ) std::cout << "read " << readSize << " bytes\n"; + it_product->setSize(readSize); + ++it_stripe; + ++it_deserialize; + ++it_product; + laneInfo.deserializeTime_ += std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - split); + } + }); + } + readTime_ 
+=std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start); + }); +} + +void S3Source::printSummary() const { + std::cout <<"\nSource:\n" + " serial read time: "< create(unsigned int iNLanes, unsigned long long iNEvents, ConfigurationParameters const& params) const final { + auto verbose = params.get("verbose", 0); + auto objPrefix = params.get("prefix"); + if(not objPrefix) { + std::cerr << "no object prefix given for S3Outputer\n"; + return {}; + } + auto connfile = params.get("conn"); + if(not connfile) { + std::cerr <<"no connection configuration file name given for S3Source\n"; + return {}; + } + auto conn = S3Connection::from_config(connfile.value()); + if(not conn) { + return {}; + } + + return std::make_unique(iNLanes, objPrefix.value(), verbose, iNEvents, conn); + } + }; + +Maker s_maker; +} diff --git a/S3Source.h b/S3Source.h new file mode 100644 index 0000000..2e74dd7 --- /dev/null +++ b/S3Source.h @@ -0,0 +1,102 @@ +#if !defined(S3Source_h) +#define S3Source_h + +#include +#include +#include +#include +#include + +#include "SharedSourceBase.h" +#include "DataProductRetriever.h" +#include "DelayedProductRetriever.h" +#include "SerialTaskQueue.h" +#include "DeserializeStrategy.h" +#include "S3Common.h" +#include "objectstripe.pb.h" + + +namespace cce::tf { +class S3DelayedRetriever : public DelayedProductRetriever { + void getAsync(DataProductRetriever&, int index, TaskHolder) final {} +}; + +class DelayedProductStripeRetriever { + public: + DelayedProductStripeRetriever(S3ConnectionRef conn, std::string name, size_t globalOffset): + conn_(conn), name_(name), globalOffset_(globalOffset) {}; + std::pair bufferAt(size_t globalEventIndex) const; + ~DelayedProductStripeRetriever() {}; + + private: + S3ConnectionRef conn_; + std::string name_; + size_t globalOffset_; + + mutable std::once_flag flag_; + mutable objstripe::ProductStripe data_; +}; + +class S3Source : public SharedSourceBase { + public: + S3Source(unsigned int iNLanes, 
std::string iObjPrefix, int iVerbose, unsigned long long iNEvents, S3ConnectionRef conn); + S3Source(S3Source&&) = delete; + S3Source(S3Source const&) = delete; + ~S3Source() = default; + + size_t numberOfDataProducts() const final; + std::vector& dataProducts(unsigned int iLane, long iEventIndex) final; + EventIdentifier eventIdentifier(unsigned int iLane, long iEventIndex) final; + + void printSummary() const final; + + bool mayBeAbleToGoToEvent(long int iEventIndex) const final { + return SharedSourceBase::mayBeAbleToGoToEvent(iEventIndex) and (iEventIndex < index_.totalevents()); + }; + + private: + void readEventAsync(unsigned int iLane, long iEventIndex, OptionalTaskHolder) final; + + std::chrono::microseconds serialReadTime() const; + std::chrono::microseconds parallelReadTime() const; + std::chrono::microseconds decompressTime() const; + std::chrono::microseconds deserializeTime() const; + + int verbose_; + std::string objPrefix_; + S3ConnectionRef conn_; + SerialTaskQueue queue_; + + objstripe::ObjectStripeIndex index_; + + struct LaneInfo { + LaneInfo(objstripe::ObjectStripeIndex const&, DeserializeStrategy); + + LaneInfo(LaneInfo&&) = default; + LaneInfo(LaneInfo const&) = delete; + + LaneInfo& operator=(LaneInfo&&) = default; + LaneInfo& operator=(LaneInfo const&) = delete; + + EventIdentifier eventID_; + std::vector dataProducts_; + std::vector dataBuffers_; + DeserializeStrategy deserializers_; + S3DelayedRetriever delayedRetriever_; + std::chrono::microseconds readTime_{0}; + std::chrono::microseconds decompressTime_{0}; + std::chrono::microseconds deserializeTime_{0}; + ~LaneInfo(); + }; + + size_t nextEventStripe_ = 0; + size_t nextEventInStripe_ = 0; + objstripe::EventStripe currentEventStripe_; + std::vector> currentProductStripes_; + + std::vector laneInfos_; + std::chrono::microseconds readTime_; +}; +} + +#endif diff --git a/SharedSourceBase.h b/SharedSourceBase.h index 754744b..49757c0 100644 --- a/SharedSourceBase.h +++ b/SharedSourceBase.h 
@@ -20,7 +20,7 @@ class SharedSourceBase { virtual std::vector& dataProducts(unsigned int iLane, long iEventIndex) = 0; virtual EventIdentifier eventIdentifier(unsigned int iLane, long iEventIndex) = 0; - bool mayBeAbleToGoToEvent(long int iEventIndex) const; + virtual bool mayBeAbleToGoToEvent(long int iEventIndex) const; //returns false if can immediately tell that can not continue processing void gotoEventAsync(unsigned int iLane, long iEventIndex, OptionalTaskHolder); diff --git a/objectstripe.proto b/objectstripe.proto index a4d12d1..c8a37a8 100644 --- a/objectstripe.proto +++ b/objectstripe.proto @@ -2,18 +2,26 @@ syntax = "proto2"; package objstripe; +enum SerializeStrategy { + kRoot = 0; + kRootUnrolled = 1; +} + message ObjectStripeIndex { optional uint32 eventStripeSize = 1; optional uint64 totalEvents = 2; message Product { optional string productName = 1; - optional uint32 flushSize = 2; + optional string productType = 2; + optional uint32 flushSize = 3; } repeated Product products = 3; repeated bytes packedEventStripes = 4; + + optional SerializeStrategy serializeStrategy = 5; } message EventStripe { @@ -25,9 +33,11 @@ message EventStripe { } repeated Event events = 1; + // TODO: store product flushSize here? } message ProductStripe { + // TODO: global offset (as a cross check) repeated uint32 offsets = 1 [packed = true]; optional bytes content = 2; } From 3172651b37786b3d0a105cf61a7e05fe20089c23 Mon Sep 17 00:00:00 2001 From: Nick Smith Date: Thu, 23 Jun 2022 23:11:40 -0500 Subject: [PATCH 08/43] Reset put offset! --- S3Common.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/S3Common.cc b/S3Common.cc index 0bfc994..4d6838d 100644 --- a/S3Common.cc +++ b/S3Common.cc @@ -135,14 +135,14 @@ class S3LibWrapper { if ( req->_owner->async_ ) { // TODO: how to async sleep? 
} else { - std::cerr << "Got status " << S3_get_status_name(status) << " while running request " << req << ", sleeping for " << dt.count() << "\n"; // TODO: better option? std::this_thread::sleep_for(dt); req->_timeout *= 2; } } else { - std::cerr << "Got status " << S3_get_status_name(status) << " while running request " << req << "\n"; + std::cerr << "Got status " << S3_get_status_name(status) << " while running request " << *req << ", retrying\n"; } + req->_put_offset = 0; req->_retries_executed++; if ( req->_owner->async_ ) { req->_owner->requests_.push(req); From 3573d8a36226429e3204402d1ca1b1d26a0612e1 Mon Sep 17 00:00:00 2001 From: Nick Smith Date: Fri, 24 Jun 2022 09:22:32 -0500 Subject: [PATCH 09/43] Should not have overrode it --- S3Source.h | 4 ---- SharedSourceBase.h | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/S3Source.h b/S3Source.h index 2e74dd7..d9765ab 100644 --- a/S3Source.h +++ b/S3Source.h @@ -50,10 +50,6 @@ class S3Source : public SharedSourceBase { void printSummary() const final; - bool mayBeAbleToGoToEvent(long int iEventIndex) const final { - return SharedSourceBase::mayBeAbleToGoToEvent(iEventIndex) and (iEventIndex < index_.totalevents()); - }; - private: void readEventAsync(unsigned int iLane, long iEventIndex, OptionalTaskHolder) final; diff --git a/SharedSourceBase.h b/SharedSourceBase.h index 49757c0..754744b 100644 --- a/SharedSourceBase.h +++ b/SharedSourceBase.h @@ -20,7 +20,7 @@ class SharedSourceBase { virtual std::vector& dataProducts(unsigned int iLane, long iEventIndex) = 0; virtual EventIdentifier eventIdentifier(unsigned int iLane, long iEventIndex) = 0; - virtual bool mayBeAbleToGoToEvent(long int iEventIndex) const; + bool mayBeAbleToGoToEvent(long int iEventIndex) const; //returns false if can immediately tell that can not continue processing void gotoEventAsync(unsigned int iLane, long iEventIndex, OptionalTaskHolder); From 6f36afd39719f35b25c855e3e08b8e8aeea38964 Mon Sep 17 00:00:00 2001 From: 
Nick Smith Date: Fri, 24 Jun 2022 14:33:32 -0500 Subject: [PATCH 10/43] Need a virtual destructor to clean up lambdas --- SerialTaskQueue.h | 1 + 1 file changed, 1 insertion(+) diff --git a/SerialTaskQueue.h b/SerialTaskQueue.h index b110a29..602eb57 100644 --- a/SerialTaskQueue.h +++ b/SerialTaskQueue.h @@ -121,6 +121,7 @@ class SerialTaskQueue { /** Base class for all tasks held by the SerialTaskQueue */ class TaskBase { friend class SerialTaskQueue; + virtual ~TaskBase() = default; tbb::task_group* group() { return m_group;} virtual void execute() = 0 ; From cb7185540df7e850aaceda5e4b4fcc4c362d2163 Mon Sep 17 00:00:00 2001 From: Nick Smith Date: Mon, 27 Jun 2022 16:29:37 -0500 Subject: [PATCH 11/43] Make S3LibWrapper singleton, async per request --- S3Common.cc | 109 ++++++++++++++++++++++++++-------------------------- S3Common.h | 19 +++++---- 2 files changed, 63 insertions(+), 65 deletions(-) diff --git a/S3Common.cc b/S3Common.cc index 4d6838d..e6cadb5 100644 --- a/S3Common.cc +++ b/S3Common.cc @@ -17,70 +17,75 @@ namespace cce::tf { class S3LibWrapper { public: - S3LibWrapper(bool async=false) : async_(async), running_(false) { - initStatus_ = S3_initialize("s3", S3_INIT_ALL, ""); - if ( initStatus_ != S3StatusOK ) { - std::cerr << "Failed to initialize libs3, error: " << S3_get_status_name(initStatus_) << "\n"; - return; - } - running_ = true; - if ( async_ ) { - throw std::runtime_error("Async not supported yet"); - loop_ = std::thread(&S3LibWrapper::loop_body, this); - } + static S3LibWrapper& instance() { + static S3LibWrapper instance; + return instance; } + S3LibWrapper(const S3LibWrapper&) = delete; + void operator=(const S3LibWrapper&) = delete; - ~S3LibWrapper() { - running_ = false; - if ( loop_.joinable() ) loop_.join(); - S3_deinitialize(); - } - - bool isAsync() { return async_; } bool running() { return running_; } - void get(const S3BucketContext* bucketCtx, const std::string key, S3Request::Callback&& cb) { + void get(const S3BucketContext* 
bucketCtx, const std::string& key, S3Request::Callback&& cb, bool async=false) { // start of S3Request lifecycle (s3lib will always call responseCompleteCallback) - auto req = new S3Request(S3Request::Type::get, bucketCtx, key, std::move(cb), this); - if ( async_ ) { + auto req = new S3Request(S3Request::Type::get, bucketCtx, key, std::move(cb), async); + if ( async ) { requests_.push(req); } else { - submit(req); + submit(req, nullptr); } } - void put(const S3BucketContext* bucketCtx, const std::string key, std::string&& value, S3Request::Callback&& cb) { + void put(const S3BucketContext* bucketCtx, const std::string& key, std::string&& value, S3Request::Callback&& cb, bool async=false) { // start of S3Request lifecycle (s3lib will always call responseCompleteCallback) - auto req = new S3Request(S3Request::Type::put, bucketCtx, key, std::move(cb), this, std::move(value)); - if ( async_ ) { + auto req = new S3Request(S3Request::Type::put, bucketCtx, key, std::move(cb), async, std::move(value)); + if ( async ) { requests_.push(req); } else { - submit(req); + submit(req, nullptr); } } private: + S3LibWrapper() : running_(false) { + initStatus_ = S3_initialize("s3", S3_INIT_ALL, ""); + if ( initStatus_ != S3StatusOK ) { + std::cerr << "Failed to initialize libs3, error: " << S3_get_status_name(initStatus_) << "\n"; + return; + } + running_ = true; + loop_ = std::thread(&S3LibWrapper::loop_body, this); + } + + ~S3LibWrapper() { + running_ = false; + if ( loop_.joinable() ) loop_.join(); + S3_deinitialize(); + } + void loop_body() { - S3_create_request_context(&requestContext_); + S3RequestContext * ctx; + S3_create_request_context(&ctx); while(running_) { - using namespace std::chrono_literals; - std::this_thread::sleep_for(1s); - // S3Status S3_get_request_context_fdsets(S3RequestContext *requestContext, fd_set *readFdSet, fd_set *writeFdSet, fd_set *exceptFdSet, int *maxFd); // int64_t S3_get_request_context_timeout(S3RequestContext *requestContext); // milliseconds 
// select() + std::this_thread::sleep_for(std::chrono::seconds(1)); // S3Status S3_runonce_request_context(S3RequestContext *requestContext, int *requestsRemainingReturn); // S3Request* req; // concurrency limit? - // while ( requests_.try_pop(req) ) submit(req); + // while ( requests_.try_pop(req) ) { + // submit(req, ctx); + // } } - S3_destroy_request_context(requestContext_); + // TODO: this may abort requests in flight, should we wait? + S3_destroy_request_context(ctx); } - void submit(S3Request* req) { - // this function will not block if requestContext_ is not null - // which should only be the case if async_ is true + void submit(S3Request* req, S3RequestContext* ctx) const { + // this function will block if ctx is null + assert(req->async xor ctx == nullptr); switch ( req->type ) { case S3Request::Type::undef: assert(false); // logic error @@ -92,7 +97,7 @@ class S3LibWrapper { nullptr, // S3GetConditions 0, // startByte 0, // byteCount - requestContext_, + ctx, req->_timeout, &S3LibWrapper::getObjectHandler, static_cast(req)); @@ -103,7 +108,7 @@ class S3LibWrapper { req->key.c_str(), req->buffer.size(), nullptr, // S3PutProperties (TODO probably want .md5) - requestContext_, + ctx, req->_timeout, &S3LibWrapper::putObjectHandler, static_cast(req)); @@ -132,8 +137,8 @@ class S3LibWrapper { static thread_local std::minstd_rand rng(std::hash{}(std::this_thread::get_id())); std::uniform_int_distribution dist(0l, std::min(S3Request::max_timeout.count(), req->_timeout)); auto dt = std::chrono::milliseconds(dist(rng)); - if ( req->_owner->async_ ) { - // TODO: how to async sleep? + if ( req->async ) { + // TODO: async sleep by setting a future submit time and checking in loop_body } else { // TODO: better option? 
std::this_thread::sleep_for(dt); @@ -144,11 +149,11 @@ class S3LibWrapper { } req->_put_offset = 0; req->_retries_executed++; - if ( req->_owner->async_ ) { - req->_owner->requests_.push(req); + if ( req->async ) { + instance().requests_.push(req); } else { // can libs3 callbacks recurse? probably... - req->_owner->submit(req); + instance().submit(req, nullptr); } return; // no delete! } @@ -201,18 +206,12 @@ class S3LibWrapper { private: S3Status initStatus_; - bool async_; std::thread loop_; std::atomic running_; - S3RequestContext* requestContext_{nullptr}; // all callbackData pointers are to S3Request objects tbb::concurrent_queue requests_; }; -// libs3 asks us to initialize and de-initialize once per process -// optional TODO: make it a singleton and only initialize when needed -S3LibWrapper s3lib; - std::ostream& operator<<(std::ostream& os, const S3Request& req) { os << "S3Request("; switch (req.type) { @@ -233,11 +232,11 @@ std::ostream& operator<<(std::ostream& os, const S3Request& req) { case S3Request::Status::error: os << "error"; break; } - os << ") (put offset: " << req._put_offset << ", retries executed: " << req._retries_executed << ")"; + os << "async=" << req.async << ") (put offset: " << req._put_offset << ", retries executed: " << req._retries_executed << ")"; return os; } -S3ConnectionRef S3Connection::from_config(std::string filename) { +S3ConnectionRef S3Connection::from_config(const std::string& filename) { std::ifstream fin(filename); if (not fin.is_open()) { std::cerr << "S3Connection config file " << filename << " could not be opened\n"; @@ -268,7 +267,7 @@ S3ConnectionRef S3Connection::from_config(std::string filename) { return {}; } - if ( not s3lib.running() ) { + if ( not S3LibWrapper::instance().running() ) { return {}; } @@ -310,18 +309,18 @@ S3Connection::S3Connection( }); }; -void S3Connection::get(const std::string key, S3Request::Callback&& cb) { +void S3Connection::get(const std::string& key, S3Request::Callback&& cb) { if ( 
ctx_ ) { - s3lib.get(ctx_.get(), key, std::move(cb)); + S3LibWrapper::instance().get(ctx_.get(), key, std::move(cb)); } else if ( cb ) { S3Request dummy(S3Request::Type::get, key, S3Request::Status::error); cb(&dummy); } }; -void S3Connection::put(const std::string key, std::string&& value, S3Request::Callback&& cb) { +void S3Connection::put(const std::string& key, std::string&& value, S3Request::Callback&& cb) { if ( ctx_ ) { - s3lib.put(ctx_.get(), key, std::move(value), std::move(cb)); + S3LibWrapper::instance().put(ctx_.get(), key, std::move(value), std::move(cb)); } else if ( cb ) { S3Request dummy(S3Request::Type::put, key, S3Request::Status::ok); cb(&dummy); diff --git a/S3Common.h b/S3Common.h index 9b8346a..2a2c6b2 100644 --- a/S3Common.h +++ b/S3Common.h @@ -1,6 +1,5 @@ #if !defined(S3Common_h) #define S3Common_h - #include #include #include @@ -27,28 +26,28 @@ class S3Request { const Callback callback; const std::chrono::milliseconds timeout{1000}; const int retries{5}; + const bool async{false}; std::string buffer; Status status; private: S3Request() = delete; // constructor for devnull connection - S3Request(Type iType, std::string iKey, Status stat): + S3Request(Type iType, const std::string& iKey, Status stat): type{iType}, key{iKey}, status{stat} {}; // get constructor - S3Request(Type iType, const S3BucketContext* iCtx, std::string iKey, Callback iCb, S3LibWrapper* iOwner): - type{iType}, bucketCtx{iCtx}, key{iKey}, callback{iCb}, _owner{iOwner} + S3Request(Type iType, const S3BucketContext* iCtx, const std::string& iKey, Callback iCb, bool iAsync): + type{iType}, bucketCtx{iCtx}, key{iKey}, callback{iCb}, async{iAsync} { _timeout = timeout.count(); }; // put constructor - S3Request(Type iType, const S3BucketContext* iCtx, std::string iKey, Callback iCb, S3LibWrapper* iOwner, std::string&& buf): - type{iType}, bucketCtx{iCtx}, key{iKey}, callback{iCb}, _owner{iOwner}, buffer{buf} + S3Request(Type iType, const S3BucketContext* iCtx, const 
std::string& iKey, Callback iCb, bool iAsync, std::string&& buf): + type{iType}, bucketCtx{iCtx}, key{iKey}, callback{iCb}, async{iAsync}, buffer{buf} { _timeout = timeout.count(); }; - S3LibWrapper *const _owner{nullptr}; size_t _put_offset{0}; int _retries_executed{0}; long _timeout; @@ -61,7 +60,7 @@ class S3Request { class S3Connection { public: - static S3ConnectionRef from_config(std::string filename); + static S3ConnectionRef from_config(const std::string& filename); S3Connection( std::string_view iHostName, @@ -71,8 +70,8 @@ class S3Connection { std::string_view iSecurityToken ); - void get(const std::string key, S3Request::Callback&& cb); - void put(const std::string key, std::string&& value, S3Request::Callback&& cb); + void get(const std::string& key, S3Request::Callback&& cb); + void put(const std::string& key, std::string&& value, S3Request::Callback&& cb); private: const std::string hostName_; From 12ac39dad442dea2adee4b05e732dacf69ca008a Mon Sep 17 00:00:00 2001 From: Nick Smith Date: Mon, 27 Jun 2022 16:57:55 -0500 Subject: [PATCH 12/43] Address some more review by @Dr15Jones --- S3Common.cc | 8 ++++---- S3Common.h | 6 +++--- S3Outputer.cc | 2 +- S3Source.cc | 2 +- S3Source.h | 8 ++++---- 5 files changed, 13 insertions(+), 13 deletions(-) diff --git a/S3Common.cc b/S3Common.cc index e6cadb5..537d59c 100644 --- a/S3Common.cc +++ b/S3Common.cc @@ -24,7 +24,7 @@ class S3LibWrapper { S3LibWrapper(const S3LibWrapper&) = delete; void operator=(const S3LibWrapper&) = delete; - bool running() { return running_; } + bool running() const { return running_; } void get(const S3BucketContext* bucketCtx, const std::string& key, S3Request::Callback&& cb, bool async=false) { // start of S3Request lifecycle (s3lib will always call responseCompleteCallback) @@ -277,7 +277,7 @@ S3ConnectionRef S3Connection::from_config(const std::string& filename) { return {}; } - return std::make_shared(hostName, bucketName, accessKeyId, secretAccessKey, securityToken); + return 
std::make_shared(hostName, bucketName, accessKeyId, secretAccessKey, securityToken); }; S3Connection::S3Connection( @@ -309,7 +309,7 @@ S3Connection::S3Connection( }); }; -void S3Connection::get(const std::string& key, S3Request::Callback&& cb) { +void S3Connection::get(const std::string& key, S3Request::Callback&& cb) const { if ( ctx_ ) { S3LibWrapper::instance().get(ctx_.get(), key, std::move(cb)); } else if ( cb ) { @@ -318,7 +318,7 @@ void S3Connection::get(const std::string& key, S3Request::Callback&& cb) { } }; -void S3Connection::put(const std::string& key, std::string&& value, S3Request::Callback&& cb) { +void S3Connection::put(const std::string& key, std::string&& value, S3Request::Callback&& cb) const { if ( ctx_ ) { S3LibWrapper::instance().put(ctx_.get(), key, std::move(value), std::move(cb)); } else if ( cb ) { diff --git a/S3Common.h b/S3Common.h index 2a2c6b2..106fde5 100644 --- a/S3Common.h +++ b/S3Common.h @@ -11,7 +11,7 @@ struct S3BucketContext; namespace cce::tf { class S3LibWrapper; class S3Connection; -typedef std::shared_ptr S3ConnectionRef; +typedef std::shared_ptr S3ConnectionRef; class S3Request { public: @@ -70,8 +70,8 @@ class S3Connection { std::string_view iSecurityToken ); - void get(const std::string& key, S3Request::Callback&& cb); - void put(const std::string& key, std::string&& value, S3Request::Callback&& cb); + void get(const std::string& key, S3Request::Callback&& cb) const; + void put(const std::string& key, std::string&& value, S3Request::Callback&& cb) const; private: const std::string hostName_; diff --git a/S3Outputer.cc b/S3Outputer.cc index 4934703..996527b 100644 --- a/S3Outputer.cc +++ b/S3Outputer.cc @@ -160,7 +160,7 @@ void S3Outputer::flushEventStripe(TaskHolder iCallback, bool last) const { if ( not last ) { // all buffers should be empty because the sizes all evenly divide eventFlushSize_ for(auto& p : currentProductStripes_) { - assert(p->offsets_size() == 0); + assert(p.offsets_size() == 0); } } 
index_.set_totalevents(eventGlobalOffset_); diff --git a/S3Source.cc b/S3Source.cc index e0d209e..2ad2a13 100644 --- a/S3Source.cc +++ b/S3Source.cc @@ -8,7 +8,7 @@ using namespace cce::tf; S3Source::S3Source(unsigned int iNLanes, std::string iObjPrefix, int iVerbose, unsigned long long iNEvents, S3ConnectionRef conn): SharedSourceBase(iNEvents), - objPrefix_(iObjPrefix), + objPrefix_(std::move(iObjPrefix)), verbose_(iVerbose), conn_(conn), readTime_{std::chrono::microseconds::zero()} diff --git a/S3Source.h b/S3Source.h index d9765ab..f09daf5 100644 --- a/S3Source.h +++ b/S3Source.h @@ -58,9 +58,9 @@ class S3Source : public SharedSourceBase { std::chrono::microseconds decompressTime() const; std::chrono::microseconds deserializeTime() const; - int verbose_; - std::string objPrefix_; - S3ConnectionRef conn_; + const int verbose_; + const std::string objPrefix_; + const S3ConnectionRef conn_; SerialTaskQueue queue_; objstripe::ObjectStripeIndex index_; @@ -85,11 +85,11 @@ class S3Source : public SharedSourceBase { ~LaneInfo(); }; + // mutated only by methods called in queue_ size_t nextEventStripe_ = 0; size_t nextEventInStripe_ = 0; objstripe::EventStripe currentEventStripe_; std::vector> currentProductStripes_; - std::vector laneInfos_; std::chrono::microseconds readTime_; }; From e15781df0c8f33073ea6c0dd30ee9a27b9220817 Mon Sep 17 00:00:00 2001 From: Nick Smith Date: Mon, 27 Jun 2022 18:05:59 -0500 Subject: [PATCH 13/43] EventStripe output safer but more serial --- S3Outputer.cc | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/S3Outputer.cc b/S3Outputer.cc index 996527b..ca24d41 100644 --- a/S3Outputer.cc +++ b/S3Outputer.cc @@ -170,24 +170,20 @@ void S3Outputer::flushEventStripe(TaskHolder iCallback, bool last) const { objstripe::EventStripe stripeOut; stripeOut.mutable_events()->Reserve(eventFlushSize_); std::swap(currentEventStripe_, stripeOut); - // TODO: are we sure writing to dest is threadsafe? 
auto dest = index_.add_packedeventstripes(); - iCallback.group()->run( - [this, dest, stripeOut=std::move(stripeOut), callback=iCallback]() { - auto start = std::chrono::high_resolution_clock::now(); - // TODO: compression - stripeOut.SerializeToString(dest); - if ( verbose_ >= 2 ) { - std::cout << "length of packed EventStripe: " << dest->size() << "\n"; - std::cout << stripeOut.DebugString() << "\n"; - } + // TODO: compression + stripeOut.SerializeToString(dest); + if ( verbose_ >= 2 ) { + std::cout << "length of packed EventStripe: " << dest->size() << "\n"; + std::cout << stripeOut.DebugString() << "\n"; + } - // TODO: checkpoint only every few event stripes? - if ( verbose_ >= 2 ) { - std::cout << index_.DebugString() << "\n"; - } + // TODO: checkpoint only every few event stripes? + iCallback.group()->run([this, idxcopy=index_, callback=iCallback]() { + // shallow copy index_ to ensure validity + auto start = std::chrono::high_resolution_clock::now(); std::string indexOut; - index_.SerializeToString(&indexOut); + idxcopy.SerializeToString(&indexOut); conn_->put(objPrefix_ + "index", std::move(indexOut), [callback=std::move(callback)](S3Request* req) { if ( req->status != S3Request::Status::ok ) { std::cerr << "failed to write product buffer index" << std::endl; @@ -195,8 +191,7 @@ void S3Outputer::flushEventStripe(TaskHolder iCallback, bool last) const { }); auto time = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start); parallelTime_ += time.count(); - } - ); + }); } namespace { From 28a8f8866cf27d6e706f0db5df2253ff7f92401c Mon Sep 17 00:00:00 2001 From: Nick Smith Date: Mon, 27 Jun 2022 18:40:04 -0500 Subject: [PATCH 14/43] Make it faster! 
Order of magnitude speedup in serial section because compiler didn't figure out this was a memcpy with the old invocation --- S3Outputer.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/S3Outputer.cc b/S3Outputer.cc index ca24d41..4be4727 100644 --- a/S3Outputer.cc +++ b/S3Outputer.cc @@ -89,7 +89,7 @@ void S3Outputer::output( } std::cout << "\n"; } - p->mutable_content()->append(s.blob().begin(), s.blob().end()); + p->mutable_content()->append(s.blob().data(), s.blob().size()); p->add_offsets(p->content().size()); p++; pi++; } From d705d542f01384ac454634ba971aa185214e4d11 Mon Sep 17 00:00:00 2001 From: Nick Smith Date: Tue, 28 Jun 2022 01:02:01 -0500 Subject: [PATCH 15/43] Fanout buffer appending --- S3Outputer.cc | 232 ++++++++++++++++++++++++++------------------- S3Outputer.h | 88 +++++++++-------- S3Source.cc | 10 +- objectstripe.proto | 20 ++-- 4 files changed, 191 insertions(+), 159 deletions(-) diff --git a/S3Outputer.cc b/S3Outputer.cc index 4be4727..f256059 100644 --- a/S3Outputer.cc +++ b/S3Outputer.cc @@ -22,29 +22,57 @@ void S3Outputer::setupForLane(unsigned int iLaneIndex, std::vectorReserve(iDPs.size()); for(auto const& ss: s) { - auto prod = index_.add_products(); + auto* prod = index_.add_products(); prod->set_productname(std::string(ss.name())); prod->set_producttype(ss.className()); prod->set_flushsize(0); + prod->set_flushminbytes(productBufferFlushMinBytes_); + buffers_.emplace_back(objPrefix_ + prod->productname(), prod); } } // all lanes see same products? 
if not we'll need a map - assert(currentProductStripes_.size() == iDPs.size()); + assert(buffers_.size() == iDPs.size()); +} + +void S3Outputer::productReadyAsync(unsigned int iLaneIndex, DataProductRetriever const& iDataProduct, TaskHolder iCallback) const { + assert(iLaneIndex < serializers_.size()); + auto& laneSerializers = serializers_[iLaneIndex]; + auto group = iCallback.group(); + assert(iDataProduct.index() < laneSerializers.size() ); + laneSerializers[iDataProduct.index()].doWorkAsync(*group, iDataProduct.address(), std::move(iCallback)); +} + +void S3Outputer::outputAsync(unsigned int iLaneIndex, EventIdentifier const& iEventID, TaskHolder iCallback) const { + auto start = std::chrono::high_resolution_clock::now(); + auto group = iCallback.group(); + collateQueue_.push(*group, [this, iEventID, iLaneIndex, callback=std::move(iCallback)]() mutable { + collateProducts(iEventID, serializers_[iLaneIndex], std::move(callback)); + }); + auto time = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start); + parallelTime_ += time.count(); } void S3Outputer::printSummary() const { { tbb::task_group group; { - auto start = std::chrono::high_resolution_clock::now(); TaskHolder th(group, make_functor_task([](){})); - flushProductStripes(th, true); - flushEventStripe(th, true); - serialTime_ += std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start); + TaskHolder productsDone(group, make_functor_task( + [this, stripeOut=std::move(currentEventStripe_), callback=std::move(th)]() mutable { + flushQueue_.push(*callback.group(), [this, stripeOut=std::move(stripeOut), callback=std::move(callback)]() { + flushEventStripe(stripeOut, std::move(callback), true); + }); + } + )); + for(auto& buf : buffers_) { + buf.appendQueue_.push(group, [this, &buf, cb=productsDone]() mutable { + appendProductBuffer(buf, {}, std::move(cb), true); + }); + } } group.wait(); } @@ -58,130 +86,137 @@ void S3Outputer::printSummary() const { serializerTime 
+= s.accumulatedTime(); } } - std::cout <<"S3Outputer\n total serial time at end event: "<= 2) { - std::cout <<" run:"s+std::to_string(iEventID.run)+" lumi:"s+std::to_string(iEventID.lumi)+" event:"s+std::to_string(iEventID.event)+"\n"<set_offset(eventGlobalOffset_++); sev->set_run(iEventID.run); sev->set_lumi(iEventID.lumi); sev->set_event(iEventID.event); + if (verbose_ >= 2) { std::cout << sev->DebugString(); } - auto p = std::begin(currentProductStripes_); - auto pi = index_.mutable_products()->begin(); - for(const auto& s : iSerializers) { - if (verbose_ >= 2) { - std::cout << "adding blob len " << s.blob().size() << " to " << pi->productname() << "\n"; - for (auto c : s.blob()) { - if ( isprint(c) ) std::cout << c; - else std::cout << "\\x" << std::hex << (int) c << std::dec; + TaskHolder productsDoneCallback([this, cb=std::move(iCallback)]() mutable { + if ( currentEventStripe_.events_size() == eventFlushSize_ ) { + if(verbose_ >= 2) { std::cout << "reached event flush size "s + std::to_string(eventFlushSize_) + ", flushing\n"; } + objstripe::EventStripe stripeOut; + stripeOut.mutable_events()->Reserve(eventFlushSize_); + std::swap(currentEventStripe_, stripeOut); + return TaskHolder(*cb.group(), make_functor_task( + [this, stripeOut=std::move(stripeOut), callback=std::move(cb)]() mutable { + flushQueue_.push(*callback.group(), [this, stripeOut=std::move(stripeOut), callback=std::move(callback)]() { + flushEventStripe(stripeOut, std::move(callback)); + }); + } + )); } - std::cout << "\n"; - } - p->mutable_content()->append(s.blob().data(), s.blob().size()); - p->add_offsets(p->content().size()); - p++; pi++; - } - - flushProductStripes(iCallback); + return cb; + }() + ); - if ( currentEventStripe_.events_size() == eventFlushSize_ ) { - if(verbose_ >= 2) { - std::cout << "reached event flush size "s + std::to_string(eventFlushSize_) + ", flushing\n" << std::flush; - } - flushEventStripe(iCallback); + auto buf = std::begin(buffers_); + for (const auto& s : 
iSerializers) { + const std::string_view blob(s.blob().data(), s.blob().size()); + buf->appendQueue_.push(*productsDoneCallback.group(), [this, buf, blob, cb=productsDoneCallback]() mutable { + appendProductBuffer(*buf, blob, std::move(cb)); + }); + buf++; } + collateTime_ += std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start); } - -void S3Outputer::flushProductStripes(TaskHolder iCallback, bool last) const { +void S3Outputer::appendProductBuffer( + ProductOutputBuffer& buf, + const std::string_view blob, + TaskHolder iCallback, + bool last + ) const +{ using namespace std::string_literals; - auto p = currentProductStripes_.begin(); - auto pi = index_.mutable_products()->begin(); - for(; p != std::end(currentProductStripes_); ++p, ++pi) { - size_t offset = p->content().size(); - size_t bufferNevents = p->offsets_size(); - - // first flush when we exceed min size and have an even divisor of eventFlushSize_ - // subsequent flush when we reach productFlushSize - // always flush when we reach eventFlushSize_ (for buffers that never get big enough) - // flush if last call and we have something to write - if ( - ((pi->flushsize() == 0) && (offset > productBufferFlushMinSize_) && (eventFlushSize_ % bufferNevents == 0)) - || (bufferNevents == pi->flushsize()) - || (bufferNevents == eventFlushSize_) - || (last && bufferNevents > 0) - ) - { - if(verbose_ >= 2) { - std::cout << "product buffer for "s + std::string(pi->productname()) + " is full ("s + std::to_string(offset) - + " bytes, "s + std::to_string(bufferNevents) + " events), flushing\n" << std::flush; - } - objstripe::ProductStripe pOut; - pOut.mutable_offsets()->Reserve(bufferNevents); - pOut.mutable_content()->reserve(offset); - std::swap(*p, pOut); - std::string name = objPrefix_; - name += pi->productname(); - name += std::to_string(eventGlobalOffset_ - bufferNevents); - iCallback.group()->run( - [this, name=std::move(name), pOut=std::move(pOut), callback=iCallback]() { - auto start = 
std::chrono::high_resolution_clock::now(); - std::string finalbuf; - pOut.SerializeToString(&finalbuf); - conn_->put(name, std::move(finalbuf), [name=std::move(name), callback=std::move(callback)](S3Request* req) { - if ( req->status != S3Request::Status::ok ) { - std::cerr << "failed to write product buffer " << name << std::endl; - } - }); - auto time = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start); - parallelTime_ += time.count(); - } - ); - if ( pi->flushsize() == 0 ) { - pi->set_flushsize(bufferNevents); + auto start = std::chrono::high_resolution_clock::now(); + + if ( not last ) { + buf.buffer_.mutable_content()->append(blob); + buf.buffer_.add_offsets(buf.buffer_.content().size()); + } + size_t bufferNevents = buf.buffer_.offsets_size(); + + // first flush when we exceed min size and have an even divisor of eventFlushSize_ + // subsequent flush when we reach productFlushSize + // always flush when we reach eventFlushSize_ (for buffers that never get big enough) + // flush if last call and we have something to write + if ( + ( + (buf.info_->flushsize() == 0) + && (buf.buffer_.content().size() > buf.info_->flushminbytes()) + && (eventFlushSize_ % bufferNevents == 0) + ) + || (bufferNevents == buf.info_->flushsize()) + || (bufferNevents == eventFlushSize_) + || (last && bufferNevents > 0) + ) + { + objstripe::ProductStripe pOut; + pOut.mutable_offsets()->Reserve(bufferNevents); + pOut.mutable_content()->reserve(buf.buffer_.content().size()); + pOut.set_globaloffset(buf.buffer_.globaloffset() + bufferNevents); + std::swap(buf.buffer_, pOut); + std::string name = buf.prefix_; + name += std::to_string(buf.buffer_.globaloffset()); + iCallback.group()->run( + [this, name=std::move(name), pOut=std::move(pOut), callback=std::move(iCallback)]() { + std::string finalbuf; + pOut.SerializeToString(&finalbuf); + conn_->put(name, std::move(finalbuf), [name=std::move(name), callback=std::move(callback)](S3Request* req) { + if ( req->status != 
S3Request::Status::ok ) { + std::cerr << "failed to write product buffer " << name << std::endl; + } + }); } + ); + if ( buf.info_->flushsize() == 0 ) { + // only modification to info_, done inside serial appendQueue_ + buf.info_->set_flushsize(bufferNevents); } } + buf.appendTime_ += std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start); } -void S3Outputer::flushEventStripe(TaskHolder iCallback, bool last) const { - if ( not last ) { - // all buffers should be empty because the sizes all evenly divide eventFlushSize_ - for(auto& p : currentProductStripes_) { - assert(p.offsets_size() == 0); - } - } - index_.set_totalevents(eventGlobalOffset_); - if ( last and currentEventStripe_.events_size() == 0 ) { +void S3Outputer::flushEventStripe(const objstripe::EventStripe& stripe, TaskHolder iCallback, bool last) const { + if ( last and stripe.events_size() == 0 ) { return; } - objstripe::EventStripe stripeOut; - stripeOut.mutable_events()->Reserve(eventFlushSize_); - std::swap(currentEventStripe_, stripeOut); + auto start = std::chrono::high_resolution_clock::now(); + index_.set_totalevents(index_.totalevents() + stripe.events_size()); auto dest = index_.add_packedeventstripes(); // TODO: compression - stripeOut.SerializeToString(dest); + stripe.SerializeToString(dest); if ( verbose_ >= 2 ) { std::cout << "length of packed EventStripe: " << dest->size() << "\n"; - std::cout << stripeOut.DebugString() << "\n"; + std::cout << stripe.DebugString() << "\n"; } // TODO: checkpoint only every few event stripes? 
- iCallback.group()->run([this, idxcopy=index_, callback=iCallback]() { - // shallow copy index_ to ensure validity - auto start = std::chrono::high_resolution_clock::now(); + iCallback.group()->run( + // bind shallow copy of index_ to ensure validity + [this, idxcopy=index_, callback=std::move(iCallback)]() { std::string indexOut; idxcopy.SerializeToString(&indexOut); conn_->put(objPrefix_ + "index", std::move(indexOut), [callback=std::move(callback)](S3Request* req) { @@ -189,9 +224,8 @@ void S3Outputer::flushEventStripe(TaskHolder iCallback, bool last) const { std::cerr << "failed to write product buffer index" << std::endl; } }); - auto time = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start); - parallelTime_ += time.count(); }); + flushTime_ += std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start); } namespace { diff --git a/S3Outputer.h b/S3Outputer.h index acf57fc..0547492 100644 --- a/S3Outputer.h +++ b/S3Outputer.h @@ -18,16 +18,18 @@ #include "objectstripe.pb.h" namespace cce::tf { + class S3Outputer : public OutputerBase { public: S3Outputer(unsigned int iNLanes, std::string objPrefix, int iVerbose, size_t iProductBufferFlush, size_t iEventFlushSize, S3ConnectionRef conn): serializers_(iNLanes), objPrefix_(objPrefix), verbose_(iVerbose), - productBufferFlushMinSize_(iProductBufferFlush), + productBufferFlushMinBytes_(iProductBufferFlush), eventFlushSize_(iEventFlushSize), - conn_(conn), - serialTime_{std::chrono::microseconds::zero()}, + conn_(std::move(conn)), + collateTime_{std::chrono::microseconds::zero()}, + flushTime_{std::chrono::microseconds::zero()}, parallelTime_{0} { index_.set_eventstripesize(eventFlushSize_); @@ -37,53 +39,59 @@ class S3Outputer : public OutputerBase { } void setupForLane(unsigned int iLaneIndex, std::vector const& iDPs) final; - - void productReadyAsync(unsigned int iLaneIndex, DataProductRetriever const& iDataProduct, TaskHolder iCallback) const final { - 
assert(iLaneIndex < serializers_.size()); - auto& laneSerializers = serializers_[iLaneIndex]; - auto group = iCallback.group(); - assert(iDataProduct.index() < laneSerializers.size() ); - laneSerializers[iDataProduct.index()].doWorkAsync(*group, iDataProduct.address(), std::move(iCallback)); - } - - bool usesProductReadyAsync() const final {return true; } - - void outputAsync(unsigned int iLaneIndex, EventIdentifier const& iEventID, TaskHolder iCallback) const final { - auto start = std::chrono::high_resolution_clock::now(); - // all products - queue_.push(*iCallback.group(), [this, iEventID, iLaneIndex, callback=std::move(iCallback)]() mutable { - auto start = std::chrono::high_resolution_clock::now(); - output(iEventID, serializers_[iLaneIndex], std::move(callback)); - serialTime_ += std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start); - }); - auto time = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start); - parallelTime_ += time.count(); - } - + bool usesProductReadyAsync() const final {return true;} + void productReadyAsync(unsigned int iLaneIndex, DataProductRetriever const& iDataProduct, TaskHolder iCallback) const final; + void outputAsync(unsigned int iLaneIndex, EventIdentifier const& iEventID, TaskHolder iCallback) const final; void printSummary() const final; private: - void output(EventIdentifier const& iEventID, SerializeStrategy const& iSerializers, TaskHolder iCallback) const; - void flushProductStripes(TaskHolder iCallback, bool last=false) const; - void flushEventStripe(TaskHolder iCallback, bool last=false) const; + struct ProductOutputBuffer { + ProductOutputBuffer(const std::string& prefix, objstripe::ProductInfo* info) : + prefix_{prefix}, info_{info} {}; - mutable std::vector serializers_; - mutable SerialTaskQueue queue_; + const std::string prefix_; + objstripe::ProductInfo* info_; // owned by index_ + objstripe::ProductStripe buffer_; + SerialTaskQueue appendQueue_; + 
std::chrono::microseconds appendTime_{0}; + }; + + // Plan: + // productReadyAsync() is threadsafe because serializers_ is one per lane + // outputAsync puts collateProducts() in collateQueue_ + // collateProducts() appends a new objstripe::Event to currentEventStripe_ and if time to flush + // it creates a TaskHolder that appends flushEventStripe() to flushQueue_ + // then collate() calls appendProductBuffer() with the above TaskHolder as callback (or original callback) + // printSummary() takes care of the tails by setting last=true in the calls + void collateProducts(EventIdentifier const& iEventID, SerializeStrategy const& iSerializers, TaskHolder iCallback) const; + void appendProductBuffer(ProductOutputBuffer& buf, const std::string_view blob, TaskHolder iCallback, bool last=false) const; + void flushEventStripe(const objstripe::EventStripe& stripe, TaskHolder iCallback, bool last=false) const; // configuration options - int verbose_; - std::string objPrefix_; - size_t productBufferFlushMinSize_; - size_t eventFlushSize_; + const int verbose_; + const std::string objPrefix_; + const size_t productBufferFlushMinBytes_; + const size_t eventFlushSize_; S3ConnectionRef conn_; - // mutated only by methods called in queue_ - mutable objstripe::ObjectStripeIndex index_; - mutable objstripe::EventStripe currentEventStripe_; - mutable std::vector currentProductStripes_; + // only modified by productReadyAsync() + mutable std::vector serializers_; + + // only modified in collateProducts() + mutable SerialTaskQueue collateQueue_; mutable size_t eventGlobalOffset_{0}; + mutable objstripe::EventStripe currentEventStripe_; + mutable std::chrono::microseconds collateTime_; + + // only modified in appendProductBuffer() + mutable std::vector buffers_; + + // only modified in flushEventStripe() + // (for index_'s ProductInfos, appendProductBuffer() has finished before we access) + mutable SerialTaskQueue flushQueue_; + mutable objstripe::ObjectStripeIndex index_; + mutable 
std::chrono::microseconds flushTime_; - mutable std::chrono::microseconds serialTime_; mutable std::atomic parallelTime_; }; } diff --git a/S3Source.cc b/S3Source.cc index 2ad2a13..9dac3c3 100644 --- a/S3Source.cc +++ b/S3Source.cc @@ -89,6 +89,7 @@ std::pair DelayedProductStripeRetriever::bufferAt(size_t gl else { throw std::runtime_error("Could not retrieve ProductStripe for key " + name_); } }); }); + assert(globalOffset_ == data_.globaloffset()); assert(globalOffset_ <= globalEventIndex); size_t offset = globalEventIndex - globalOffset_; assert(offset < data_.offsets_size()); @@ -157,15 +158,6 @@ void S3Source::readEventAsync(unsigned int iLane, long iEventIndex, OptionalTask while ( it_stripe != stripes.cend() ) { auto start = std::chrono::high_resolution_clock::now(); auto [buf, len] = (*it_stripe)->bufferAt(globalEventIndex); - if ( verbose_ >= 3 ) { - std::cout << "got buffer " << uint64_t(buf) << " len " << len << "\n"; - for (size_t i=0; i(split - start); auto readSize = (*it_deserialize).deserialize(buf, len, *it_product->address()); diff --git a/objectstripe.proto b/objectstripe.proto index c8a37a8..767e428 100644 --- a/objectstripe.proto +++ b/objectstripe.proto @@ -7,20 +7,18 @@ enum SerializeStrategy { kRootUnrolled = 1; } +message ProductInfo { + optional string productName = 1; + optional string productType = 2; + optional uint32 flushSize = 3; + optional uint32 flushMinBytes = 4; +} + message ObjectStripeIndex { optional uint32 eventStripeSize = 1; optional uint64 totalEvents = 2; - - message Product { - optional string productName = 1; - optional string productType = 2; - optional uint32 flushSize = 3; - } - - repeated Product products = 3; - + repeated ProductInfo products = 3; repeated bytes packedEventStripes = 4; - optional SerializeStrategy serializeStrategy = 5; } @@ -37,7 +35,7 @@ message EventStripe { } message ProductStripe { - // TODO: global offset (as a cross check) repeated uint32 offsets = 1 [packed = true]; optional bytes content = 
2; + optional uint64 globalOffset = 3; } From 1f3adee2acd0dbc50cdf902b96f95c7a4b465748 Mon Sep 17 00:00:00 2001 From: Nick Smith Date: Tue, 28 Jun 2022 18:45:58 -0500 Subject: [PATCH 16/43] Fix small bug in product offsets --- S3Common.cc | 9 ++++++++- S3Common.h | 3 +++ S3Outputer.cc | 18 ++++++++++++------ 3 files changed, 23 insertions(+), 7 deletions(-) diff --git a/S3Common.cc b/S3Common.cc index 537d59c..8d3c6db 100644 --- a/S3Common.cc +++ b/S3Common.cc @@ -291,7 +291,8 @@ S3Connection::S3Connection( bucketName_(iBucketName), accessKeyId_(iAccessKey), secretAccessKey_(iSecretKey), - securityToken_(iSecurityToken) + securityToken_(iSecurityToken), + blockingTime_{0} { if ( hostName_ == "devnull") { // magic do-nothing connection @@ -310,21 +311,27 @@ S3Connection::S3Connection( }; void S3Connection::get(const std::string& key, S3Request::Callback&& cb) const { + auto start = std::chrono::high_resolution_clock::now(); if ( ctx_ ) { S3LibWrapper::instance().get(ctx_.get(), key, std::move(cb)); } else if ( cb ) { S3Request dummy(S3Request::Type::get, key, S3Request::Status::error); cb(&dummy); } + auto time = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start); + blockingTime_ += time.count(); }; void S3Connection::put(const std::string& key, std::string&& value, S3Request::Callback&& cb) const { + auto start = std::chrono::high_resolution_clock::now(); if ( ctx_ ) { S3LibWrapper::instance().put(ctx_.get(), key, std::move(value), std::move(cb)); } else if ( cb ) { S3Request dummy(S3Request::Type::put, key, S3Request::Status::ok); cb(&dummy); } + auto time = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start); + blockingTime_ += time.count(); }; } diff --git a/S3Common.h b/S3Common.h index 106fde5..f48576c 100644 --- a/S3Common.h +++ b/S3Common.h @@ -72,6 +72,7 @@ class S3Connection { void get(const std::string& key, S3Request::Callback&& cb) const; void put(const std::string& key, std::string&& value, 
S3Request::Callback&& cb) const; + std::chrono::microseconds blockingTime() const { return std::chrono::microseconds(blockingTime_.load()); } private: const std::string hostName_; @@ -81,6 +82,8 @@ class S3Connection { const std::string securityToken_; // holds pointers to c_str() of the above std::unique_ptr ctx_; + + mutable std::atomic blockingTime_; }; } diff --git a/S3Outputer.cc b/S3Outputer.cc index f256059..b845b21 100644 --- a/S3Outputer.cc +++ b/S3Outputer.cc @@ -95,7 +95,8 @@ void S3Outputer::printSummary() const { " total serial event stripe flush time at end event: "<blockingTime().count()<<"us\n"; } void S3Outputer::collateProducts( @@ -156,7 +157,8 @@ void S3Outputer::appendProductBuffer( buf.buffer_.mutable_content()->append(blob); buf.buffer_.add_offsets(buf.buffer_.content().size()); } - size_t bufferNevents = buf.buffer_.offsets_size(); + const size_t bufferNevents = buf.buffer_.offsets_size(); + const size_t bufferNbytes = buf.buffer_.content().size(); // first flush when we exceed min size and have an even divisor of eventFlushSize_ // subsequent flush when we reach productFlushSize @@ -165,7 +167,7 @@ void S3Outputer::appendProductBuffer( if ( ( (buf.info_->flushsize() == 0) - && (buf.buffer_.content().size() > buf.info_->flushminbytes()) + && (bufferNbytes > buf.info_->flushminbytes()) && (eventFlushSize_ % bufferNevents == 0) ) || (bufferNevents == buf.info_->flushsize()) @@ -173,13 +175,18 @@ void S3Outputer::appendProductBuffer( || (last && bufferNevents > 0) ) { + if(verbose_ >= 2) { + std::cout << "product buffer for "s + std::string(buf.info_->productname()) + + " is full ("s + std::to_string(bufferNbytes) + + " bytes, "s + std::to_string(bufferNevents) + " events), flushing\n"; + } objstripe::ProductStripe pOut; pOut.mutable_offsets()->Reserve(bufferNevents); - pOut.mutable_content()->reserve(buf.buffer_.content().size()); + pOut.mutable_content()->reserve(bufferNbytes); pOut.set_globaloffset(buf.buffer_.globaloffset() + 
bufferNevents); std::swap(buf.buffer_, pOut); std::string name = buf.prefix_; - name += std::to_string(buf.buffer_.globaloffset()); + name += std::to_string(pOut.globaloffset()); iCallback.group()->run( [this, name=std::move(name), pOut=std::move(pOut), callback=std::move(iCallback)]() { std::string finalbuf; @@ -210,7 +217,6 @@ void S3Outputer::flushEventStripe(const objstripe::EventStripe& stripe, TaskHold stripe.SerializeToString(dest); if ( verbose_ >= 2 ) { std::cout << "length of packed EventStripe: " << dest->size() << "\n"; - std::cout << stripe.DebugString() << "\n"; } // TODO: checkpoint only every few event stripes? From 3951ccdff2ba0e8c113feb77f3ebc6197dd23193 Mon Sep 17 00:00:00 2001 From: Nick Smith Date: Tue, 28 Jun 2022 22:31:25 -0500 Subject: [PATCH 17/43] Non-blocking product stripe retrieval --- S3Source.cc | 199 +++++++++++++++++++++++++++------------------------- S3Source.h | 71 +++++++++++-------- 2 files changed, 146 insertions(+), 124 deletions(-) diff --git a/S3Source.cc b/S3Source.cc index 9dac3c3..aa419f6 100644 --- a/S3Source.cc +++ b/S3Source.cc @@ -3,9 +3,87 @@ #include "SourceFactory.h" #include "Deserializer.h" #include "UnrolledDeserializer.h" +#include "FunctorTask.h" using namespace cce::tf; +void DelayedProductStripeRetriever::fetch(TaskHolder&& callback) const { + auto this_state{State::unretrieved}; + if ( state_.compare_exchange_strong(this_state, State::retrieving) ) { + conn_->get(name_, [this, callback=std::move(callback)](S3Request* req) mutable { + if ( req->status == S3Request::Status::ok ) { + if ( not data_.ParseFromString(req->buffer) ) { + throw std::runtime_error("Could not deserialize ProductStripe for key " + name_); + } + state_ = State::retrieved; + callback.doneWaiting(); + for(auto& w : waiters_) w.doneWaiting(); + } + else { throw std::runtime_error("Could not retrieve ProductStripe for key " + name_); } + }); + } else if (this_state == State::retrieved ) { + return; + } else { + // TODO: check again if not 
State::retrieved? + waiters_.emplace_back(std::move(callback)); + } +} + +std::string_view DelayedProductStripeRetriever::bufferAt(size_t globalEventIndex) const { + assert(state_ == State::retrieved); + assert(globalOffset_ == data_.globaloffset()); + assert(globalOffset_ <= globalEventIndex); + size_t offset = globalEventIndex - globalOffset_; + assert(offset < data_.offsets_size()); + size_t bstart = (offset == 0) ? 0 : data_.offsets(offset-1); + size_t bstop = data_.offsets(offset); + return {&data_.content()[bstart], bstop - bstart}; +} + +S3DelayedRetriever::S3DelayedRetriever(objstripe::ObjectStripeIndex const& index, DeserializeStrategy strategy): + deserializers_{std::move(strategy)} +{ + dataProducts_.reserve(index.products_size()); + deserializers_.reserve(index.products_size()); + dataBuffers_.resize(index.products_size(), nullptr); + stripes_.resize(index.products_size()); + size_t i{0}; + for(auto const& pi : index.products()) { + TClass* cls = TClass::GetClass(pi.producttype().c_str()); + if ( cls == nullptr ) { + throw std::runtime_error("No TClass reflection available for " + pi.productname()); + } + dataBuffers_[i] = cls->New(); + dataProducts_.emplace_back(i, &dataBuffers_[i], pi.productname(), cls, this); + deserializers_.emplace_back(cls); + ++i; + } +} + +S3DelayedRetriever::~S3DelayedRetriever() { + auto it = dataProducts_.begin(); + for(void * b: dataBuffers_) { + it->classType()->Destructor(b); + ++it; + } +} + +void S3DelayedRetriever::getAsync(DataProductRetriever& product, int index, TaskHolder callback) { + assert(&product == &dataProducts_[index]); + assert(product.address() == &dataBuffers_[index]); + assert(stripes_[index]); + TaskHolder fetchCallback(*callback.group(), make_functor_task( + [this, index, callback=std::move(callback)]() mutable { + auto start = std::chrono::high_resolution_clock::now(); + auto buf = stripes_[index]->bufferAt(globalEventIndex_); + auto readSize = deserializers_[index].deserialize(buf.data(), 
buf.size(), *dataProducts_[index].address()); + dataProducts_[index].setSize(readSize); + deserializeTime_ += std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start); + } + )); + stripes_[index]->fetch(std::move(fetchCallback)); +} + S3Source::S3Source(unsigned int iNLanes, std::string iObjPrefix, int iVerbose, unsigned long long iNEvents, S3ConnectionRef conn): SharedSourceBase(iNEvents), objPrefix_(std::move(iObjPrefix)), @@ -34,7 +112,7 @@ S3Source::S3Source(unsigned int iNLanes, std::string iObjPrefix, int iVerbose, u currentProductStripes_.resize(index_.products_size()); - laneInfos_.reserve(iNLanes); + laneRetrievers_.reserve(iNLanes); for(unsigned int i = 0; i< iNLanes; ++i) { DeserializeStrategy strategy; switch(index_.serializestrategy()) { @@ -45,69 +123,22 @@ S3Source::S3Source(unsigned int iNLanes, std::string iObjPrefix, int iVerbose, u strategy = DeserializeStrategy::make>(); break; } - laneInfos_.emplace_back(index_, std::move(strategy)); + laneRetrievers_.emplace_back(index_, std::move(strategy)); } readTime_ += std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start); } -S3Source::LaneInfo::LaneInfo(objstripe::ObjectStripeIndex const& index, DeserializeStrategy deserialize): - deserializers_{std::move(deserialize)} -{ - dataProducts_.reserve(index.products_size()); - dataBuffers_.resize(index.products_size(), nullptr); - deserializers_.reserve(index.products_size()); - size_t i{0}; - for(auto const& pi : index.products()) { - TClass* cls = TClass::GetClass(pi.producttype().c_str()); - if ( cls == nullptr ) { - throw std::runtime_error("No TClass reflection available for " + pi.productname()); - } - dataBuffers_[i] = cls->New(); - dataProducts_.emplace_back(i, &dataBuffers_[i], pi.productname(), cls, &delayedRetriever_); - deserializers_.emplace_back(cls); - ++i; - } -} - -S3Source::LaneInfo::~LaneInfo() { - auto it = dataProducts_.begin(); - for( void * b: dataBuffers_) { - it->classType()->Destructor(b); 
- ++it; - } -} - -std::pair DelayedProductStripeRetriever::bufferAt(size_t globalEventIndex) const { - std::call_once(flag_, [this](){ - conn_->get(name_, [this](S3Request* req) { - if ( req->status == S3Request::Status::ok ) { - if ( not data_.ParseFromString(req->buffer) ) { - throw std::runtime_error("Could not deserialize ProductStripe for key " + name_); - } - } - else { throw std::runtime_error("Could not retrieve ProductStripe for key " + name_); } - }); - }); - assert(globalOffset_ == data_.globaloffset()); - assert(globalOffset_ <= globalEventIndex); - size_t offset = globalEventIndex - globalOffset_; - assert(offset < data_.offsets_size()); - size_t bstart = (offset == 0) ? 0 : data_.offsets(offset-1); - size_t bstop = data_.offsets(offset); - return {&data_.content()[bstart], bstop - bstart}; -} - size_t S3Source::numberOfDataProducts() const { - return laneInfos_[0].dataProducts_.size(); + return index_.products_size(); } std::vector& S3Source::dataProducts(unsigned int iLane, long iEventIndex) { - return laneInfos_[iLane].dataProducts_; + return laneRetrievers_[iLane].dataProducts(); } EventIdentifier S3Source::eventIdentifier(unsigned int iLane, long iEventIndex) { - return laneInfos_[iLane].eventID_; + return laneRetrievers_[iLane].event(); } void S3Source::readEventAsync(unsigned int iLane, long iEventIndex, OptionalTaskHolder iTask) { @@ -127,48 +158,32 @@ void S3Source::readEventAsync(unsigned int iLane, long iEventIndex, OptionalTask } const auto event = currentEventStripe_.events(nextEventInStripe_); if ( verbose_ >= 2 ) std::cout << event.DebugString() << "\n"; + auto& retriever = laneRetrievers_[iLane]; size_t globalEventIndex = event.offset(); - laneInfos_[iLane].eventID_.run = event.run(); - laneInfos_[iLane].eventID_.lumi = event.lumi(); - laneInfos_[iLane].eventID_.event = event.event(); - auto stripes = std::vector>(); - stripes.reserve(currentProductStripes_.size()); + + auto productinfo = std::begin(index_.products()); size_t i{0}; for 
(auto& ps : currentProductStripes_) { - const auto& productinfo = index_.products(i++); + const auto& productinfo = index_.products(i); if ( nextEventInStripe_ % productinfo.flushsize() == 0 ) { auto new_ps = std::make_shared( conn_, objPrefix_ + productinfo.productname() + std::to_string(globalEventIndex), globalEventIndex ); + if ( verbose_ >= 2 ) { + std::cout << "setting lane " << iLane << "to read stripe " << + objPrefix_ + productinfo.productname() + std::to_string(globalEventIndex) << "\n"; + } std::swap(ps, new_ps); } - stripes.push_back(ps); + retriever.setStripe(i, ps); + i++; } + retriever.setEvent(globalEventIndex, {event.run(), event.lumi(), event.event()}); + optTask.releaseToTaskHolder(); ++nextEventInStripe_; - auto group = optTask.group(); - group->run([this, task=optTask.releaseToTaskHolder(), iLane, globalEventIndex, stripes=std::move(stripes)]() { - auto& laneInfo = this->laneInfos_[iLane]; - - auto it_stripe = stripes.cbegin(); - auto it_deserialize = laneInfo.deserializers_.begin(); - auto it_product = laneInfo.dataProducts_.begin(); - while ( it_stripe != stripes.cend() ) { - auto start = std::chrono::high_resolution_clock::now(); - auto [buf, len] = (*it_stripe)->bufferAt(globalEventIndex); - auto split = std::chrono::high_resolution_clock::now(); - laneInfo.readTime_ += std::chrono::duration_cast(split - start); - auto readSize = (*it_deserialize).deserialize(buf, len, *it_product->address()); - if ( verbose_ >= 3 ) std::cout << "read " << readSize << " bytes\n"; - it_product->setSize(readSize); - ++it_stripe; - ++it_deserialize; - ++it_product; - laneInfo.deserializeTime_ += std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - split); - } - }); } readTime_ +=std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start); }); @@ -176,36 +191,28 @@ void S3Source::readEventAsync(unsigned int iLane, long iEventIndex, OptionalTask void S3Source::printSummary() const { std::cout <<"\nSource:\n" - " serial read 
time: "<blockingTime().count()<<"us\n"; }; std::chrono::microseconds S3Source::serialReadTime() const { return readTime_; } -std::chrono::microseconds S3Source::parallelReadTime() const { - auto time = std::chrono::microseconds::zero(); - for(auto const& l : laneInfos_) { - time += l.readTime_; - } - return time; -} - std::chrono::microseconds S3Source::decompressTime() const { auto time = std::chrono::microseconds::zero(); - for(auto const& l : laneInfos_) { - time += l.decompressTime_; + for(auto const& l : laneRetrievers_) { + time += l.decompressTime(); } return time; } std::chrono::microseconds S3Source::deserializeTime() const { auto time = std::chrono::microseconds::zero(); - for(auto const& l : laneInfos_) { - time += l.deserializeTime_; + for(auto const& l : laneRetrievers_) { + time += l.deserializeTime(); } return time; } diff --git a/S3Source.h b/S3Source.h index f09daf5..c0185c5 100644 --- a/S3Source.h +++ b/S3Source.h @@ -7,6 +7,8 @@ #include #include +#include "tbb/concurrent_vector.h" + #include "SharedSourceBase.h" #include "DataProductRetriever.h" #include "DelayedProductRetriever.h" @@ -17,15 +19,12 @@ namespace cce::tf { -class S3DelayedRetriever : public DelayedProductRetriever { - void getAsync(DataProductRetriever&, int index, TaskHolder) final {} -}; - class DelayedProductStripeRetriever { public: DelayedProductStripeRetriever(S3ConnectionRef conn, std::string name, size_t globalOffset): - conn_(conn), name_(name), globalOffset_(globalOffset) {}; - std::pair bufferAt(size_t globalEventIndex) const; + conn_(conn), name_(name), globalOffset_(globalOffset), state_{State::unretrieved} {}; + void fetch(TaskHolder&& callback) const; + std::string_view bufferAt(size_t globalEventIndex) const; ~DelayedProductStripeRetriever() {}; private: @@ -33,10 +32,46 @@ class DelayedProductStripeRetriever { std::string name_; size_t globalOffset_; - mutable std::once_flag flag_; + enum class State {unretrieved, retrieving, retrieved}; + mutable std::atomic 
state_; + mutable tbb::concurrent_vector waiters_; mutable objstripe::ProductStripe data_; }; + +class S3DelayedRetriever : public DelayedProductRetriever { + public: + S3DelayedRetriever(objstripe::ObjectStripeIndex const&, DeserializeStrategy); + ~S3DelayedRetriever(); + + S3DelayedRetriever(S3DelayedRetriever&&) = default; + S3DelayedRetriever(S3DelayedRetriever const&) = delete; + S3DelayedRetriever& operator=(S3DelayedRetriever&&) = default; + S3DelayedRetriever& operator=(S3DelayedRetriever const&) = delete; + + EventIdentifier event() const { return eventID_; } + void setEvent(size_t globalEventIndex, EventIdentifier&& ev) { globalEventIndex_ = globalEventIndex; eventID_ = ev; } + + std::chrono::microseconds decompressTime() const { return decompressTime_; } + std::chrono::microseconds deserializeTime() const { return deserializeTime_; } + + void setStripe(size_t index, const std::shared_ptr& ptr) { stripes_[index] = ptr; } + + std::vector& dataProducts() { return dataProducts_; } + + void getAsync(DataProductRetriever& product, int index, TaskHolder callback) final; + + private: + size_t globalEventIndex_; + EventIdentifier eventID_; + std::vector dataProducts_; + std::vector dataBuffers_; + DeserializeStrategy deserializers_; + std::vector> stripes_; + std::chrono::microseconds decompressTime_{0}; + std::chrono::microseconds deserializeTime_{0}; +}; + class S3Source : public SharedSourceBase { public: S3Source(unsigned int iNLanes, std::string iObjPrefix, int iVerbose, unsigned long long iNEvents, S3ConnectionRef conn); @@ -65,32 +100,12 @@ class S3Source : public SharedSourceBase { objstripe::ObjectStripeIndex index_; - struct LaneInfo { - LaneInfo(objstripe::ObjectStripeIndex const&, DeserializeStrategy); - - LaneInfo(LaneInfo&&) = default; - LaneInfo(LaneInfo const&) = delete; - - LaneInfo& operator=(LaneInfo&&) = default; - LaneInfo& operator=(LaneInfo const&) = delete; - - EventIdentifier eventID_; - std::vector dataProducts_; - std::vector 
dataBuffers_; - DeserializeStrategy deserializers_; - S3DelayedRetriever delayedRetriever_; - std::chrono::microseconds readTime_{0}; - std::chrono::microseconds decompressTime_{0}; - std::chrono::microseconds deserializeTime_{0}; - ~LaneInfo(); - }; - // mutated only by methods called in queue_ size_t nextEventStripe_ = 0; size_t nextEventInStripe_ = 0; objstripe::EventStripe currentEventStripe_; std::vector> currentProductStripes_; - std::vector laneInfos_; + std::vector laneRetrievers_; std::chrono::microseconds readTime_; }; } From 46fb3aefbd4df76b695c82af1eec6a40b5deea9d Mon Sep 17 00:00:00 2001 From: Nick Smith Date: Wed, 29 Jun 2022 14:19:11 -0500 Subject: [PATCH 18/43] Implement zstd compression --- S3Outputer.cc | 120 +++++++++++++++++++++++++++++++++++++++++---- S3Outputer.h | 32 +++++++++--- S3Source.cc | 65 ++++++++++++++++++++---- S3Source.h | 11 +++-- objectstripe.proto | 14 +++++- 5 files changed, 211 insertions(+), 31 deletions(-) diff --git a/S3Outputer.cc b/S3Outputer.cc index b845b21..c056b12 100644 --- a/S3Outputer.cc +++ b/S3Outputer.cc @@ -4,8 +4,95 @@ #include "UnrolledSerializerWrapper.h" #include "FunctorTask.h" +#if ZSTD_VERSION_NUMBER < (1*100*100 + 3*100) +#error("zstd is too old") +#endif + using namespace cce::tf; +StreamCompressor::StreamCompressor(const objstripe::Compression& setting): + setting_{setting} +{ + switch ( setting_.type() ) { + case objstripe::CompressionType::kNone: + break; + case objstripe::CompressionType::kZSTD: + zstd_.reset(ZSTD_createCStream()); + ZSTD_CCtx_setParameter(zstd_.get(), ZSTD_c_compressionLevel, setting_.level()); + break; + } +} + +namespace { +size_t zstd_compress(ZSTD_CCtx* ctx, const std::string_view blob, std::string& out, bool flush) { + size_t tail{out.size()}; + if ( out.capacity() < ZSTD_CStreamOutSize() ) out.resize(ZSTD_CStreamOutSize()); + else out.resize(out.capacity()); + ZSTD_outBuffer_s obuf{.dst=out.data(), .size=out.size(), .pos=tail}; + + size_t status; + if ( flush ) { + 
ZSTD_inBuffer_s ibuf{.src=nullptr, .size=0, .pos=0}; + while ( status != 0 ) { + status = ZSTD_compressStream2(ctx, &obuf, &ibuf, ZSTD_e_end); + if ( ZSTD_isError(status) ) { + std::cerr <<"ERROR in compression " << ZSTD_getErrorName(status) << std::endl; + } + if ( obuf.pos == obuf.size ) { + size_t new_size = obuf.size * 2; + out.resize(new_size); + obuf.dst = out.data(); + obuf.size = new_size; + } + } + } else { + ZSTD_inBuffer_s ibuf{.src=blob.data(), .size=blob.size(), .pos=0}; + while ( ibuf.pos < ibuf.size ) { + status = ZSTD_compressStream2(ctx, &obuf, &ibuf, ZSTD_e_continue); + if ( ZSTD_isError(status) ) { + std::cerr <<"ERROR in compression " << ZSTD_getErrorName(status) << std::endl; + } + if ( obuf.pos == obuf.size ) { + size_t new_size = obuf.size * 2; + out.resize(new_size); + obuf.dst = out.data(); + obuf.size = new_size; + } + } + } + out.resize(obuf.pos); + // we are supposed to get a hint from ZSTD of the bytes left in internal buffers of CCtx + // but it doesn't appear to be nonzero + return status; +} +} + +size_t StreamCompressor::write(const std::string_view blob, std::string& out) { + switch ( setting_.type() ) { + case objstripe::CompressionType::kNone: + out.append(blob); + return 0; + case objstripe::CompressionType::kZSTD: + return ::zstd_compress(zstd_.get(), blob, out, false); + default: + assert(false); + return 0; + } +} + +void StreamCompressor::flush(std::string& out) { + switch ( setting_.type() ) { + case objstripe::CompressionType::kNone: + return; + case objstripe::CompressionType::kZSTD: + ::zstd_compress(zstd_.get(), {}, out, true); + return; + default: + assert(false); + return; + } +} + void S3Outputer::setupForLane(unsigned int iLaneIndex, std::vector const& iDPs) { auto& s = serializers_[iLaneIndex]; switch(index_.serializestrategy()) { @@ -31,7 +118,8 @@ void S3Outputer::setupForLane(unsigned int iLaneIndex, std::vectorset_producttype(ss.className()); prod->set_flushsize(0); 
prod->set_flushminbytes(productBufferFlushMinBytes_); - buffers_.emplace_back(objPrefix_ + prod->productname(), prod); + // TODO: choose compression setting based on properties of ss? + buffers_.emplace_back(objPrefix_ + prod->productname(), prod, defaultCompression_); } } // all lanes see same products? if not we'll need a map @@ -153,12 +241,19 @@ void S3Outputer::appendProductBuffer( using namespace std::string_literals; auto start = std::chrono::high_resolution_clock::now(); + size_t pendingbytes{0}; if ( not last ) { - buf.buffer_.mutable_content()->append(blob); - buf.buffer_.add_offsets(buf.buffer_.content().size()); + buf.stripe_.add_counts(blob.size()); + pendingbytes = buf.compressor_.write(blob, *buf.stripe_.mutable_content()); + } + const size_t bufferNevents = buf.stripe_.counts_size(); + size_t bufferNbytes = buf.stripe_.content().size(); + if ( pendingbytes > 0 ) { + std::cout << "product buffer for "s + std::string(buf.info_->productname()) + + " put " + std::to_string(blob.size()) + " bytes in" + " and has "s + std::to_string(bufferNbytes) + " bytes out" + " and "s + std::to_string(pendingbytes) + " bytes pending\n"; } - const size_t bufferNevents = buf.buffer_.offsets_size(); - const size_t bufferNbytes = buf.buffer_.content().size(); // first flush when we exceed min size and have an even divisor of eventFlushSize_ // subsequent flush when we reach productFlushSize @@ -175,16 +270,21 @@ void S3Outputer::appendProductBuffer( || (last && bufferNevents > 0) ) { + buf.compressor_.flush(*buf.stripe_.mutable_content()); + bufferNbytes = buf.stripe_.content().size(); if(verbose_ >= 2) { std::cout << "product buffer for "s + std::string(buf.info_->productname()) + " is full ("s + std::to_string(bufferNbytes) + " bytes, "s + std::to_string(bufferNevents) + " events), flushing\n"; } objstripe::ProductStripe pOut; - pOut.mutable_offsets()->Reserve(bufferNevents); + pOut.mutable_counts()->Reserve(bufferNevents); pOut.mutable_content()->reserve(bufferNbytes); 
- pOut.set_globaloffset(buf.buffer_.globaloffset() + bufferNevents); - std::swap(buf.buffer_, pOut); + pOut.set_globaloffset(buf.stripe_.globaloffset() + bufferNevents); + + std::swap(buf.stripe_, pOut); + + pOut.set_allocated_compression(new objstripe::Compression(buf.compressor_.getCompression())); std::string name = buf.prefix_; name += std::to_string(pOut.globaloffset()); iCallback.group()->run( @@ -245,8 +345,8 @@ class Maker : public OutputerMakerBase { std::cerr << "no object prefix given for S3Outputer\n"; return {}; } - auto productFlush = params.get("productFlush", 1024*512); - auto eventFlush = params.get("eventFlush", 24); + auto productFlush = params.get("productFlush", 1024*128); + auto eventFlush = params.get("eventFlush", 144); auto connfile = params.get("conn"); if(not connfile) { std::cerr <<"no connection configuration file name given for S3Outputer\n"; diff --git a/S3Outputer.h b/S3Outputer.h index 0547492..92d7773 100644 --- a/S3Outputer.h +++ b/S3Outputer.h @@ -6,6 +6,7 @@ #include #include +#include "zstd.h" #include "tbb/task_group.h" #include "OutputerBase.h" @@ -19,6 +20,20 @@ namespace cce::tf { +class StreamCompressor { + public: + StreamCompressor(const objstripe::Compression& setting); + const objstripe::Compression& getCompression() const { return setting_; } + size_t write(const std::string_view blob, std::string& out); + void flush(std::string& out); + + private: + objstripe::Compression setting_; + + struct ZSTDDeleter { void operator()(ZSTD_CStream* s) const {ZSTD_freeCStream(s);} }; + std::unique_ptr zstd_; +}; + class S3Outputer : public OutputerBase { public: S3Outputer(unsigned int iNLanes, std::string objPrefix, int iVerbose, size_t iProductBufferFlush, size_t iEventFlushSize, S3ConnectionRef conn): @@ -33,9 +48,12 @@ class S3Outputer : public OutputerBase { parallelTime_{0} { index_.set_eventstripesize(eventFlushSize_); + currentEventStripe_.mutable_events()->Reserve(eventFlushSize_); + // TODO: make configurable 
index_.set_serializestrategy(objstripe::SerializeStrategy::kRoot); - currentEventStripe_.mutable_events()->Reserve(eventFlushSize_); + defaultCompression_.set_type(objstripe::CompressionType::kZSTD); + defaultCompression_.set_level(4); } void setupForLane(unsigned int iLaneIndex, std::vector const& iDPs) final; @@ -46,13 +64,14 @@ class S3Outputer : public OutputerBase { private: struct ProductOutputBuffer { - ProductOutputBuffer(const std::string& prefix, objstripe::ProductInfo* info) : - prefix_{prefix}, info_{info} {}; + ProductOutputBuffer(const std::string& prefix, objstripe::ProductInfo* info, const objstripe::Compression& comp) : + prefix_{prefix}, info_{info}, compressor_{comp} {}; const std::string prefix_; objstripe::ProductInfo* info_; // owned by index_ - objstripe::ProductStripe buffer_; - SerialTaskQueue appendQueue_; + StreamCompressor compressor_; + objstripe::ProductStripe stripe_{}; + SerialTaskQueue appendQueue_{}; std::chrono::microseconds appendTime_{0}; }; @@ -73,6 +92,7 @@ class S3Outputer : public OutputerBase { const size_t productBufferFlushMinBytes_; const size_t eventFlushSize_; S3ConnectionRef conn_; + objstripe::Compression defaultCompression_{}; // only modified by productReadyAsync() mutable std::vector serializers_; @@ -80,7 +100,7 @@ class S3Outputer : public OutputerBase { // only modified in collateProducts() mutable SerialTaskQueue collateQueue_; mutable size_t eventGlobalOffset_{0}; - mutable objstripe::EventStripe currentEventStripe_; + mutable objstripe::EventStripe currentEventStripe_{}; mutable std::chrono::microseconds collateTime_; // only modified in appendProductBuffer() diff --git a/S3Source.cc b/S3Source.cc index aa419f6..4f75a8a 100644 --- a/S3Source.cc +++ b/S3Source.cc @@ -1,4 +1,5 @@ #include +#include "zstd.h" #include "S3Source.h" #include "SourceFactory.h" #include "Deserializer.h" @@ -7,14 +8,60 @@ using namespace cce::tf; +namespace { +struct ZSTD_ContextHolder { + ZSTD_ContextHolder() { ctx = 
ZSTD_createDCtx(); } + ~ZSTD_ContextHolder() { ZSTD_freeDCtx(ctx); } + ZSTD_DCtx* ctx; +}; + +size_t zstd_perthread_decompress(void* dst, size_t dstCapacity, const void* src, size_t compressedSize) { + static thread_local ZSTD_ContextHolder holder{}; + return ZSTD_decompressDCtx(holder.ctx, dst, dstCapacity, src, compressedSize); +} + +void decompress_stripe(const objstripe::Compression& setting, std::string& blob, std::string& out, size_t dSize) { + switch ( setting.type() ) { + case objstripe::CompressionType::kNone: + std::swap(blob, out); + return; + case objstripe::CompressionType::kZSTD: + out.resize(dSize); + size_t status = ZSTD_decompress(out.data(), out.size(), blob.data(), blob.size()); + // size_t status = zstd_perthread_decompress(out.data(), out.size(), blob.data(), blob.size()); + if ( ZSTD_isError(status) ) { + std::cerr <<"ERROR in decompression " << ZSTD_getErrorName(status) << std::endl; + } + if (status < dSize) { + std::cerr <<"ERROR in decompression, expected " << dSize << " bytes but only got " << status << std::endl; + } + blob.clear(); + blob.shrink_to_fit(); + return; + } +} +} + void DelayedProductStripeRetriever::fetch(TaskHolder&& callback) const { auto this_state{State::unretrieved}; if ( state_.compare_exchange_strong(this_state, State::retrieving) ) { conn_->get(name_, [this, callback=std::move(callback)](S3Request* req) mutable { if ( req->status == S3Request::Status::ok ) { + auto start = std::chrono::high_resolution_clock::now(); if ( not data_.ParseFromString(req->buffer) ) { throw std::runtime_error("Could not deserialize ProductStripe for key " + name_); } + offsets_.reserve(data_.counts_size() + 1); + size_t nbytes{0}; + offsets_.push_back(nbytes); + for (const auto& c : data_.counts()) { + nbytes += c; + offsets_.push_back(nbytes); + } + assert(offsets_.size() == data_.counts_size() + 1); + ::decompress_stripe(data_.compression(), *data_.mutable_content(), content_, nbytes); + assert(nbytes == content_.size()); + 
decompressTime_ = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start); state_ = State::retrieved; callback.doneWaiting(); for(auto& w : waiters_) w.doneWaiting(); @@ -33,11 +80,11 @@ std::string_view DelayedProductStripeRetriever::bufferAt(size_t globalEventIndex assert(state_ == State::retrieved); assert(globalOffset_ == data_.globaloffset()); assert(globalOffset_ <= globalEventIndex); - size_t offset = globalEventIndex - globalOffset_; - assert(offset < data_.offsets_size()); - size_t bstart = (offset == 0) ? 0 : data_.offsets(offset-1); - size_t bstop = data_.offsets(offset); - return {&data_.content()[bstart], bstop - bstart}; + size_t iOffset = globalEventIndex - globalOffset_; + assert(iOffset < data_.counts_size()); + size_t bstart = offsets_[iOffset]; + size_t bstop = offsets_[iOffset+1]; + return {&content_[bstart], bstop - bstart}; } S3DelayedRetriever::S3DelayedRetriever(objstripe::ObjectStripeIndex const& index, DeserializeStrategy strategy): @@ -176,6 +223,8 @@ void S3Source::readEventAsync(unsigned int iLane, long iEventIndex, OptionalTask objPrefix_ + productinfo.productname() + std::to_string(globalEventIndex) << "\n"; } std::swap(ps, new_ps); + // record decompress time of old stripe + if ( new_ps ) decompressTime_ += new_ps->decompressTime(); } retriever.setStripe(i, ps); i++; @@ -202,11 +251,7 @@ std::chrono::microseconds S3Source::serialReadTime() const { } std::chrono::microseconds S3Source::decompressTime() const { - auto time = std::chrono::microseconds::zero(); - for(auto const& l : laneRetrievers_) { - time += l.decompressTime(); - } - return time; + return decompressTime_; } std::chrono::microseconds S3Source::deserializeTime() const { diff --git a/S3Source.h b/S3Source.h index c0185c5..1adc11d 100644 --- a/S3Source.h +++ b/S3Source.h @@ -26,6 +26,7 @@ class DelayedProductStripeRetriever { void fetch(TaskHolder&& callback) const; std::string_view bufferAt(size_t globalEventIndex) const; 
~DelayedProductStripeRetriever() {}; + std::chrono::microseconds decompressTime() const { return decompressTime_; } private: S3ConnectionRef conn_; @@ -34,8 +35,11 @@ class DelayedProductStripeRetriever { enum class State {unretrieved, retrieving, retrieved}; mutable std::atomic state_; - mutable tbb::concurrent_vector waiters_; - mutable objstripe::ProductStripe data_; + mutable tbb::concurrent_vector waiters_{}; + mutable objstripe::ProductStripe data_{}; + mutable std::vector offsets_{}; + mutable std::string content_{}; + mutable std::chrono::microseconds decompressTime_{0}; }; @@ -52,7 +56,6 @@ class S3DelayedRetriever : public DelayedProductRetriever { EventIdentifier event() const { return eventID_; } void setEvent(size_t globalEventIndex, EventIdentifier&& ev) { globalEventIndex_ = globalEventIndex; eventID_ = ev; } - std::chrono::microseconds decompressTime() const { return decompressTime_; } std::chrono::microseconds deserializeTime() const { return deserializeTime_; } void setStripe(size_t index, const std::shared_ptr& ptr) { stripes_[index] = ptr; } @@ -68,7 +71,6 @@ class S3DelayedRetriever : public DelayedProductRetriever { std::vector dataBuffers_; DeserializeStrategy deserializers_; std::vector> stripes_; - std::chrono::microseconds decompressTime_{0}; std::chrono::microseconds deserializeTime_{0}; }; @@ -107,6 +109,7 @@ class S3Source : public SharedSourceBase { std::vector> currentProductStripes_; std::vector laneRetrievers_; std::chrono::microseconds readTime_; + std::chrono::microseconds decompressTime_{0}; }; } diff --git a/objectstripe.proto b/objectstripe.proto index 767e428..69e5233 100644 --- a/objectstripe.proto +++ b/objectstripe.proto @@ -7,6 +7,17 @@ enum SerializeStrategy { kRootUnrolled = 1; } +enum CompressionType { + kNone = 0; + kZSTD = 1; +} + +message Compression { + optional CompressionType type = 1; + optional uint32 level = 2; + optional string dictionaryPath = 3; +} + message ProductInfo { optional string productName = 1; 
optional string productType = 2; @@ -35,7 +46,8 @@ message EventStripe { } message ProductStripe { - repeated uint32 offsets = 1 [packed = true]; + repeated uint32 counts = 1 [packed = true]; optional bytes content = 2; optional uint64 globalOffset = 3; + optional Compression compression = 4; } From 6b109543cbe0a5ca2c69f67687209b3461167da2 Mon Sep 17 00:00:00 2001 From: Nick Smith Date: Wed, 29 Jun 2022 16:38:54 -0500 Subject: [PATCH 19/43] Steal WaitingTaskList from cmssw --- CMakeLists.txt | 1 + S3Source.cc | 5 +- S3Source.h | 3 +- TaskHolder.h | 8 ++ WaitingTaskList.cc | 185 +++++++++++++++++++++++++++++++++++++++++++++ WaitingTaskList.h | 74 ++++++++++++++++++ 6 files changed, 272 insertions(+), 4 deletions(-) create mode 100644 WaitingTaskList.cc create mode 100644 WaitingTaskList.h diff --git a/CMakeLists.txt b/CMakeLists.txt index b8e71ce..3c113e8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -84,6 +84,7 @@ add_executable(threaded_io_test outputerFactoryGenerator.cc WaiterFactory.cc waiterFactoryGenerator.cc + WaitingTaskList.cc ScaleWaiter.cc EventSleepWaiter.cc EventUnevenSleepWaiter.cc diff --git a/S3Source.cc b/S3Source.cc index 4f75a8a..e2bc524 100644 --- a/S3Source.cc +++ b/S3Source.cc @@ -64,15 +64,14 @@ void DelayedProductStripeRetriever::fetch(TaskHolder&& callback) const { decompressTime_ = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start); state_ = State::retrieved; callback.doneWaiting(); - for(auto& w : waiters_) w.doneWaiting(); + waiters_.doneWaiting(); } else { throw std::runtime_error("Could not retrieve ProductStripe for key " + name_); } }); } else if (this_state == State::retrieved ) { return; } else { - // TODO: check again if not State::retrieved? 
- waiters_.emplace_back(std::move(callback)); + waiters_.add(std::move(callback)); } } diff --git a/S3Source.h b/S3Source.h index 1adc11d..204296f 100644 --- a/S3Source.h +++ b/S3Source.h @@ -13,6 +13,7 @@ #include "DataProductRetriever.h" #include "DelayedProductRetriever.h" #include "SerialTaskQueue.h" +#include "WaitingTaskList.h" #include "DeserializeStrategy.h" #include "S3Common.h" #include "objectstripe.pb.h" @@ -35,7 +36,7 @@ class DelayedProductStripeRetriever { enum class State {unretrieved, retrieving, retrieved}; mutable std::atomic state_; - mutable tbb::concurrent_vector waiters_{}; + mutable WaitingTaskList waiters_{}; mutable objstripe::ProductStripe data_{}; mutable std::vector offsets_{}; mutable std::string content_{}; diff --git a/TaskHolder.h b/TaskHolder.h index cb815f4..d672a4d 100644 --- a/TaskHolder.h +++ b/TaskHolder.h @@ -8,6 +8,8 @@ namespace cce::tf { class TaskHolder { public: + friend class WaitingTaskList; + TaskHolder(): group_{nullptr}, task_{nullptr} {} TaskHolder(tbb::task_group& iGroup, std::unique_ptr iTask): group_{&iGroup}, task_{iTask.release()} { @@ -66,6 +68,12 @@ class TaskHolder { } } private: + TaskBase* release_no_decrement() { + auto t = task_; + task_ = nullptr; + return t; + } + tbb::task_group* group_; TaskBase* task_; }; diff --git a/WaitingTaskList.cc b/WaitingTaskList.cc new file mode 100644 index 0000000..4f1c5ff --- /dev/null +++ b/WaitingTaskList.cc @@ -0,0 +1,185 @@ +// -*- C++ -*- +// +// Package: Concurrency +// Class : WaitingTaskList +// +// Implementation: +// [Notes on implementation] +// +// Original Author: Chris Jones +// Created: Thu Feb 21 13:46:45 CST 2013 +// $Id$ +// + +// system include files + +// user include files +#include "oneapi/tbb/task.h" +#include +#include + +#include "WaitingTaskList.h" + +using namespace cce::tf; + +#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2) +#define hardware_pause() asm("") +#endif +#if defined(__x86_64__) || defined(__i386__) +#undef 
hardware_pause +#define hardware_pause() asm("pause") +#endif + +WaitingTaskList::WaitingTaskList(unsigned int iInitialSize) + : m_head{nullptr}, + m_nodeCache{new WaitNode[iInitialSize]}, + m_nodeCacheSize{iInitialSize}, + m_lastAssignedCacheIndex{0}, + m_waiting{true} { + auto nodeCache = m_nodeCache.get(); + for (auto it = nodeCache, itEnd = nodeCache + m_nodeCacheSize; it != itEnd; ++it) { + it->m_fromCache = true; + } +} + +// +// member functions +// +void WaitingTaskList::reset() { + unsigned int nSeenTasks = m_lastAssignedCacheIndex; + m_lastAssignedCacheIndex = 0; + assert(m_head == nullptr); + if (nSeenTasks > m_nodeCacheSize) { + //need to expand so next time we don't have to do any + // memory requests + m_nodeCacheSize = nSeenTasks; + m_nodeCache = std::make_unique(nSeenTasks); + auto nodeCache = m_nodeCache.get(); + for (auto it = nodeCache, itEnd = nodeCache + m_nodeCacheSize; it != itEnd; ++it) { + it->m_fromCache = true; + } + } + //this will make sure all cores see the changes + m_waiting = true; +} + +WaitingTaskList::WaitNode* WaitingTaskList::createNode(oneapi::tbb::task_group* iGroup, TaskBase* iTask) { + unsigned int index = m_lastAssignedCacheIndex++; + + WaitNode* returnValue; + if (index < m_nodeCacheSize) { + returnValue = m_nodeCache.get() + index; + } else { + returnValue = new WaitNode; + returnValue->m_fromCache = false; + } + returnValue->m_task = iTask; + returnValue->m_group = iGroup; + //No other thread can see m_next yet. 
The caller to create node + // will be doing a synchronization operation anyway which will + // make sure m_task and m_next are synched across threads + returnValue->m_next.store(returnValue, std::memory_order_relaxed); + + return returnValue; +} + +void WaitingTaskList::add(TaskHolder iTask) { + if (m_waiting) { + auto task = iTask.release_no_decrement(); + WaitNode* newHead = createNode(iTask.group(), task); + //This exchange is sequentially consistent thereby + // ensuring ordering between it and setNextNode + WaitNode* oldHead = m_head.exchange(newHead); + newHead->setNextNode(oldHead); + + //For the case where oldHead != nullptr, + // even if 'm_waiting' changed, we don't + // have to recheck since we beat 'announce()' in + // the ordering of 'm_head.exchange' call so iTask + // is guaranteed to be in the link list + + if (nullptr == oldHead) { + newHead->setNextNode(nullptr); + if (!m_waiting) { + //if finished waiting right before we did the + // exchange our task will not be run. Also, + // additional threads may be calling add() and swapping + // heads and linking us to the new head. 
+ // It is safe to call announce from multiple threads + announce(); + } + } + } +} + +void WaitingTaskList::add(oneapi::tbb::task_group* iGroup, TaskBase* iTask) { + iTask->increment_ref_count(); + if (!m_waiting) { + if (iTask->decrement_ref_count()) { + iGroup->run([iTask]() { + iTask->execute(); + delete iTask; + }); + } + } else { + WaitNode* newHead = createNode(iGroup, iTask); + //This exchange is sequentially consistent thereby + // ensuring ordering between it and setNextNode + WaitNode* oldHead = m_head.exchange(newHead); + newHead->setNextNode(oldHead); + + //For the case where oldHead != nullptr, + // even if 'm_waiting' changed, we don't + // have to recheck since we beat 'announce()' in + // the ordering of 'm_head.exchange' call so iTask + // is guaranteed to be in the link list + + if (nullptr == oldHead) { + if (!m_waiting) { + //if finished waiting right before we did the + // exchange our task will not be run. Also, + // additional threads may be calling add() and swapping + // heads and linking us to the new head. + // It is safe to call announce from multiple threads + announce(); + } + } + } +} + +void WaitingTaskList::announce() { + //Need a temporary storage since one of these tasks could + // cause the next event to start processing which would refill + // this waiting list after it has been reset + WaitNode* n = m_head.exchange(nullptr); + WaitNode* next; + while (n) { + //it is possible that 'WaitingTaskList::add' is running in a different + // thread and we have a new 'head' but the old head has not yet been + // attached to the new head (we identify this since 'nextNode' will return itself). + // In that case we have to wait until the link has been established before going on. 
+ while (n == (next = n->nextNode())) { + hardware_pause(); + } + auto t = n->m_task; + auto g = n->m_group; + if (!n->m_fromCache) { + delete n; + } + n = next; + + //the task may indirectly call WaitingTaskList::reset + // so we need to call spawn after we are done using the node. + if (t->decrement_ref_count()) { + g->run([t]() { + t->execute(); + delete t; + }); + } + } +} + +void WaitingTaskList::doneWaiting() { + m_waiting = false; + announce(); +} diff --git a/WaitingTaskList.h b/WaitingTaskList.h new file mode 100644 index 0000000..ccadac6 --- /dev/null +++ b/WaitingTaskList.h @@ -0,0 +1,74 @@ +#if !defined(WaitingTaskList_h) +#define WaitingTaskList_h +// +// Original Author: Chris Jones +// Created: Thu Feb 21 13:46:31 CST 2013 +// $Id$ +// + +// system include files +#include + +// user include files +#include "TaskBase.h" +#include "TaskHolder.h" + +// forward declarations + +namespace cce::tf { + class WaitingTaskList { + public: + explicit WaitingTaskList(unsigned int iInitialSize = 2); + WaitingTaskList(const WaitingTaskList&) = delete; // stop default + const WaitingTaskList& operator=(const WaitingTaskList&) = delete; // stop default + ~WaitingTaskList() = default; + + void add(oneapi::tbb::task_group*, TaskBase*); + void add(TaskHolder); + + ///Signals that the resource is now available and tasks should be spawned + /**The owner of the resource calls this function to allow the waiting tasks to + * start accessing it. + * If the task fails, a non 'null' std::exception_ptr should be used. + * To have tasks wait again one must call reset(). + * Calls to add() and doneWaiting() can safely be done concurrently. + */ + void doneWaiting(); + + ///Resets access to the resource so that added tasks will wait. + /**The owner of the resouce calls reset() to make tasks wait. + * Calling reset() is NOT thread safe. 
The system must guarantee that no tasks are + * using the resource when reset() is called and neither add() nor doneWaiting() can + * be called concurrently with reset(). + */ + void reset(); + + private: + /**Handles running the tasks, + * safe to call from multiple threads + */ + void announce(); + + struct WaitNode { + TaskBase* m_task; + oneapi::tbb::task_group* m_group; + std::atomic m_next; + bool m_fromCache; + + void setNextNode(WaitNode* iNext) { m_next = iNext; } + + WaitNode* nextNode() const { return m_next; } + }; + + WaitNode* createNode(oneapi::tbb::task_group* iGroup, TaskBase* iTask); + + // ---------- member data -------------------------------- + std::atomic m_head; + std::unique_ptr m_nodeCache; + unsigned int m_nodeCacheSize; + std::atomic m_lastAssignedCacheIndex; + std::atomic m_waiting; + }; +} // namespace cce::tf + +#endif From e5aaa54d2d5682e714338ac23b33fd1ea72f3194 Mon Sep 17 00:00:00 2001 From: Nick Smith Date: Tue, 12 Jul 2022 15:16:55 -0500 Subject: [PATCH 20/43] A somewhat working async implementation Some race is still around --- S3Common.cc | 223 ++++++++++++++++++++++++++++++-------------- S3Common.h | 50 +++------- S3Outputer.cc | 11 ++- S3Source.cc | 4 +- TaskHolder.h | 2 +- threaded_io_test.cc | 6 ++ 6 files changed, 182 insertions(+), 114 deletions(-) diff --git a/S3Common.cc b/S3Common.cc index 8d3c6db..20fdb7c 100644 --- a/S3Common.cc +++ b/S3Common.cc @@ -7,13 +7,54 @@ #include #include #include +#include #include "libs3.h" +#include "tbb/task_arena.h" +#include "tbb/task_group.h" #include "tbb/concurrent_queue.h" #include "S3Common.h" +#include "FunctorTask.h" -namespace cce::tf { +namespace { +using namespace cce::tf; + +class S3RequestWrapper { + public: + S3RequestWrapper(S3Request::Ptr iReq, const S3BucketContext* iCtx, S3Request::Callback iCb, tbb::task_group* iGroup): + req{std::move(iReq)}, bucketCtx{iCtx}, callback{iCb}, group{iGroup} + { + if ( group != nullptr ) { + arena = 
std::make_unique(tbb::task_arena::attach{}); + } + backoffTimeout = req->timeout.count(); + }; + + inline bool isAsync() const { return group != nullptr; }; + void done() { + if ( group == nullptr ) { + callback(std::move(req)); + } else { + std::cout << "Done request " << *req << std::endl; + // could not figure out how to capture the unique_ptr properly... so release and remake + auto task = [cb=std::move(callback), ptr=req.release()]() { + std::cout << "Callback request " << *ptr << std::endl; + cb(S3Request::Ptr(ptr)); + }; + arena->enqueue([group=group, task=std::move(task)]() { group->run(task); }); + } + }; + + S3Request::Ptr req; + const S3BucketContext* bucketCtx; + const S3Request::Callback callback; + tbb::task_group* group{nullptr}; + std::unique_ptr arena; + size_t put_offset{0}; + int retries_executed{0}; + long backoffTimeout; +}; class S3LibWrapper { public: @@ -26,23 +67,11 @@ class S3LibWrapper { bool running() const { return running_; } - void get(const S3BucketContext* bucketCtx, const std::string& key, S3Request::Callback&& cb, bool async=false) { - // start of S3Request lifecycle (s3lib will always call responseCompleteCallback) - auto req = new S3Request(S3Request::Type::get, bucketCtx, key, std::move(cb), async); - if ( async ) { + void submit(S3RequestWrapper* req) { + if ( req->isAsync() ) { requests_.push(req); } else { - submit(req, nullptr); - } - } - - void put(const S3BucketContext* bucketCtx, const std::string& key, std::string&& value, S3Request::Callback&& cb, bool async=false) { - // start of S3Request lifecycle (s3lib will always call responseCompleteCallback) - auto req = new S3Request(S3Request::Type::put, bucketCtx, key, std::move(cb), async, std::move(value)); - if ( async ) { - requests_.push(req); - } else { - submit(req, nullptr); + _submit(req, nullptr); } } @@ -65,51 +94,87 @@ class S3LibWrapper { void loop_body() { S3RequestContext * ctx; + fd_set read_fds, write_fds, except_fds; + int max_fd, activeRequests{0}; 
S3_create_request_context(&ctx); while(running_) { - // S3Status S3_get_request_context_fdsets(S3RequestContext *requestContext, fd_set *readFdSet, fd_set *writeFdSet, fd_set *exceptFdSet, int *maxFd); - // int64_t S3_get_request_context_timeout(S3RequestContext *requestContext); // milliseconds - // select() - std::this_thread::sleep_for(std::chrono::seconds(1)); - // S3Status S3_runonce_request_context(S3RequestContext *requestContext, int *requestsRemainingReturn); - - // S3Request* req; - // concurrency limit? - // while ( requests_.try_pop(req) ) { - // submit(req, ctx); - // } + FD_ZERO(&read_fds); + FD_ZERO(&write_fds); + FD_ZERO(&except_fds); + + switch (S3_get_request_context_fdsets(ctx, &read_fds, &write_fds, &except_fds, &max_fd)) { + case S3StatusOK: + break; + case S3StatusInternalError: + throw std::runtime_error("internal error in S3_get_request_context_fdsets"); + } + + if ( max_fd != -1 ) { + int64_t timeout = std::min(100l, S3_get_request_context_timeout(ctx)); // milliseconds + assert(timeout >= 0); + struct timeval tv { timeout / 1000, (timeout % 1000) * 1000 }; + select(max_fd+1, &read_fds, &write_fds, &except_fds, &tv); + } + + switch (S3_runonce_request_context(ctx, &activeRequests)) { + case S3StatusOK: + break; + case S3StatusConnectionFailed: + throw std::runtime_error("failed to connect in S3_runonce_request_context"); + case S3StatusServerFailedVerification: + throw std::runtime_error("SSL verification failure in S3_runonce_request_context"); + case S3StatusInternalError: + throw std::runtime_error("internal error in S3_runonce_request_context"); + case S3StatusOutOfMemory: + throw std::runtime_error("out of memory while processing S3_runonce_request_context"); + } + + std::string a(activeRequests, '#'); + std::cout << a << std::endl; + + S3RequestWrapper* req; + int currentlyActive{activeRequests}; + while ( (activeRequests < asyncRequestLimit_) and requests_.try_pop(req) and activeRequests < (currentlyActive+asyncAddRequestLimit_) ) { 
+ std::cout << "Adding request " << *(req->req) << std::endl; + _submit(req, ctx); + activeRequests++; + } + if ( activeRequests == 0 ) { + // TODO: would be better to use a semaphore (submit() and ~S3LibWrapper need to notify) + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + } } // TODO: this may abort requests in flight, should we wait? S3_destroy_request_context(ctx); } - void submit(S3Request* req, S3RequestContext* ctx) const { + void _submit(S3RequestWrapper* req, S3RequestContext* ctx) const { // this function will block if ctx is null - assert(req->async xor ctx == nullptr); - switch ( req->type ) { + assert(req->isAsync() xor ctx == nullptr); + switch ( req->req->type ) { case S3Request::Type::undef: assert(false); // logic error break; case S3Request::Type::get: S3_get_object( req->bucketCtx, - req->key.c_str(), + req->req->key.c_str(), nullptr, // S3GetConditions 0, // startByte 0, // byteCount ctx, - req->_timeout, + req->backoffTimeout, &S3LibWrapper::getObjectHandler, static_cast(req)); break; case S3Request::Type::put: S3_put_object( req->bucketCtx, - req->key.c_str(), - req->buffer.size(), + req->req->key.c_str(), + req->req->buffer.size(), nullptr, // S3PutProperties (TODO probably want .md5) ctx, - req->_timeout, + req->backoffTimeout, &S3LibWrapper::putObjectHandler, static_cast(req)); break; @@ -117,10 +182,10 @@ class S3LibWrapper { } static S3Status responsePropertiesCallback(const S3ResponseProperties *properties, void *callbackData) { - auto req = static_cast(callbackData); - if ( req->type == S3Request::Type::get ) { + auto req = static_cast(callbackData); + if ( req->req->type == S3Request::Type::get ) { if ( properties->contentLength > 0 ) { - req->buffer.reserve(properties->contentLength); + req->req->buffer.reserve(properties->contentLength); } // else what? // TODO: save headers? 
@@ -130,62 +195,62 @@ class S3LibWrapper { } static void responseCompleteCallback(S3Status status, const S3ErrorDetails *error, void *callbackData) { - auto req = static_cast(callbackData); - if ( S3_status_is_retryable(status) && req->_retries_executed < req->retries ) { + auto req = static_cast(callbackData); + if ( S3_status_is_retryable(status) && req->retries_executed < req->req->retries ) { if ( status == S3Status::S3StatusErrorRequestTimeout ) { // https://aws.amazon.com/blogs/architecture/exponential-backoff-and-jitter/ static thread_local std::minstd_rand rng(std::hash{}(std::this_thread::get_id())); - std::uniform_int_distribution dist(0l, std::min(S3Request::max_timeout.count(), req->_timeout)); + std::uniform_int_distribution dist(0l, std::min(S3Request::max_timeout.count(), req->backoffTimeout)); auto dt = std::chrono::milliseconds(dist(rng)); - if ( req->async ) { + if ( req->isAsync() ) { // TODO: async sleep by setting a future submit time and checking in loop_body } else { // TODO: better option? std::this_thread::sleep_for(dt); - req->_timeout *= 2; + req->backoffTimeout *= 2; } } else { - std::cerr << "Got status " << S3_get_status_name(status) << " while running request " << *req << ", retrying\n"; + std::cerr << "Got status " << S3_get_status_name(status) << " while running request " << *(req->req) << ", retrying\n"; } - req->_put_offset = 0; - req->_retries_executed++; - if ( req->async ) { + req->put_offset = 0; + req->retries_executed++; + if ( req->isAsync() ) { instance().requests_.push(req); } else { // can libs3 callbacks recurse? probably... - instance().submit(req, nullptr); + instance()._submit(req, nullptr); } return; // no delete! 
} switch ( status ) { case S3StatusOK: - req->status = S3Request::Status::ok; + req->req->status = S3Request::Status::ok; break; default: - req->status = S3Request::Status::error; + req->req->status = S3Request::Status::error; } - if ( req->callback ) req->callback(req); - // end of S3Request lifecycle (s3lib will always call responseCompleteCallback) + req->done(); + // end of S3RequestWrapper lifecycle delete req; } static int putObjectDataCallback(int bufferSize, char *buffer, void *callbackData) { - auto req = static_cast(callbackData); - int toWrite = std::min(bufferSize, (int) (req->buffer.size() - req->_put_offset)); + auto req = static_cast(callbackData); + int toWrite = std::min(bufferSize, (int) (req->req->buffer.size() - req->put_offset)); assert(toWrite >= 0); if ( toWrite > 0 ) { - std::copy_n(req->buffer.begin() + req->_put_offset, toWrite, buffer); - req->_put_offset += toWrite; + std::copy_n(req->req->buffer.begin() + req->put_offset, toWrite, buffer); + req->put_offset += toWrite; } // return > 0 = bytes written, 0 = done, -1 = S3StatusAbortedByCallback return toWrite; } static S3Status getObjectDataCallback(int bufferSize, const char *buffer, void *callbackData) { - auto req = static_cast(callbackData); - auto offset = req->buffer.size(); - req->buffer.resize(offset + bufferSize); // out of memory exception? - std::copy_n(buffer, bufferSize, req->buffer.begin() + offset); + auto req = static_cast(callbackData); + auto offset = req->req->buffer.size(); + req->req->buffer.resize(offset + bufferSize); // out of memory exception? 
+ std::copy_n(buffer, bufferSize, req->req->buffer.begin() + offset); return S3StatusOK; // can also return S3StatusAbortedByCallback } @@ -206,12 +271,18 @@ class S3LibWrapper { private: S3Status initStatus_; + int asyncRequestLimit_{256}; + int asyncAddRequestLimit_{10000}; // TODO: when this is a reasonable number, there's a race std::thread loop_; std::atomic running_; - // all callbackData pointers are to S3Request objects - tbb::concurrent_queue requests_; + // all callbackData pointers are to S3RequestWrapper objects + tbb::concurrent_queue requests_; }; +} // anon namespace + +namespace cce::tf { + std::ostream& operator<<(std::ostream& os, const S3Request& req) { os << "S3Request("; switch (req.type) { @@ -232,7 +303,7 @@ std::ostream& operator<<(std::ostream& os, const S3Request& req) { case S3Request::Status::error: os << "error"; break; } - os << "async=" << req.async << ") (put offset: " << req._put_offset << ", retries executed: " << req._retries_executed << ")"; + os << ")"; return os; } @@ -310,25 +381,35 @@ S3Connection::S3Connection( }); }; -void S3Connection::get(const std::string& key, S3Request::Callback&& cb) const { +void S3Connection::get(const std::string& key, tbb::task_group* group, S3Request::Callback&& cb) const { auto start = std::chrono::high_resolution_clock::now(); if ( ctx_ ) { - S3LibWrapper::instance().get(ctx_.get(), key, std::move(cb)); + auto req = std::make_unique(S3Request::Type::get, key); + // start of S3RequestWrapper lifecycle (ends in S3LibWrapper::responseCompleteCallback) + auto wrapper = new S3RequestWrapper(std::move(req), ctx_.get(), std::move(cb), group); + S3LibWrapper::instance().submit(wrapper); } else if ( cb ) { - S3Request dummy(S3Request::Type::get, key, S3Request::Status::error); - cb(&dummy); + auto dummy = std::make_unique(S3Request::Type::get, key); + dummy->status = S3Request::Status::error; + cb(std::move(dummy)); } auto time = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - 
start); blockingTime_ += time.count(); }; -void S3Connection::put(const std::string& key, std::string&& value, S3Request::Callback&& cb) const { +void S3Connection::put(const std::string& key, std::string&& value, tbb::task_group* group, S3Request::Callback&& cb) const { auto start = std::chrono::high_resolution_clock::now(); if ( ctx_ ) { - S3LibWrapper::instance().put(ctx_.get(), key, std::move(value), std::move(cb)); + auto req = std::make_unique(S3Request::Type::put, key); + req->buffer = std::move(value); + // start of S3RequestWrapper lifecycle (ends in S3LibWrapper::responseCompleteCallback) + auto wrapper = new S3RequestWrapper(std::move(req), ctx_.get(), std::move(cb), group); + S3LibWrapper::instance().submit(wrapper); } else if ( cb ) { - S3Request dummy(S3Request::Type::put, key, S3Request::Status::ok); - cb(&dummy); + auto dummy = std::make_unique(S3Request::Type::put, key); + dummy->buffer = std::move(value); + dummy->status = S3Request::Status::ok; + cb(std::move(dummy)); } auto time = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start); blockingTime_ += time.count(); diff --git a/S3Common.h b/S3Common.h index f48576c..4a3e328 100644 --- a/S3Common.h +++ b/S3Common.h @@ -9,7 +9,6 @@ struct S3BucketContext; namespace cce::tf { -class S3LibWrapper; class S3Connection; typedef std::shared_ptr S3ConnectionRef; @@ -17,47 +16,24 @@ class S3Request { public: enum class Type {undef, get, put}; enum class Status {waiting, ok, error}; - typedef std::function Callback; + typedef std::unique_ptr Ptr; + typedef std::function Callback; static constexpr std::chrono::milliseconds max_timeout{60000}; + S3Request() = delete; + S3Request(Type iType, const std::string& iKey, std::chrono::milliseconds iTimeout=std::chrono::milliseconds(1000), int iRetries=5): + type{iType}, key{iKey}, timeout{iTimeout}, retries{iRetries} {}; + const Type type; - const S3BucketContext* bucketCtx; const std::string key; - const Callback callback; - const 
std::chrono::milliseconds timeout{1000}; - const int retries{5}; - const bool async{false}; + const std::chrono::milliseconds timeout; + const int retries; std::string buffer; - Status status; + Status status{Status::waiting}; - private: - S3Request() = delete; - // constructor for devnull connection - S3Request(Type iType, const std::string& iKey, Status stat): - type{iType}, key{iKey}, status{stat} {}; - // get constructor - S3Request(Type iType, const S3BucketContext* iCtx, const std::string& iKey, Callback iCb, bool iAsync): - type{iType}, bucketCtx{iCtx}, key{iKey}, callback{iCb}, async{iAsync} - { - _timeout = timeout.count(); - }; - // put constructor - S3Request(Type iType, const S3BucketContext* iCtx, const std::string& iKey, Callback iCb, bool iAsync, std::string&& buf): - type{iType}, bucketCtx{iCtx}, key{iKey}, callback{iCb}, async{iAsync}, buffer{buf} - { - _timeout = timeout.count(); - }; - - size_t _put_offset{0}; - int _retries_executed{0}; - long _timeout; - - friend class S3LibWrapper; - friend class S3Connection; - friend std::ostream& operator<<(std::ostream& os, const S3Request& req); + friend std::ostream& operator<<(std::ostream& os, const S3Request& req); }; - class S3Connection { public: static S3ConnectionRef from_config(const std::string& filename); @@ -70,8 +46,10 @@ class S3Connection { std::string_view iSecurityToken ); - void get(const std::string& key, S3Request::Callback&& cb) const; - void put(const std::string& key, std::string&& value, S3Request::Callback&& cb) const; + // if group == nullptr, these functions execute synchronously + // else, the request will execute async and schedule the callback to run in the group when done + void get(const std::string& key, tbb::task_group* group, S3Request::Callback&& cb) const; + void put(const std::string& key, std::string&& value, tbb::task_group* group, S3Request::Callback&& cb) const; std::chrono::microseconds blockingTime() const { return 
std::chrono::microseconds(blockingTime_.load()); } private: diff --git a/S3Outputer.cc b/S3Outputer.cc index c056b12..0bae0da 100644 --- a/S3Outputer.cc +++ b/S3Outputer.cc @@ -147,8 +147,9 @@ void S3Outputer::outputAsync(unsigned int iLaneIndex, EventIdentifier const& iEv void S3Outputer::printSummary() const { { tbb::task_group group; + std::atomic busy{true}; { - TaskHolder th(group, make_functor_task([](){})); + TaskHolder th(group, make_functor_task([&busy](){ busy = false; })); TaskHolder productsDone(group, make_functor_task( [this, stripeOut=std::move(currentEventStripe_), callback=std::move(th)]() mutable { flushQueue_.push(*callback.group(), [this, stripeOut=std::move(stripeOut), callback=std::move(callback)]() { @@ -162,6 +163,8 @@ void S3Outputer::printSummary() const { }); } } + do { group.wait(); } + while ( busy ); group.wait(); } @@ -291,9 +294,9 @@ void S3Outputer::appendProductBuffer( [this, name=std::move(name), pOut=std::move(pOut), callback=std::move(iCallback)]() { std::string finalbuf; pOut.SerializeToString(&finalbuf); - conn_->put(name, std::move(finalbuf), [name=std::move(name), callback=std::move(callback)](S3Request* req) { + conn_->put(name, std::move(finalbuf), callback.group(), [name=std::move(name), callback=std::move(callback)](S3Request::Ptr req) { if ( req->status != S3Request::Status::ok ) { - std::cerr << "failed to write product buffer " << name << std::endl; + std::cerr << "failed to write product buffer " << name << *req << std::endl; } }); } @@ -325,7 +328,7 @@ void S3Outputer::flushEventStripe(const objstripe::EventStripe& stripe, TaskHold [this, idxcopy=index_, callback=std::move(iCallback)]() { std::string indexOut; idxcopy.SerializeToString(&indexOut); - conn_->put(objPrefix_ + "index", std::move(indexOut), [callback=std::move(callback)](S3Request* req) { + conn_->put(objPrefix_ + "index", std::move(indexOut), callback.group(), [callback=std::move(callback)](S3Request::Ptr req) { if ( req->status != S3Request::Status::ok 
) { std::cerr << "failed to write product buffer index" << std::endl; } diff --git a/S3Source.cc b/S3Source.cc index e2bc524..73bffe3 100644 --- a/S3Source.cc +++ b/S3Source.cc @@ -45,7 +45,7 @@ void decompress_stripe(const objstripe::Compression& setting, std::string& blob, void DelayedProductStripeRetriever::fetch(TaskHolder&& callback) const { auto this_state{State::unretrieved}; if ( state_.compare_exchange_strong(this_state, State::retrieving) ) { - conn_->get(name_, [this, callback=std::move(callback)](S3Request* req) mutable { + conn_->get(name_, callback.group(), [this, callback=std::move(callback)](S3Request::Ptr req) mutable { if ( req->status == S3Request::Status::ok ) { auto start = std::chrono::high_resolution_clock::now(); if ( not data_.ParseFromString(req->buffer) ) { @@ -139,7 +139,7 @@ S3Source::S3Source(unsigned int iNLanes, std::string iObjPrefix, int iVerbose, u { auto start = std::chrono::high_resolution_clock::now(); - conn->get(objPrefix_ + "index", [this](S3Request* req) mutable { + conn->get(objPrefix_ + "index", nullptr, [this](S3Request::Ptr req) mutable { if ( req->status == S3Request::Status::ok ) { if ( not index_.ParseFromString(req->buffer) ) { throw std::runtime_error("Could not deserialize index in S3Source construction"); diff --git a/TaskHolder.h b/TaskHolder.h index d672a4d..024944a 100644 --- a/TaskHolder.h +++ b/TaskHolder.h @@ -54,7 +54,7 @@ class TaskHolder { TaskHolder& operator=(TaskHolder const&) = delete; TaskHolder& operator=(TaskHolder&&) = delete; - tbb::task_group* group() { return group_;} + tbb::task_group* group() const { return group_;} void doneWaiting() { auto t = task_; task_ = nullptr; diff --git a/threaded_io_test.cc b/threaded_io_test.cc index b8ebe94..96d7f19 100644 --- a/threaded_io_test.cc +++ b/threaded_io_test.cc @@ -127,6 +127,10 @@ int main(int argc, char* argv[]) { group.run([&]() { lane.processEventsAsync(ievt, group, *pOut, AtomicRefCounter(count)); }); + do { + group.wait(); + 
std::this_thread::sleep_for(std::chrono::milliseconds(10)); + } while(count != 0); group.wait(); }); } @@ -171,6 +175,8 @@ int main(int argc, char* argv[]) { for(auto& group: groups) { group.wait(); } + // std::this_thread::yield(); + std::this_thread::sleep_for(std::chrono::milliseconds(10)); } while(nLanesWaiting != 0); //be sure all groups have fully finished for(auto& group: groups) { From 4affc33a8277310848eb740574e81e64e6d97ea6 Mon Sep 17 00:00:00 2001 From: Nick Smith Date: Tue, 12 Jul 2022 17:31:57 -0500 Subject: [PATCH 21/43] Use task_group::defer for final task --- Lane.cc | 12 ++++++------ Lane.h | 5 ++--- threaded_io_test.cc | 23 +++++++---------------- 3 files changed, 15 insertions(+), 25 deletions(-) diff --git a/Lane.cc b/Lane.cc index b8267c4..4e2c93d 100644 --- a/Lane.cc +++ b/Lane.cc @@ -11,8 +11,8 @@ Lane::Lane(unsigned int iIndex, SharedSourceBase* iSource, WaiterBase const* iWa } void Lane::processEventsAsync(std::atomic& index, tbb::task_group& group, const OutputerBase& outputer, - AtomicRefCounter counter) { - doNextEvent(index, group, outputer, std::move(counter)); + TaskHolder finalTask) { + doNextEvent(index, group, outputer, std::move(finalTask)); } @@ -61,7 +61,7 @@ void Lane::processEventAsync(tbb::task_group& group, TaskHolder iCallback, const } } -void Lane::doNextEvent(std::atomic& index, tbb::task_group& group, const OutputerBase& outputer, AtomicRefCounter counter) { +void Lane::doNextEvent(std::atomic& index, tbb::task_group& group, const OutputerBase& outputer, TaskHolder finalTask) { using namespace std::string_literals; presentEventIndex_ = index++; if(source_->mayBeAbleToGoToEvent(presentEventIndex_)) { @@ -69,9 +69,9 @@ void Lane::doNextEvent(std::atomic& index, tbb::task_group& group, const std::cout <<"event "+std::to_string(presentEventIndex_)+"\n"<& index, tbb::task_group& group, const OutputerBase& outputer, AtomicRefCounter); + void processEventsAsync(std::atomic& index, tbb::task_group& group, const OutputerBase& 
outputer, TaskHolder finalTask); void setVerbose(bool iSet) { verbose_ = iSet; } @@ -34,7 +33,7 @@ class Lane { void processEventAsync(tbb::task_group& group, TaskHolder iCallback, const OutputerBase& outputer); void doNextEvent(std::atomic& index, tbb::task_group& group, const OutputerBase& outputer, - AtomicRefCounter counter); + TaskHolder finalTask); SharedSourceBase* source_; WaiterBase const* waiter_; diff --git a/threaded_io_test.cc b/threaded_io_test.cc index 96d7f19..d9a60e9 100644 --- a/threaded_io_test.cc +++ b/threaded_io_test.cc @@ -10,11 +10,14 @@ #include "CLI11.hpp" +#define TBB_PREVIEW_TASK_GROUP_EXTENSIONS 1 // for task_group::defer + #include "outputerFactoryGenerator.h" #include "sourceFactoryGenerator.h" #include "waiterFactoryGenerator.h" #include "Lane.h" +#include "FunctorTask.h" #include "tbb/task_group.h" #include "tbb/global_control.h" @@ -123,14 +126,10 @@ int main(int argc, char* argv[]) { arena.execute([&lane,pOut]() { tbb::task_group group; std::atomic ievt{0}; - std::atomic count{0}; + TaskHolder finalTask(group, make_functor_task([&group, task=group.defer([](){})]() mutable { group.run(std::move(task)); })); group.run([&]() { - lane.processEventsAsync(ievt, group, *pOut, AtomicRefCounter(count)); + lane.processEventsAsync(ievt, group, *pOut, std::move(finalTask)); }); - do { - group.wait(); - std::this_thread::sleep_for(std::chrono::milliseconds(10)); - } while(count != 0); group.wait(); }); } @@ -159,25 +158,17 @@ int main(int argc, char* argv[]) { decltype(std::chrono::high_resolution_clock::now()) start; auto pOut = out.get(); arena.execute([&lanes, &ievt, pOut, &start]() { - std::atomic nLanesWaiting{ 0 }; std::vector groups(lanes.size()); start = std::chrono::high_resolution_clock::now(); auto itGroup = groups.begin(); { - AtomicRefCounter laneCounter(nLanesWaiting); for(auto& lane: lanes) { auto& group = *itGroup; - group.run([&, laneCounter]() {lane.processEventsAsync(ievt,group, *pOut,laneCounter);}); + TaskHolder 
finalTask(group, make_functor_task([&group, task=group.defer([](){})]() mutable { group.run(std::move(task)); })); + group.run([&]() {lane.processEventsAsync(ievt, group, *pOut, std::move(finalTask));}); ++itGroup; } } - do { - for(auto& group: groups) { - group.wait(); - } - // std::this_thread::yield(); - std::this_thread::sleep_for(std::chrono::milliseconds(10)); - } while(nLanesWaiting != 0); //be sure all groups have fully finished for(auto& group: groups) { group.wait(); From 891d88a011c008d647534d0210e56e783e328474 Mon Sep 17 00:00:00 2001 From: Nick Smith Date: Tue, 12 Jul 2022 17:32:21 -0500 Subject: [PATCH 22/43] Use task_group::defer also for S3 stuff --- S3Common.cc | 10 ++-------- S3Outputer.cc | 7 ++----- S3Outputer.h | 2 ++ S3Source.cc | 4 +++- 4 files changed, 9 insertions(+), 14 deletions(-) diff --git a/S3Common.cc b/S3Common.cc index 20fdb7c..36899d0 100644 --- a/S3Common.cc +++ b/S3Common.cc @@ -36,10 +36,8 @@ class S3RequestWrapper { if ( group == nullptr ) { callback(std::move(req)); } else { - std::cout << "Done request " << *req << std::endl; // could not figure out how to capture the unique_ptr properly... 
so release and remake auto task = [cb=std::move(callback), ptr=req.release()]() { - std::cout << "Callback request " << *ptr << std::endl; cb(S3Request::Ptr(ptr)); }; arena->enqueue([group=group, task=std::move(task)]() { group->run(task); }); @@ -129,13 +127,9 @@ class S3LibWrapper { throw std::runtime_error("out of memory while processing S3_runonce_request_context"); } - std::string a(activeRequests, '#'); - std::cout << a << std::endl; - S3RequestWrapper* req; int currentlyActive{activeRequests}; while ( (activeRequests < asyncRequestLimit_) and requests_.try_pop(req) and activeRequests < (currentlyActive+asyncAddRequestLimit_) ) { - std::cout << "Adding request " << *(req->req) << std::endl; _submit(req, ctx); activeRequests++; } @@ -144,7 +138,7 @@ class S3LibWrapper { std::this_thread::sleep_for(std::chrono::milliseconds(10)); } } - // TODO: this may abort requests in flight, should we wait? + // TODO: this may abort requests in flight, do we wait or is it synchronous? S3_destroy_request_context(ctx); } @@ -272,7 +266,7 @@ class S3LibWrapper { private: S3Status initStatus_; int asyncRequestLimit_{256}; - int asyncAddRequestLimit_{10000}; // TODO: when this is a reasonable number, there's a race + int asyncAddRequestLimit_{16}; // TODO: when this is a reasonable number, there's a race std::thread loop_; std::atomic running_; // all callbackData pointers are to S3RequestWrapper objects diff --git a/S3Outputer.cc b/S3Outputer.cc index 0bae0da..cf567d5 100644 --- a/S3Outputer.cc +++ b/S3Outputer.cc @@ -147,11 +147,10 @@ void S3Outputer::outputAsync(unsigned int iLaneIndex, EventIdentifier const& iEv void S3Outputer::printSummary() const { { tbb::task_group group; - std::atomic busy{true}; { - TaskHolder th(group, make_functor_task([&busy](){ busy = false; })); + TaskHolder finalTask(group, make_functor_task([&group, task=group.defer([](){})]() mutable { group.run(std::move(task)); })); TaskHolder productsDone(group, make_functor_task( - [this, 
stripeOut=std::move(currentEventStripe_), callback=std::move(th)]() mutable { + [this, stripeOut=std::move(currentEventStripe_), callback=std::move(finalTask)]() mutable { flushQueue_.push(*callback.group(), [this, stripeOut=std::move(stripeOut), callback=std::move(callback)]() { flushEventStripe(stripeOut, std::move(callback), true); }); @@ -163,8 +162,6 @@ void S3Outputer::printSummary() const { }); } } - do { group.wait(); } - while ( busy ); group.wait(); } diff --git a/S3Outputer.h b/S3Outputer.h index 92d7773..9c3b320 100644 --- a/S3Outputer.h +++ b/S3Outputer.h @@ -7,6 +7,8 @@ #include #include "zstd.h" + +#define TBB_PREVIEW_TASK_GROUP_EXTENSIONS 1 // for task_group::defer #include "tbb/task_group.h" #include "OutputerBase.h" diff --git a/S3Source.cc b/S3Source.cc index 73bffe3..3255b65 100644 --- a/S3Source.cc +++ b/S3Source.cc @@ -63,6 +63,7 @@ void DelayedProductStripeRetriever::fetch(TaskHolder&& callback) const { assert(nbytes == content_.size()); decompressTime_ = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start); state_ = State::retrieved; + std::cout << "retrieved ProductStripe " + name_ << std::endl; callback.doneWaiting(); waiters_.doneWaiting(); } @@ -121,6 +122,7 @@ void S3DelayedRetriever::getAsync(DataProductRetriever& product, int index, Task TaskHolder fetchCallback(*callback.group(), make_functor_task( [this, index, callback=std::move(callback)]() mutable { auto start = std::chrono::high_resolution_clock::now(); + std::cout << "deserialize buffer " + dataProducts_[index].name() + std::to_string(globalEventIndex_) << std::endl; auto buf = stripes_[index]->bufferAt(globalEventIndex_); auto readSize = deserializers_[index].deserialize(buf.data(), buf.size(), *dataProducts_[index].address()); dataProducts_[index].setSize(readSize); @@ -203,7 +205,7 @@ void S3Source::readEventAsync(unsigned int iLane, long iEventIndex, OptionalTask nextEventInStripe_ = 0; } const auto event = 
currentEventStripe_.events(nextEventInStripe_); - if ( verbose_ >= 2 ) std::cout << event.DebugString() << "\n"; + if ( verbose_ >= 1 ) std::cout << event.DebugString() << "\n"; auto& retriever = laneRetrievers_[iLane]; size_t globalEventIndex = event.offset(); From 8c773852449bdbd5821b1ea11b256cd78054543d Mon Sep 17 00:00:00 2001 From: Nick Smith Date: Tue, 12 Jul 2022 19:21:49 -0500 Subject: [PATCH 23/43] bugs squashed --- S3Common.cc | 6 +++++- S3Source.cc | 2 -- threaded_io_test.cc | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/S3Common.cc b/S3Common.cc index 36899d0..2204002 100644 --- a/S3Common.cc +++ b/S3Common.cc @@ -129,7 +129,11 @@ class S3LibWrapper { S3RequestWrapper* req; int currentlyActive{activeRequests}; - while ( (activeRequests < asyncRequestLimit_) and requests_.try_pop(req) and activeRequests < (currentlyActive+asyncAddRequestLimit_) ) { + while ( + (activeRequests < asyncRequestLimit_) + and activeRequests < (currentlyActive+asyncAddRequestLimit_) + and requests_.try_pop(req) // test this last! 
+ ) { _submit(req, ctx); activeRequests++; } diff --git a/S3Source.cc b/S3Source.cc index 3255b65..9b3c9d3 100644 --- a/S3Source.cc +++ b/S3Source.cc @@ -63,7 +63,6 @@ void DelayedProductStripeRetriever::fetch(TaskHolder&& callback) const { assert(nbytes == content_.size()); decompressTime_ = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start); state_ = State::retrieved; - std::cout << "retrieved ProductStripe " + name_ << std::endl; callback.doneWaiting(); waiters_.doneWaiting(); } @@ -122,7 +121,6 @@ void S3DelayedRetriever::getAsync(DataProductRetriever& product, int index, Task TaskHolder fetchCallback(*callback.group(), make_functor_task( [this, index, callback=std::move(callback)]() mutable { auto start = std::chrono::high_resolution_clock::now(); - std::cout << "deserialize buffer " + dataProducts_[index].name() + std::to_string(globalEventIndex_) << std::endl; auto buf = stripes_[index]->bufferAt(globalEventIndex_); auto readSize = deserializers_[index].deserialize(buf.data(), buf.size(), *dataProducts_[index].address()); dataProducts_[index].setSize(readSize); diff --git a/threaded_io_test.cc b/threaded_io_test.cc index d9a60e9..fa01ded 100644 --- a/threaded_io_test.cc +++ b/threaded_io_test.cc @@ -165,7 +165,7 @@ int main(int argc, char* argv[]) { for(auto& lane: lanes) { auto& group = *itGroup; TaskHolder finalTask(group, make_functor_task([&group, task=group.defer([](){})]() mutable { group.run(std::move(task)); })); - group.run([&]() {lane.processEventsAsync(ievt, group, *pOut, std::move(finalTask));}); + group.run([&, ft=std::move(finalTask)]() {lane.processEventsAsync(ievt, group, *pOut, std::move(ft));}); ++itGroup; } } From 58678438b1a9672a9a0526df27524a3a7478f6e3 Mon Sep 17 00:00:00 2001 From: Nick Smith Date: Tue, 12 Jul 2022 19:42:43 -0500 Subject: [PATCH 24/43] No need to schedule puts now that they don't block --- S3Outputer.cc | 34 +++++++++++++--------------------- S3Source.cc | 2 +- TaskHolder.h | 2 +- 3 files 
changed, 15 insertions(+), 23 deletions(-) diff --git a/S3Outputer.cc b/S3Outputer.cc index cf567d5..07bf68e 100644 --- a/S3Outputer.cc +++ b/S3Outputer.cc @@ -287,17 +287,13 @@ void S3Outputer::appendProductBuffer( pOut.set_allocated_compression(new objstripe::Compression(buf.compressor_.getCompression())); std::string name = buf.prefix_; name += std::to_string(pOut.globaloffset()); - iCallback.group()->run( - [this, name=std::move(name), pOut=std::move(pOut), callback=std::move(iCallback)]() { - std::string finalbuf; - pOut.SerializeToString(&finalbuf); - conn_->put(name, std::move(finalbuf), callback.group(), [name=std::move(name), callback=std::move(callback)](S3Request::Ptr req) { - if ( req->status != S3Request::Status::ok ) { - std::cerr << "failed to write product buffer " << name << *req << std::endl; - } - }); - } - ); + std::string finalbuf; + pOut.SerializeToString(&finalbuf); + conn_->put(name, std::move(finalbuf), iCallback.group(), [name=std::move(name), callback=std::move(iCallback)](S3Request::Ptr req) { + if ( req->status != S3Request::Status::ok ) { + std::cerr << "failed to write product buffer " << name << *req << std::endl; + } + }); if ( buf.info_->flushsize() == 0 ) { // only modification to info_, done inside serial appendQueue_ buf.info_->set_flushsize(bufferNevents); @@ -320,16 +316,12 @@ void S3Outputer::flushEventStripe(const objstripe::EventStripe& stripe, TaskHold } // TODO: checkpoint only every few event stripes? 
- iCallback.group()->run( - // bind shallow copy of index_ to ensure validity - [this, idxcopy=index_, callback=std::move(iCallback)]() { - std::string indexOut; - idxcopy.SerializeToString(&indexOut); - conn_->put(objPrefix_ + "index", std::move(indexOut), callback.group(), [callback=std::move(callback)](S3Request::Ptr req) { - if ( req->status != S3Request::Status::ok ) { - std::cerr << "failed to write product buffer index" << std::endl; - } - }); + std::string indexOut; + index_.SerializeToString(&indexOut); + conn_->put(objPrefix_ + "index", std::move(indexOut), iCallback.group(), [callback=std::move(iCallback)](S3Request::Ptr req) { + if ( req->status != S3Request::Status::ok ) { + std::cerr << "failed to write product buffer index" << std::endl; + } }); flushTime_ += std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start); } diff --git a/S3Source.cc b/S3Source.cc index 9b3c9d3..1c149a3 100644 --- a/S3Source.cc +++ b/S3Source.cc @@ -139,7 +139,7 @@ S3Source::S3Source(unsigned int iNLanes, std::string iObjPrefix, int iVerbose, u { auto start = std::chrono::high_resolution_clock::now(); - conn->get(objPrefix_ + "index", nullptr, [this](S3Request::Ptr req) mutable { + conn_->get(objPrefix_ + "index", nullptr, [this](S3Request::Ptr req) mutable { if ( req->status == S3Request::Status::ok ) { if ( not index_.ParseFromString(req->buffer) ) { throw std::runtime_error("Could not deserialize index in S3Source construction"); diff --git a/TaskHolder.h b/TaskHolder.h index 024944a..d672a4d 100644 --- a/TaskHolder.h +++ b/TaskHolder.h @@ -54,7 +54,7 @@ class TaskHolder { TaskHolder& operator=(TaskHolder const&) = delete; TaskHolder& operator=(TaskHolder&&) = delete; - tbb::task_group* group() const { return group_;} + tbb::task_group* group() { return group_;} void doneWaiting() { auto t = task_; task_ = nullptr; From c47512b8b347539c13cfbc57d599ae7291f93870 Mon Sep 17 00:00:00 2001 From: Nick Smith Date: Wed, 13 Jul 2022 12:41:38 -0500 Subject: 
[PATCH 25/43] Migrate S3 callbacks to FunctorTask --- S3Common.cc | 67 +++++++++++++++++---------------------------------- S3Common.h | 9 +++---- S3Outputer.cc | 40 +++++++++++++++--------------- S3Outputer.h | 5 +++- S3Source.cc | 34 +++++++++++++++++--------- S3Source.h | 4 +-- TaskHolder.h | 31 ++++++++++++++---------- 7 files changed, 90 insertions(+), 100 deletions(-) diff --git a/S3Common.cc b/S3Common.cc index 2204002..d774d6a 100644 --- a/S3Common.cc +++ b/S3Common.cc @@ -11,10 +11,8 @@ #include "libs3.h" #include "tbb/task_arena.h" -#include "tbb/task_group.h" #include "tbb/concurrent_queue.h" #include "S3Common.h" -#include "FunctorTask.h" namespace { @@ -22,32 +20,27 @@ using namespace cce::tf; class S3RequestWrapper { public: - S3RequestWrapper(S3Request::Ptr iReq, const S3BucketContext* iCtx, S3Request::Callback iCb, tbb::task_group* iGroup): - req{std::move(iReq)}, bucketCtx{iCtx}, callback{iCb}, group{iGroup} + S3RequestWrapper(std::shared_ptr iReq, const S3BucketContext* iCtx, TaskHolder&& iCallback, bool iAsync): + req{std::move(iReq)}, bucketCtx{iCtx}, callback{std::move(iCallback)}, async{iAsync} { - if ( group != nullptr ) { + if ( async ) { arena = std::make_unique(tbb::task_arena::attach{}); } backoffTimeout = req->timeout.count(); }; - inline bool isAsync() const { return group != nullptr; }; void done() { - if ( group == nullptr ) { - callback(std::move(req)); + if ( async ) { + arena->enqueue([callback=std::move(callback)]() { }); } else { - // could not figure out how to capture the unique_ptr properly... 
so release and remake - auto task = [cb=std::move(callback), ptr=req.release()]() { - cb(S3Request::Ptr(ptr)); - }; - arena->enqueue([group=group, task=std::move(task)]() { group->run(task); }); + callback.doneWaiting(); } }; - S3Request::Ptr req; + std::shared_ptr req; const S3BucketContext* bucketCtx; - const S3Request::Callback callback; - tbb::task_group* group{nullptr}; + TaskHolder callback; + const bool async; std::unique_ptr arena; size_t put_offset{0}; int retries_executed{0}; @@ -66,7 +59,7 @@ class S3LibWrapper { bool running() const { return running_; } void submit(S3RequestWrapper* req) { - if ( req->isAsync() ) { + if ( req->async ) { requests_.push(req); } else { _submit(req, nullptr); @@ -148,7 +141,7 @@ class S3LibWrapper { void _submit(S3RequestWrapper* req, S3RequestContext* ctx) const { // this function will block if ctx is null - assert(req->isAsync() xor ctx == nullptr); + assert(req->async xor ctx == nullptr); switch ( req->req->type ) { case S3Request::Type::undef: assert(false); // logic error @@ -200,7 +193,7 @@ class S3LibWrapper { static thread_local std::minstd_rand rng(std::hash{}(std::this_thread::get_id())); std::uniform_int_distribution dist(0l, std::min(S3Request::max_timeout.count(), req->backoffTimeout)); auto dt = std::chrono::milliseconds(dist(rng)); - if ( req->isAsync() ) { + if ( req->async ) { // TODO: async sleep by setting a future submit time and checking in loop_body } else { // TODO: better option? @@ -212,7 +205,7 @@ class S3LibWrapper { } req->put_offset = 0; req->retries_executed++; - if ( req->isAsync() ) { + if ( req->async ) { instance().requests_.push(req); } else { // can libs3 callbacks recurse? probably... 
@@ -379,35 +372,19 @@ S3Connection::S3Connection( }); }; -void S3Connection::get(const std::string& key, tbb::task_group* group, S3Request::Callback&& cb) const { +void S3Connection::submit(std::shared_ptr req, TaskHolder&& callback, bool async) const { auto start = std::chrono::high_resolution_clock::now(); if ( ctx_ ) { - auto req = std::make_unique(S3Request::Type::get, key); // start of S3RequestWrapper lifecycle (ends in S3LibWrapper::responseCompleteCallback) - auto wrapper = new S3RequestWrapper(std::move(req), ctx_.get(), std::move(cb), group); + auto wrapper = new S3RequestWrapper(std::move(req), ctx_.get(), std::move(callback), async); S3LibWrapper::instance().submit(wrapper); - } else if ( cb ) { - auto dummy = std::make_unique(S3Request::Type::get, key); - dummy->status = S3Request::Status::error; - cb(std::move(dummy)); - } - auto time = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start); - blockingTime_ += time.count(); -}; - -void S3Connection::put(const std::string& key, std::string&& value, tbb::task_group* group, S3Request::Callback&& cb) const { - auto start = std::chrono::high_resolution_clock::now(); - if ( ctx_ ) { - auto req = std::make_unique(S3Request::Type::put, key); - req->buffer = std::move(value); - // start of S3RequestWrapper lifecycle (ends in S3LibWrapper::responseCompleteCallback) - auto wrapper = new S3RequestWrapper(std::move(req), ctx_.get(), std::move(cb), group); - S3LibWrapper::instance().submit(wrapper); - } else if ( cb ) { - auto dummy = std::make_unique(S3Request::Type::put, key); - dummy->buffer = std::move(value); - dummy->status = S3Request::Status::ok; - cb(std::move(dummy)); + } else { + if ( req->type == S3Request::Type::put ) { + req->status = S3Request::Status::ok; + } else { + req->status = S3Request::Status::error; + } + callback.doneWaiting(); } auto time = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start); blockingTime_ += time.count(); diff --git 
a/S3Common.h b/S3Common.h index 4a3e328..5944454 100644 --- a/S3Common.h +++ b/S3Common.h @@ -5,6 +5,8 @@ #include #include +#include "TaskHolder.h" + // libs3.h struct S3BucketContext; @@ -16,8 +18,6 @@ class S3Request { public: enum class Type {undef, get, put}; enum class Status {waiting, ok, error}; - typedef std::unique_ptr Ptr; - typedef std::function Callback; static constexpr std::chrono::milliseconds max_timeout{60000}; S3Request() = delete; @@ -46,10 +46,7 @@ class S3Connection { std::string_view iSecurityToken ); - // if group == nullptr, these functions execute synchronously - // else, the request will execute async and schedule the callback to run in the group when done - void get(const std::string& key, tbb::task_group* group, S3Request::Callback&& cb) const; - void put(const std::string& key, std::string&& value, tbb::task_group* group, S3Request::Callback&& cb) const; + void submit(std::shared_ptr req, TaskHolder&& callback, bool async) const; std::chrono::microseconds blockingTime() const { return std::chrono::microseconds(blockingTime_.load()); } private: diff --git a/S3Outputer.cc b/S3Outputer.cc index 07bf68e..2d56534 100644 --- a/S3Outputer.cc +++ b/S3Outputer.cc @@ -202,14 +202,16 @@ void S3Outputer::collateProducts( sev->set_event(iEventID.event); if (verbose_ >= 2) { std::cout << sev->DebugString(); } - TaskHolder productsDoneCallback([this, cb=std::move(iCallback)]() mutable { + TaskHolder productsDoneCallback( + // make lambda and call, since move assignment is disabled + [this, cb=std::move(iCallback)]() mutable { if ( currentEventStripe_.events_size() == eventFlushSize_ ) { - if(verbose_ >= 2) { std::cout << "reached event flush size "s + std::to_string(eventFlushSize_) + ", flushing\n"; } objstripe::EventStripe stripeOut; stripeOut.mutable_events()->Reserve(eventFlushSize_); std::swap(currentEventStripe_, stripeOut); return TaskHolder(*cb.group(), make_functor_task( [this, stripeOut=std::move(stripeOut), callback=std::move(cb)]() 
mutable { + if(verbose_ >= 2) { std::cout << "reached event flush size "s + std::to_string(eventFlushSize_) + ", flushing\n"; } flushQueue_.push(*callback.group(), [this, stripeOut=std::move(stripeOut), callback=std::move(callback)]() { flushEventStripe(stripeOut, std::move(callback)); }); @@ -277,23 +279,20 @@ void S3Outputer::appendProductBuffer( + " is full ("s + std::to_string(bufferNbytes) + " bytes, "s + std::to_string(bufferNevents) + " events), flushing\n"; } - objstripe::ProductStripe pOut; - pOut.mutable_counts()->Reserve(bufferNevents); - pOut.mutable_content()->reserve(bufferNbytes); - pOut.set_globaloffset(buf.stripe_.globaloffset() + bufferNevents); - std::swap(buf.stripe_, pOut); - - pOut.set_allocated_compression(new objstripe::Compression(buf.compressor_.getCompression())); - std::string name = buf.prefix_; - name += std::to_string(pOut.globaloffset()); - std::string finalbuf; - pOut.SerializeToString(&finalbuf); - conn_->put(name, std::move(finalbuf), iCallback.group(), [name=std::move(name), callback=std::move(iCallback)](S3Request::Ptr req) { + std::string name = buf.prefix_ + std::to_string(buf.stripe_.globaloffset()); + auto req = std::make_shared(S3Request::Type::put, name); + buf.stripe_.SerializeToString(&req->buffer); + auto putDoneTask = TaskHolder(*iCallback.group(), make_functor_task([req, callback=std::move(iCallback)]() { if ( req->status != S3Request::Status::ok ) { - std::cerr << "failed to write product buffer " << name << *req << std::endl; + std::cerr << "failed to write product buffer " << *req << std::endl; } - }); + })); + conn_->submit(std::move(req), std::move(putDoneTask), true); + + buf.stripe_.clear_counts(); + buf.stripe_.clear_content(); + buf.stripe_.set_globaloffset(buf.stripe_.globaloffset() + bufferNevents); if ( buf.info_->flushsize() == 0 ) { // only modification to info_, done inside serial appendQueue_ buf.info_->set_flushsize(bufferNevents); @@ -316,13 +315,14 @@ void S3Outputer::flushEventStripe(const 
objstripe::EventStripe& stripe, TaskHold } // TODO: checkpoint only every few event stripes? - std::string indexOut; - index_.SerializeToString(&indexOut); - conn_->put(objPrefix_ + "index", std::move(indexOut), iCallback.group(), [callback=std::move(iCallback)](S3Request::Ptr req) { + auto req = std::make_shared(S3Request::Type::put, objPrefix_ + "index"); + index_.SerializeToString(&req->buffer); + auto putDoneTask = TaskHolder(*iCallback.group(), make_functor_task([req, callback=std::move(iCallback)]() { if ( req->status != S3Request::Status::ok ) { std::cerr << "failed to write product buffer index" << std::endl; } - }); + })); + conn_->submit(std::move(req), std::move(putDoneTask), true); flushTime_ += std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start); } diff --git a/S3Outputer.h b/S3Outputer.h index 9c3b320..c4963e7 100644 --- a/S3Outputer.h +++ b/S3Outputer.h @@ -67,7 +67,10 @@ class S3Outputer : public OutputerBase { private: struct ProductOutputBuffer { ProductOutputBuffer(const std::string& prefix, objstripe::ProductInfo* info, const objstripe::Compression& comp) : - prefix_{prefix}, info_{info}, compressor_{comp} {}; + prefix_{prefix}, info_{info}, compressor_{comp} { + stripe_.set_content(""); + stripe_.set_allocated_compression(new objstripe::Compression(compressor_.getCompression())); + }; const std::string prefix_; objstripe::ProductInfo* info_; // owned by index_ diff --git a/S3Source.cc b/S3Source.cc index 1c149a3..4722f87 100644 --- a/S3Source.cc +++ b/S3Source.cc @@ -45,7 +45,10 @@ void decompress_stripe(const objstripe::Compression& setting, std::string& blob, void DelayedProductStripeRetriever::fetch(TaskHolder&& callback) const { auto this_state{State::unretrieved}; if ( state_.compare_exchange_strong(this_state, State::retrieving) ) { - conn_->get(name_, callback.group(), [this, callback=std::move(callback)](S3Request::Ptr req) mutable { + auto req = std::make_shared(S3Request::Type::get, name_); + auto group = 
callback.group(); + waiters_.add(std::move(callback)); + auto getDoneTask = TaskHolder(*group, make_functor_task([this, req]() { if ( req->status == S3Request::Status::ok ) { auto start = std::chrono::high_resolution_clock::now(); if ( not data_.ParseFromString(req->buffer) ) { @@ -61,13 +64,13 @@ void DelayedProductStripeRetriever::fetch(TaskHolder&& callback) const { assert(offsets_.size() == data_.counts_size() + 1); ::decompress_stripe(data_.compression(), *data_.mutable_content(), content_, nbytes); assert(nbytes == content_.size()); + data_.clear_content(); decompressTime_ = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start); state_ = State::retrieved; - callback.doneWaiting(); waiters_.doneWaiting(); - } - else { throw std::runtime_error("Could not retrieve ProductStripe for key " + name_); } - }); + } else { throw std::runtime_error("Could not retrieve ProductStripe for key " + name_); } + })); + conn_->submit(std::move(req), std::move(getDoneTask), true); } else if (this_state == State::retrieved ) { return; } else { @@ -139,14 +142,21 @@ S3Source::S3Source(unsigned int iNLanes, std::string iObjPrefix, int iVerbose, u { auto start = std::chrono::high_resolution_clock::now(); - conn_->get(objPrefix_ + "index", nullptr, [this](S3Request::Ptr req) mutable { - if ( req->status == S3Request::Status::ok ) { - if ( not index_.ParseFromString(req->buffer) ) { - throw std::runtime_error("Could not deserialize index in S3Source construction"); + { + tbb::task_group group; + auto req = std::make_shared(S3Request::Type::get, objPrefix_ + "index"); + auto getDoneTask = TaskHolder(group, make_functor_task([this, req]() { + if ( req->status == S3Request::Status::ok ) { + if ( not index_.ParseFromString(req->buffer) ) { + throw std::runtime_error("Could not deserialize index in S3Source construction"); + } } - } - else { throw std::runtime_error("Could not retrieve index in S3Source construction"); } - }); + else { throw 
std::runtime_error("Could not retrieve index in S3Source construction"); } + })); + conn_->submit(std::move(req), std::move(getDoneTask), false); + group.wait(); + } + if ( verbose_ >= 3 ) { std::cout << index_.DebugString() << "\n"; } diff --git a/S3Source.h b/S3Source.h index 204296f..e7a04fe 100644 --- a/S3Source.h +++ b/S3Source.h @@ -7,15 +7,13 @@ #include #include -#include "tbb/concurrent_vector.h" - +#include "S3Common.h" #include "SharedSourceBase.h" #include "DataProductRetriever.h" #include "DelayedProductRetriever.h" #include "SerialTaskQueue.h" #include "WaitingTaskList.h" #include "DeserializeStrategy.h" -#include "S3Common.h" #include "objectstripe.pb.h" diff --git a/TaskHolder.h b/TaskHolder.h index d672a4d..1b94964 100644 --- a/TaskHolder.h +++ b/TaskHolder.h @@ -2,6 +2,7 @@ #define TaskHolder_h #include +#include #include "tbb/task_group.h" #include "TaskBase.h" @@ -11,9 +12,9 @@ class TaskHolder { friend class WaitingTaskList; TaskHolder(): group_{nullptr}, task_{nullptr} {} - TaskHolder(tbb::task_group& iGroup, std::unique_ptr iTask): - group_{&iGroup}, task_{iTask.release()} { - //std::cout <<"new task "< iTask, bool track=false): + group_{&iGroup}, task_{iTask.release()}, track_{track} { + if ( track_ ) std::cout << "New holder for task " + std::to_string(reinterpret_cast(task_)) << std::endl; task_->increment_ref_count(); } @@ -25,14 +26,16 @@ class TaskHolder { TaskHolder( const TaskHolder& iOther): group_{iOther.group_}, - task_{iOther.task_} { - //std::cout <<"copy holder with task "<(task_)) << std::endl; if(task_) { task_->increment_ref_count(); } } TaskHolder(TaskHolder&& iOther): group_{iOther.group_}, - task_{iOther.task_} { - //std::cout <<"move holder with task "<(task_)) << std::endl; iOther.task_ = nullptr; } @@ -59,12 +62,13 @@ class TaskHolder { auto t = task_; task_ = nullptr; if(t->decrement_ref_count()) { - //std::cout <<"Task "<run([t]() { - t->execute(); - //std::cout <<"delete "<(t)) << std::endl; + group_->run([t, 
track=track_]() { + if ( track ) std::cout << "Running task " + std::to_string(reinterpret_cast(t)) << std::endl; + t->execute(); + if ( track ) std::cout << "Deleting task " + std::to_string(reinterpret_cast(t)) << std::endl; + delete t; + }); } } private: @@ -76,6 +80,7 @@ class TaskHolder { tbb::task_group* group_; TaskBase* task_; + bool track_; }; } #endif From 33e2f1d69a48d89f1a3ff2cac81c1b340393d8b6 Mon Sep 17 00:00:00 2001 From: Nick Smith Date: Wed, 13 Jul 2022 12:46:55 -0500 Subject: [PATCH 26/43] Let source use arena --- threaded_io_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/threaded_io_test.cc b/threaded_io_test.cc index fa01ded..e40ab61 100644 --- a/threaded_io_test.cc +++ b/threaded_io_test.cc @@ -135,6 +135,8 @@ int main(int argc, char* argv[]) { } std::cout <<"finished warmup"< waiter; @@ -152,8 +154,6 @@ int main(int argc, char* argv[]) { } std::atomic ievt{0}; - - tbb::task_arena arena(parallelism); decltype(std::chrono::high_resolution_clock::now()) start; auto pOut = out.get(); From 918bbbb06d27727b686c79ba18dbd263e131b7c4 Mon Sep 17 00:00:00 2001 From: Nick Smith Date: Wed, 13 Jul 2022 15:32:33 -0500 Subject: [PATCH 27/43] Track max concurrent requests --- S3Common.cc | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/S3Common.cc b/S3Common.cc index d774d6a..71b23af 100644 --- a/S3Common.cc +++ b/S3Common.cc @@ -87,6 +87,7 @@ class S3LibWrapper { S3RequestContext * ctx; fd_set read_fds, write_fds, except_fds; int max_fd, activeRequests{0}; + int topfds{0}, topreq{0}; S3_create_request_context(&ctx); while(running_) { FD_ZERO(&read_fds); @@ -100,6 +101,8 @@ class S3LibWrapper { throw std::runtime_error("internal error in S3_get_request_context_fdsets"); } + topfds = std::max(topfds, max_fd); + if ( max_fd != -1 ) { int64_t timeout = std::min(100l, S3_get_request_context_timeout(ctx)); // milliseconds assert(timeout >= 0); @@ -119,6 +122,7 @@ class S3LibWrapper { case S3StatusOutOfMemory: 
throw std::runtime_error("out of memory while processing S3_runonce_request_context"); } + topreq = std::max(topreq, activeRequests); S3RequestWrapper* req; int currentlyActive{activeRequests}; @@ -137,6 +141,7 @@ class S3LibWrapper { } // TODO: this may abort requests in flight, do we wait or is it synchronous? S3_destroy_request_context(ctx); + std::cout << "S3LibWrapper: max open file descriptors: " << topfds << ", max concurrent requests: " << topreq << std::endl; } void _submit(S3RequestWrapper* req, S3RequestContext* ctx) const { @@ -262,8 +267,8 @@ class S3LibWrapper { private: S3Status initStatus_; - int asyncRequestLimit_{256}; - int asyncAddRequestLimit_{16}; // TODO: when this is a reasonable number, there's a race + int asyncRequestLimit_{512}; // no more than FD_SETSIZE (1024) + int asyncAddRequestLimit_{64}; std::thread loop_; std::atomic running_; // all callbackData pointers are to S3RequestWrapper objects From 204af1ee61e53a9e24d281b54682281ee600c553 Mon Sep 17 00:00:00 2001 From: Nick Smith Date: Thu, 14 Jul 2022 10:11:40 -0500 Subject: [PATCH 28/43] Add event stripe compression --- S3Outputer.cc | 15 ++++++++++----- S3Outputer.h | 4 ++++ S3Source.cc | 16 +++++++++++++--- objectstripe.proto | 2 ++ 4 files changed, 29 insertions(+), 8 deletions(-) diff --git a/S3Outputer.cc b/S3Outputer.cc index 2d56534..6a6082b 100644 --- a/S3Outputer.cc +++ b/S3Outputer.cc @@ -307,11 +307,16 @@ void S3Outputer::flushEventStripe(const objstripe::EventStripe& stripe, TaskHold } auto start = std::chrono::high_resolution_clock::now(); index_.set_totalevents(index_.totalevents() + stripe.events_size()); - auto dest = index_.add_packedeventstripes(); - // TODO: compression - stripe.SerializeToString(dest); - if ( verbose_ >= 2 ) { - std::cout << "length of packed EventStripe: " << dest->size() << "\n"; + { + auto dest = index_.add_packedeventstripes(); + std::string buf; + stripe.SerializeToString(&buf); + eventStripeCompressor_.write(buf, *dest); + 
eventStripeCompressor_.flush(*dest); + index_.add_eventstripesizes(buf.size()); + if ( verbose_ >= 2 ) { + std::cout << "length of packed EventStripe: " << dest->size() << "\n"; + } } // TODO: checkpoint only every few event stripes? diff --git a/S3Outputer.h b/S3Outputer.h index c4963e7..dec5911 100644 --- a/S3Outputer.h +++ b/S3Outputer.h @@ -24,6 +24,7 @@ namespace cce::tf { class StreamCompressor { public: + StreamCompressor() {}; StreamCompressor(const objstripe::Compression& setting); const objstripe::Compression& getCompression() const { return setting_; } size_t write(const std::string_view blob, std::string& out); @@ -56,6 +57,8 @@ class S3Outputer : public OutputerBase { index_.set_serializestrategy(objstripe::SerializeStrategy::kRoot); defaultCompression_.set_type(objstripe::CompressionType::kZSTD); defaultCompression_.set_level(4); + index_.set_allocated_eventstripecompression(new objstripe::Compression(defaultCompression_)); + eventStripeCompressor_ = StreamCompressor(index_.eventstripecompression()); } void setupForLane(unsigned int iLaneIndex, std::vector const& iDPs) final; @@ -115,6 +118,7 @@ class S3Outputer : public OutputerBase { // (for index_'s ProductInfos, appendProductBuffer() has finished before we access) mutable SerialTaskQueue flushQueue_; mutable objstripe::ObjectStripeIndex index_; + mutable StreamCompressor eventStripeCompressor_; mutable std::chrono::microseconds flushTime_; mutable std::atomic parallelTime_; diff --git a/S3Source.cc b/S3Source.cc index 4722f87..eb6a420 100644 --- a/S3Source.cc +++ b/S3Source.cc @@ -207,9 +207,19 @@ void S3Source::readEventAsync(unsigned int iLane, long iEventIndex, OptionalTask { // default-constructed currentEventStripe_ will have size zero, so 0, 0 will load first stripe if(nextEventInStripe_ == currentEventStripe_.events_size()) { - // need to read ahead - // TODO: compression - currentEventStripe_.ParseFromString(index_.packedeventstripes(nextEventStripe_++)); + // Need to read ahead + // TODO: 
perhaps not the best idea to clobber index_? At least for now we don't need it again + auto* stripeData = index_.mutable_packedeventstripes(nextEventStripe_); + if ( index_.has_eventstripecompression() ) { + auto dsize = index_.eventstripesizes(nextEventStripe_); + std::string decompressedStripe; + decompressedStripe.resize(dsize); + ::decompress_stripe(index_.eventstripecompression(), *stripeData, decompressedStripe, dsize); + currentEventStripe_.ParseFromString(decompressedStripe); + } else { + currentEventStripe_.ParseFromString(*stripeData); + } + nextEventStripe_++; nextEventInStripe_ = 0; } const auto event = currentEventStripe_.events(nextEventInStripe_); diff --git a/objectstripe.proto b/objectstripe.proto index 69e5233..d6e5f93 100644 --- a/objectstripe.proto +++ b/objectstripe.proto @@ -31,6 +31,8 @@ message ObjectStripeIndex { repeated ProductInfo products = 3; repeated bytes packedEventStripes = 4; optional SerializeStrategy serializeStrategy = 5; + optional Compression eventStripeCompression = 6; + repeated uint32 eventStripeSizes = 7 [packed = true]; } message EventStripe { From 083f1b9a187c96164e8ed1d81da344cb0309ab45 Mon Sep 17 00:00:00 2001 From: Nick Smith Date: Thu, 14 Jul 2022 10:24:29 -0500 Subject: [PATCH 29/43] Nicer object naming --- S3Outputer.cc | 6 +++--- S3Source.cc | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/S3Outputer.cc b/S3Outputer.cc index 6a6082b..c2ae40a 100644 --- a/S3Outputer.cc +++ b/S3Outputer.cc @@ -119,7 +119,7 @@ void S3Outputer::setupForLane(unsigned int iLaneIndex, std::vectorset_flushsize(0); prod->set_flushminbytes(productBufferFlushMinBytes_); // TODO: choose compression setting based on properties of ss? - buffers_.emplace_back(objPrefix_ + prod->productname(), prod, defaultCompression_); + buffers_.emplace_back(objPrefix_ + "/" + prod->productname(), prod, defaultCompression_); } } // all lanes see same products? 
if not we'll need a map @@ -280,7 +280,7 @@ void S3Outputer::appendProductBuffer( + " bytes, "s + std::to_string(bufferNevents) + " events), flushing\n"; } - std::string name = buf.prefix_ + std::to_string(buf.stripe_.globaloffset()); + std::string name = buf.prefix_ + "/" + std::to_string(buf.stripe_.globaloffset()); auto req = std::make_shared(S3Request::Type::put, name); buf.stripe_.SerializeToString(&req->buffer); auto putDoneTask = TaskHolder(*iCallback.group(), make_functor_task([req, callback=std::move(iCallback)]() { @@ -320,7 +320,7 @@ void S3Outputer::flushEventStripe(const objstripe::EventStripe& stripe, TaskHold } // TODO: checkpoint only every few event stripes? - auto req = std::make_shared(S3Request::Type::put, objPrefix_ + "index"); + auto req = std::make_shared(S3Request::Type::put, "index/" + objPrefix_); index_.SerializeToString(&req->buffer); auto putDoneTask = TaskHolder(*iCallback.group(), make_functor_task([req, callback=std::move(iCallback)]() { if ( req->status != S3Request::Status::ok ) { diff --git a/S3Source.cc b/S3Source.cc index eb6a420..084d54d 100644 --- a/S3Source.cc +++ b/S3Source.cc @@ -144,7 +144,7 @@ S3Source::S3Source(unsigned int iNLanes, std::string iObjPrefix, int iVerbose, u { tbb::task_group group; - auto req = std::make_shared(S3Request::Type::get, objPrefix_ + "index"); + auto req = std::make_shared(S3Request::Type::get, "index/" + objPrefix_); auto getDoneTask = TaskHolder(group, make_functor_task([this, req]() { if ( req->status == S3Request::Status::ok ) { if ( not index_.ParseFromString(req->buffer) ) { @@ -234,12 +234,12 @@ void S3Source::readEventAsync(unsigned int iLane, long iEventIndex, OptionalTask if ( nextEventInStripe_ % productinfo.flushsize() == 0 ) { auto new_ps = std::make_shared( conn_, - objPrefix_ + productinfo.productname() + std::to_string(globalEventIndex), + objPrefix_ + "/" + productinfo.productname() + "/" + std::to_string(globalEventIndex), globalEventIndex ); if ( verbose_ >= 2 ) { std::cout 
<< "setting lane " << iLane << "to read stripe " << - objPrefix_ + productinfo.productname() + std::to_string(globalEventIndex) << "\n"; + objPrefix_ + "/" + productinfo.productname() + "/" + std::to_string(globalEventIndex) << "\n"; } std::swap(ps, new_ps); // record decompress time of old stripe From a2e02a19ae76fb16758624d285b57597a7e2651a Mon Sep 17 00:00:00 2001 From: Nick Smith Date: Fri, 15 Jul 2022 15:23:17 -0500 Subject: [PATCH 30/43] fix uninitialized var --- S3Outputer.cc | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/S3Outputer.cc b/S3Outputer.cc index c2ae40a..5511582 100644 --- a/S3Outputer.cc +++ b/S3Outputer.cc @@ -33,18 +33,18 @@ size_t zstd_compress(ZSTD_CCtx* ctx, const std::string_view blob, std::string& o size_t status; if ( flush ) { ZSTD_inBuffer_s ibuf{.src=nullptr, .size=0, .pos=0}; - while ( status != 0 ) { - status = ZSTD_compressStream2(ctx, &obuf, &ibuf, ZSTD_e_end); - if ( ZSTD_isError(status) ) { - std::cerr <<"ERROR in compression " << ZSTD_getErrorName(status) << std::endl; - } + do { if ( obuf.pos == obuf.size ) { - size_t new_size = obuf.size * 2; + size_t new_size = (obuf.size * 3) / 2; out.resize(new_size); obuf.dst = out.data(); obuf.size = new_size; } - } + status = ZSTD_compressStream2(ctx, &obuf, &ibuf, ZSTD_e_end); + if ( ZSTD_isError(status) ) { + std::cerr <<"ERROR in compression " << ZSTD_getErrorName(status) << std::endl; + } + } while ( status != 0 ); } else { ZSTD_inBuffer_s ibuf{.src=blob.data(), .size=blob.size(), .pos=0}; while ( ibuf.pos < ibuf.size ) { From 4483ebbeec03cb0425667a920bab6ddc2d979d3a Mon Sep 17 00:00:00 2001 From: Nick Smith Date: Mon, 22 Aug 2022 14:27:20 -0500 Subject: [PATCH 31/43] Add some docs --- README.md | 24 ++++++++++++++++++++++++ s3conn_local.ini | 5 +++++ s3io.md | 46 ++++++++++++++++++++++++++++++++++++++++++++++ s3localserver.sh | 7 +++++++ 4 files changed, 82 insertions(+) create mode 100644 s3conn_local.ini create mode 100644 s3io.md create mode 
100755 s3localserver.sh diff --git a/README.md b/README.md index 2f78ec9..a52bbfc 100644 --- a/README.md +++ b/README.md @@ -148,6 +148,17 @@ This is similar to SharedRootEventSource except this time each entry in the `Eve > threaded_io_test -s SharedRootBatchEventsSource=test.eroot -t 1 -n 10 ``` +#### S3Source +Reads individual data product _stripes_--compressed concatenated serialized data products--from an S3 server, along with the appropriate +event-level metadata to index them. See s3io.md for further details of the data layout. + +- verbose (int): increase number to get more detail +- prefix (string): the object prefix in the S3 bucket +- conn (string): path to the S3 connection configuration file +``` +> threaded_io_test -s S3Source=prefix=testproducts:conn=s3conn.ini -t 1 -n 10 +``` + ### Outputers #### DummyOutputer @@ -311,6 +322,19 @@ or > threaded_io_test -s ReplicatedRootSource=test.root -t 1 -n 10 -o RootBatchEventsOutputer=test.root:batchSize=4 ``` +#### S3Outputer +Outputs individual data product _stripes_--compressed concatenated serialized data products--to an S3 server, along with the appropriate +event-level metadata to index them. See s3io.md for further details of the data layout. + +- verbose (int): increase number to get more detail +- prefix (string): the object prefix to use when storing data in the S3 bucket +- productFlush (int): the minimum number of (possibly compressed) bytes to accumulate in the product stripe output buffer before flushing it to S3 +- eventFlush (int): the maximum number of events that can be contained in a single product stripe. 
+- conn (string): path to the S3 connection configuration file +``` +> threaded_io_test -s TestProductsSource -t 1 -n 10 -o S3Outputer=prefix=testproducts:conn=s3conn.ini +``` + ### Waiters #### ScaleWaiter diff --git a/s3conn_local.ini b/s3conn_local.ini new file mode 100644 index 0000000..e03ede0 --- /dev/null +++ b/s3conn_local.ini @@ -0,0 +1,5 @@ +hostName=localhost:9000 +bucketName=test +accessKeyId=minio +secretAccessKey=miniotestpass +#securityToken=blah diff --git a/s3io.md b/s3io.md new file mode 100644 index 0000000..c7d37d9 --- /dev/null +++ b/s3io.md @@ -0,0 +1,46 @@ +# S3 I/O components + +As part of the "Object Storage for CMS in the HL-LHC era" project, the +`S3Source` and `S3Outputer` provide an IO system that can write to S3 buckets, +with the main purpose to explore the performance and parallelization +capabilities of the Ceph RadosGW S3 service. + +## Building +To build with S3 support, you will need to download and install [libs3](https://github.com/bji/libs3): +```bash +mkdir -p external +git clone git@github.com:bji/libs3.git +cd libs3 +make DESTDIR=../external install +``` + +the rest of the dependencies can be sourced, e.g., from a recent CMSSW release: +```bash +source /cvmfs/cms.cern.ch/slc7_amd64_gcc10/external/cmake/3.18.2/etc/profile.d/init.sh +pushd /cvmfs/cms.cern.ch/slc7_amd64_gcc10/cms/cmssw/CMSSW_12_3_0_pre5/ +cmsenv +popd +git clone git@github.com:hep-cce2/root_serialization.git +cd root_serialization +mkdir build && cd build +cmake ../ \ + -DCMAKE_PREFIX_PATH="/cvmfs/cms.cern.ch/slc7_amd64_gcc10/external/lz4/1.9.2-373b1f6c80ba13e93f436c77aa63c026;/cvmfs/cms.cern.ch/slc7_amd64_gcc10/external/protobuf/3.15.1-b2ca6d3fa59916150b27c3d598c7c7ac" \ + -Dzstd_DIR=/cvmfs/cms.cern.ch/slc7_amd64_gcc10/external/zstd/1.4.5-ec760e16a89e932fdc84f1fd3192f206/lib/cmake/zstd \ + -DTBB_DIR=/cvmfs/cms.cern.ch/slc7_amd64_gcc10/external/tbb/v2021.4.0-75e6d730601d8461f20893321f4f7660/lib/cmake/TBB \ + -DROOT_DIR=$ROOTSYS/cmake \ + -DLIBS3_DIR=$(realpath 
../../external) \ + -DENABLE_HDF5=OFF -DENABLE_S3=ON +``` + +## Running with local server +The S3 connection settings are specified in an ini file. A local server can be +brought up using the `./s3localserver.sh` script, assuming singularity is +available at your site. Then use `conn=s3conn_local.ini` in the source/outputer +configuration. + +## Data layout +There are two types of binary data blobs written to the S3 service: +- An event index, unique per processing task, is stored at `index/{prefix}`. It contains an event number index, stored as a list of compressed _event stripes_, and is rewritten after each new event stripe is flushed. The frequency of this is controlled by the `eventFlush` parameter in S3Outputer. +- Several product _stripes_, stored at `{prefix}/{product_name}/{offset}` where the offset indexes the event stripe list. The number of product stripes written per event stripe depends on the size of the product and the `productFlush` parameter, as well as the requirement that the number of events worth of data products per stripe is divisible by `eventFlush`. + +The binary data format of the blobs is specified by the `objectstripe.proto` protobuf schema. 
diff --git a/s3localserver.sh b/s3localserver.sh new file mode 100755 index 0000000..adba9f7 --- /dev/null +++ b/s3localserver.sh @@ -0,0 +1,7 @@ +#!/bin/bash +mkdir -p data/test +singularity run \ + -B ${PWD}/data:/data \ + --env MINIO_ROOT_USER=minio \ + --env MINIO_ROOT_PASSWORD=miniotestpass \ + docker://quay.io/minio/minio server /data --console-address ":9001" From d0df7618c215ad00b1e9346a62911302f0b7bb04 Mon Sep 17 00:00:00 2001 From: Nick Smith Date: Mon, 23 Jan 2023 13:11:37 -0600 Subject: [PATCH 32/43] Implement lzma compression --- CMakeLists.txt | 3 ++ S3Outputer.cc | 109 +++++++++++++++++++++++++++++++++++++++------ S3Outputer.h | 9 ++-- S3Source.cc | 60 +++++++++++++++++++------ objectstripe.proto | 1 + s3io.md | 2 +- 6 files changed, 153 insertions(+), 31 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 90a2f43..d5061e9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,11 +24,13 @@ set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED True) #set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g3") +add_compile_definitions(TBB_USE_THREADING_TOOLS=1) find_package(ROOT REQUIRED COMPONENTS Core RIO Tree) find_package(TBB REQUIRED) find_package(zstd REQUIRED) find_package(lz4 REQUIRED) +find_package(LibLZMA REQUIRED) set(THREADS_PREFER_PTHREAD_FLAG ON) find_package(Threads REQUIRED) @@ -100,6 +102,7 @@ target_compile_definitions(threaded_io_test PUBLIC TBB_PREVIEW_TASK_GROUP_EXTENS target_link_libraries(threaded_io_test PRIVATE LZ4::lz4 + LibLZMA::LibLZMA ROOT::Core ROOT::RIO ROOT::Tree diff --git a/S3Outputer.cc b/S3Outputer.cc index 5511582..dfa20cb 100644 --- a/S3Outputer.cc +++ b/S3Outputer.cc @@ -10,19 +10,6 @@ using namespace cce::tf; -StreamCompressor::StreamCompressor(const objstripe::Compression& setting): - setting_{setting} -{ - switch ( setting_.type() ) { - case objstripe::CompressionType::kNone: - break; - case objstripe::CompressionType::kZSTD: - zstd_.reset(ZSTD_createCStream()); - ZSTD_CCtx_setParameter(zstd_.get(), 
ZSTD_c_compressionLevel, setting_.level()); - break; - } -} - namespace { size_t zstd_compress(ZSTD_CCtx* ctx, const std::string_view blob, std::string& out, bool flush) { size_t tail{out.size()}; @@ -65,6 +52,75 @@ size_t zstd_compress(ZSTD_CCtx* ctx, const std::string_view blob, std::string& o // but it doesn't appear to be nonzero return status; } + +lzma_ret lzma_init(lzma_stream* strm, uint32_t level) { + lzma_options_lzma opt_lzma2; + lzma_filter filters[] = { + { .id = LZMA_FILTER_LZMA2, .options = &opt_lzma2 }, + { .id = LZMA_VLI_UNKNOWN, .options = NULL }, + }; + lzma_lzma_preset(&opt_lzma2, level); + // TODO: pass through target stripe size for better choice of dict size? + // ROOT choice: input size / 4 + opt_lzma2.dict_size = std::max(LZMA_DICT_SIZE_MIN, 32768u); + return lzma_stream_encoder(strm, filters, LZMA_CHECK_CRC32); +} + +size_t lzma_compress(lzma_stream* strm, const std::string_view blob, std::string& out, bool flush) { + size_t tail{out.size()}; + if ( out.capacity() < BUFSIZ ) out.resize(BUFSIZ); + else out.resize(out.capacity()); + + lzma_action action = LZMA_RUN; + strm->next_out = (uint8_t*) out.data() + tail; + strm->avail_out = out.size() - tail; + if ( flush ) { + action = LZMA_FINISH; + strm->next_in = NULL; + strm->avail_in = 0; + } else { + strm->next_in = (const uint8_t*) blob.data(); + strm->avail_in = blob.size(); + } + + while ( (strm->avail_in > 0) || flush ) { + lzma_ret ret = lzma_code(strm, action); + if ( ret == LZMA_STREAM_END ) break; + else if ( strm->avail_out == 0 ) { + size_t old_size = out.size(); + size_t new_size = (old_size * 3) / 2; + out.resize(new_size); + strm->next_out = (uint8_t*) out.data() + old_size; + strm->avail_out = new_size - old_size; + } + else if (ret != LZMA_OK) { + std::cerr << "ERROR in lzma compression " << ret << std::endl; + break; + } + } + + out.resize(out.size() - strm->avail_out); + return 0; +} +} // anonymous namespace + +StreamCompressor::StreamCompressor(const objstripe::Compression& 
setting): + setting_{setting} +{ + switch ( setting_.type() ) { + case objstripe::CompressionType::kNone: + break; + case objstripe::CompressionType::kZSTD: + zstd_.reset(ZSTD_createCStream()); + ZSTD_CCtx_setParameter(zstd_.get(), ZSTD_c_compressionLevel, setting_.level()); + break; + case objstripe::CompressionType::kLZMA: + lzma_.reset((lzma_stream*) malloc(sizeof(lzma_stream))); + memset(lzma_.get(), 0, sizeof(lzma_stream)); + lzma_ret ret = ::lzma_init(lzma_.get(), setting_.level()); + if (ret != LZMA_OK) { throw std::runtime_error("Could not initialize LZMA encoder: " + std::to_string(ret)); } + break; + } } size_t StreamCompressor::write(const std::string_view blob, std::string& out) { @@ -74,6 +130,8 @@ size_t StreamCompressor::write(const std::string_view blob, std::string& out) { return 0; case objstripe::CompressionType::kZSTD: return ::zstd_compress(zstd_.get(), blob, out, false); + case objstripe::CompressionType::kLZMA: + return ::lzma_compress(lzma_.get(), blob, out, false); default: assert(false); return 0; @@ -87,6 +145,13 @@ void StreamCompressor::flush(std::string& out) { case objstripe::CompressionType::kZSTD: ::zstd_compress(zstd_.get(), {}, out, true); return; + case objstripe::CompressionType::kLZMA: + ::lzma_compress(lzma_.get(), {}, out, true); + // unlike zstd, lzma must be (TODO: true?) 
reset after each finish + if ( + ::lzma_init(lzma_.get(), setting_.level()) != LZMA_OK + ) { throw std::runtime_error("Could not initialize LZMA encoder"); } + return; default: assert(false); return; @@ -353,8 +418,24 @@ class Maker : public OutputerMakerBase { if(not conn) { return {}; } + auto cType = objstripe::CompressionType::kZSTD; + uint32_t cLevel = 4; + auto cTypeStr = params.get("compression"); + if(cTypeStr) { + if ( cTypeStr.value() == "ZSTD" ) { + cType = objstripe::CompressionType::kZSTD; + } + else if ( cTypeStr.value() == "LZMA" ) { + cType = objstripe::CompressionType::kLZMA; + cLevel = 9; + } + else { + std::cerr << "Unrecognized compression type: " << cTypeStr.value() << "\n"; + return {}; + } + } - return std::make_unique(iNLanes, objPrefix.value(), verbose, productFlush, eventFlush, conn); + return std::make_unique(iNLanes, objPrefix.value(), verbose, productFlush, eventFlush, conn, cType, cLevel); } }; diff --git a/S3Outputer.h b/S3Outputer.h index dec5911..7da03bf 100644 --- a/S3Outputer.h +++ b/S3Outputer.h @@ -7,6 +7,7 @@ #include #include "zstd.h" +#include "lzma.h" #define TBB_PREVIEW_TASK_GROUP_EXTENSIONS 1 // for task_group::defer #include "tbb/task_group.h" @@ -35,11 +36,13 @@ class StreamCompressor { struct ZSTDDeleter { void operator()(ZSTD_CStream* s) const {ZSTD_freeCStream(s);} }; std::unique_ptr zstd_; + struct LZMADeleter { void operator()(lzma_stream* s) const {lzma_end(s); free(s);} }; + std::unique_ptr lzma_; }; class S3Outputer : public OutputerBase { public: - S3Outputer(unsigned int iNLanes, std::string objPrefix, int iVerbose, size_t iProductBufferFlush, size_t iEventFlushSize, S3ConnectionRef conn): + S3Outputer(unsigned int iNLanes, std::string objPrefix, int iVerbose, size_t iProductBufferFlush, size_t iEventFlushSize, S3ConnectionRef conn, objstripe::CompressionType cType, uint32_t cLevel): serializers_(iNLanes), objPrefix_(objPrefix), verbose_(iVerbose), @@ -55,8 +58,8 @@ class S3Outputer : public OutputerBase { // 
TODO: make configurable index_.set_serializestrategy(objstripe::SerializeStrategy::kRoot); - defaultCompression_.set_type(objstripe::CompressionType::kZSTD); - defaultCompression_.set_level(4); + defaultCompression_.set_type(cType); + defaultCompression_.set_level(cLevel); index_.set_allocated_eventstripecompression(new objstripe::Compression(defaultCompression_)); eventStripeCompressor_ = StreamCompressor(index_.eventstripecompression()); } diff --git a/S3Source.cc b/S3Source.cc index 084d54d..b6ae633 100644 --- a/S3Source.cc +++ b/S3Source.cc @@ -1,5 +1,6 @@ #include #include "zstd.h" +#include "lzma.h" #include "S3Source.h" #include "SourceFactory.h" #include "Deserializer.h" @@ -20,24 +21,57 @@ size_t zstd_perthread_decompress(void* dst, size_t dstCapacity, const void* src, return ZSTD_decompressDCtx(holder.ctx, dst, dstCapacity, src, compressedSize); } +void zstd_decompress(std::string& blob, std::string& out, size_t dSize) { + out.resize(dSize); + size_t status = ZSTD_decompress(out.data(), out.size(), blob.data(), blob.size()); + // size_t status = zstd_perthread_decompress(out.data(), out.size(), blob.data(), blob.size()); + if ( ZSTD_isError(status) ) { + std::cerr <<"ERROR in decompression " << ZSTD_getErrorName(status) << std::endl; + } + if (status < dSize) { + std::cerr <<"ERROR in decompression, expected " << dSize << " bytes but only got " << status << std::endl; + } + blob.clear(); + blob.shrink_to_fit(); +} + +// /cvmfs/cms.cern.ch/slc7_amd64_gcc10/external/xz/5.2.5-d6fed2038c4e8d6e04531d1adba59f37 +void lzma_decompress(std::string& blob, std::string& out, size_t dSize) { + lzma_stream strm = LZMA_STREAM_INIT; + lzma_ret ret = lzma_stream_decoder(&strm, UINT64_MAX, 0); + if (ret != LZMA_OK) { throw std::runtime_error("Could not initialize LZMA encoder"); } + + out.resize(dSize); + strm.next_in = (const uint8_t*) blob.data(); + strm.avail_in = blob.size(); + strm.next_out = (uint8_t*) out.data(); + strm.avail_out = out.size(); + while ( 
strm.avail_in > 0 ) { + ret = lzma_code(&strm, LZMA_RUN); + if ( ret == LZMA_STREAM_END ) break; + else if (ret != LZMA_OK) { + std::cerr << "ERROR in lzma compression " << ret << std::endl; + break; + } + } + if ( strm.avail_out > 0 ) { + std::cerr <<"ERROR in decompression, expected " << dSize << " bytes but only got " << dSize - strm.avail_out << std::endl; + } + blob.clear(); + blob.shrink_to_fit(); +} + void decompress_stripe(const objstripe::Compression& setting, std::string& blob, std::string& out, size_t dSize) { switch ( setting.type() ) { case objstripe::CompressionType::kNone: std::swap(blob, out); - return; + break; case objstripe::CompressionType::kZSTD: - out.resize(dSize); - size_t status = ZSTD_decompress(out.data(), out.size(), blob.data(), blob.size()); - // size_t status = zstd_perthread_decompress(out.data(), out.size(), blob.data(), blob.size()); - if ( ZSTD_isError(status) ) { - std::cerr <<"ERROR in decompression " << ZSTD_getErrorName(status) << std::endl; - } - if (status < dSize) { - std::cerr <<"ERROR in decompression, expected " << dSize << " bytes but only got " << status << std::endl; - } - blob.clear(); - blob.shrink_to_fit(); - return; + ::zstd_decompress(blob, out, dSize); + break; + case objstripe::CompressionType::kLZMA: + ::lzma_decompress(blob, out, dSize); + break; } } } diff --git a/objectstripe.proto b/objectstripe.proto index d6e5f93..dfdb76f 100644 --- a/objectstripe.proto +++ b/objectstripe.proto @@ -10,6 +10,7 @@ enum SerializeStrategy { enum CompressionType { kNone = 0; kZSTD = 1; + kLZMA = 2; } message Compression { diff --git a/s3io.md b/s3io.md index c7d37d9..81f7303 100644 --- a/s3io.md +++ b/s3io.md @@ -24,7 +24,7 @@ git clone git@github.com:hep-cce2/root_serialization.git cd root_serialization mkdir build && cd build cmake ../ \ - 
-DCMAKE_PREFIX_PATH="/cvmfs/cms.cern.ch/slc7_amd64_gcc10/external/lz4/1.9.2-373b1f6c80ba13e93f436c77aa63c026;/cvmfs/cms.cern.ch/slc7_amd64_gcc10/external/protobuf/3.15.1-b2ca6d3fa59916150b27c3d598c7c7ac" \ + -DCMAKE_PREFIX_PATH="/cvmfs/cms.cern.ch/slc7_amd64_gcc10/external/lz4/1.9.2-373b1f6c80ba13e93f436c77aa63c026;/cvmfs/cms.cern.ch/slc7_amd64_gcc10/external/protobuf/3.15.1-b2ca6d3fa59916150b27c3d598c7c7ac;/cvmfs/cms.cern.ch/slc7_amd64_gcc10/external/xz/5.2.5-d6fed2038c4e8d6e04531d1adba59f37" \ -Dzstd_DIR=/cvmfs/cms.cern.ch/slc7_amd64_gcc10/external/zstd/1.4.5-ec760e16a89e932fdc84f1fd3192f206/lib/cmake/zstd \ -DTBB_DIR=/cvmfs/cms.cern.ch/slc7_amd64_gcc10/external/tbb/v2021.4.0-75e6d730601d8461f20893321f4f7660/lib/cmake/TBB \ -DROOT_DIR=$ROOTSYS/cmake \ From d33cca24130e2f00dd862fedd1b50c139013039c Mon Sep 17 00:00:00 2001 From: Nick Smith Date: Wed, 19 Apr 2023 11:09:04 -0500 Subject: [PATCH 33/43] Implement proper backoff+retry for async S3 requests --- S3Common.cc | 43 ++++++++++++++++++++++++++++--------------- S3Common.h | 3 ++- 2 files changed, 30 insertions(+), 16 deletions(-) diff --git a/S3Common.cc b/S3Common.cc index 71b23af..cc0ae49 100644 --- a/S3Common.cc +++ b/S3Common.cc @@ -27,6 +27,7 @@ class S3RequestWrapper { arena = std::make_unique(tbb::task_arena::attach{}); } backoffTimeout = req->timeout.count(); + submit_after = std::chrono::steady_clock::now(); }; void done() { @@ -45,6 +46,8 @@ class S3RequestWrapper { size_t put_offset{0}; int retries_executed{0}; long backoffTimeout; + std::chrono::steady_clock::time_point submit_after; + static_assert(std::chrono::steady_clock::duration() <= std::chrono::milliseconds(1)); }; class S3LibWrapper { @@ -89,6 +92,7 @@ class S3LibWrapper { int max_fd, activeRequests{0}; int topfds{0}, topreq{0}; S3_create_request_context(&ctx); + std::vector to_defer; while(running_) { FD_ZERO(&read_fds); FD_ZERO(&write_fds); @@ -131,9 +135,18 @@ class S3LibWrapper { and activeRequests < 
(currentlyActive+asyncAddRequestLimit_) and requests_.try_pop(req) // test this last! ) { - _submit(req, ctx); - activeRequests++; + if ( req->submit_after <= std::chrono::steady_clock::now() ) { + _submit(req, ctx); + activeRequests++; + } else { + to_defer.push_back(req); + } } + for (auto req : to_defer) { + requests_.push(req); + } + to_defer.clear(); + if ( activeRequests == 0 ) { // TODO: would be better to use a semaphore (submit() and ~S3LibWrapper need to notify) std::this_thread::sleep_for(std::chrono::milliseconds(10)); @@ -193,23 +206,22 @@ class S3LibWrapper { static void responseCompleteCallback(S3Status status, const S3ErrorDetails *error, void *callbackData) { auto req = static_cast(callbackData); if ( S3_status_is_retryable(status) && req->retries_executed < req->req->retries ) { - if ( status == S3Status::S3StatusErrorRequestTimeout ) { - // https://aws.amazon.com/blogs/architecture/exponential-backoff-and-jitter/ - static thread_local std::minstd_rand rng(std::hash{}(std::this_thread::get_id())); - std::uniform_int_distribution dist(0l, std::min(S3Request::max_timeout.count(), req->backoffTimeout)); - auto dt = std::chrono::milliseconds(dist(rng)); - if ( req->async ) { - // TODO: async sleep by setting a future submit time and checking in loop_body - } else { - // TODO: better option? - std::this_thread::sleep_for(dt); - req->backoffTimeout *= 2; - } + // e.g. 
S3StatusErrorRequestTimeout or ErrorSlowDown + // Run backoff algo, https://aws.amazon.com/blogs/architecture/exponential-backoff-and-jitter/ + static thread_local std::minstd_rand rng(std::hash{}(std::this_thread::get_id())); + std::uniform_int_distribution dist(0l, std::min(S3Request::max_timeout.count(), req->backoffTimeout)); + const auto dt = std::chrono::milliseconds(dist(rng)); + std::cerr << "Got status " << S3_get_status_name(status) << " while running request " + << *(req->req) << ", will retry in " << dt.count() << "ms\n"; + if ( req->async ) { + req->submit_after = std::chrono::steady_clock::now() + dt; } else { - std::cerr << "Got status " << S3_get_status_name(status) << " while running request " << *(req->req) << ", retrying\n"; + // TODO: better option? + std::this_thread::sleep_for(dt); } req->put_offset = 0; req->retries_executed++; + req->backoffTimeout *= 2; if ( req->async ) { instance().requests_.push(req); } else { @@ -223,6 +235,7 @@ class S3LibWrapper { req->req->status = S3Request::Status::ok; break; default: + std::cerr << "Got status " << S3_get_status_name(status) << " at end request " << *(req->req) << "\n"; req->req->status = S3Request::Status::error; } req->done(); diff --git a/S3Common.h b/S3Common.h index 5944454..55ec098 100644 --- a/S3Common.h +++ b/S3Common.h @@ -18,10 +18,11 @@ class S3Request { public: enum class Type {undef, get, put}; enum class Status {waiting, ok, error}; + static constexpr std::chrono::milliseconds default_timeout{5000}; static constexpr std::chrono::milliseconds max_timeout{60000}; S3Request() = delete; - S3Request(Type iType, const std::string& iKey, std::chrono::milliseconds iTimeout=std::chrono::milliseconds(1000), int iRetries=5): + S3Request(Type iType, const std::string& iKey, std::chrono::milliseconds iTimeout=default_timeout, int iRetries=5): type{iType}, key{iKey}, timeout{iTimeout}, retries{iRetries} {}; const Type type; From b584f9a92943c6e1690852943df0d420fd6155c0 Mon Sep 17 00:00:00 2001 
From: Nick Smith Date: Wed, 19 Apr 2023 11:09:32 -0500 Subject: [PATCH 34/43] Switch to HTTPS --- S3Common.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/S3Common.cc b/S3Common.cc index cc0ae49..9584f11 100644 --- a/S3Common.cc +++ b/S3Common.cc @@ -381,7 +381,7 @@ S3Connection::S3Connection( ctx_.reset(new S3BucketContext{ .hostName = hostName_.c_str(), .bucketName = bucketName_.c_str(), - .protocol = S3ProtocolHTTP, + .protocol = S3ProtocolHTTPS, .uriStyle = S3UriStylePath, .accessKeyId = accessKeyId_.c_str(), .secretAccessKey = secretAccessKey_.c_str(), From 0cee12f36e597147d2ff64cca6a8165e2ca47bb6 Mon Sep 17 00:00:00 2001 From: Nick Smith Date: Mon, 1 May 2023 06:50:14 -0500 Subject: [PATCH 35/43] Implement S3Source product prefetching also const-ify some stuff --- S3Source.cc | 91 ++++++++++++++++++++++++++++++++--------------------- S3Source.h | 27 +++++++++++++--- 2 files changed, 77 insertions(+), 41 deletions(-) diff --git a/S3Source.cc b/S3Source.cc index b6ae633..b69bd14 100644 --- a/S3Source.cc +++ b/S3Source.cc @@ -21,7 +21,7 @@ size_t zstd_perthread_decompress(void* dst, size_t dstCapacity, const void* src, return ZSTD_decompressDCtx(holder.ctx, dst, dstCapacity, src, compressedSize); } -void zstd_decompress(std::string& blob, std::string& out, size_t dSize) { +void zstd_decompress(const std::string& blob, std::string& out, size_t dSize) { out.resize(dSize); size_t status = ZSTD_decompress(out.data(), out.size(), blob.data(), blob.size()); // size_t status = zstd_perthread_decompress(out.data(), out.size(), blob.data(), blob.size()); @@ -31,12 +31,10 @@ void zstd_decompress(std::string& blob, std::string& out, size_t dSize) { if (status < dSize) { std::cerr <<"ERROR in decompression, expected " << dSize << " bytes but only got " << status << std::endl; } - blob.clear(); - blob.shrink_to_fit(); } // /cvmfs/cms.cern.ch/slc7_amd64_gcc10/external/xz/5.2.5-d6fed2038c4e8d6e04531d1adba59f37 -void lzma_decompress(std::string& blob, 
std::string& out, size_t dSize) { +void lzma_decompress(const std::string& blob, std::string& out, size_t dSize) { lzma_stream strm = LZMA_STREAM_INIT; lzma_ret ret = lzma_stream_decoder(&strm, UINT64_MAX, 0); if (ret != LZMA_OK) { throw std::runtime_error("Could not initialize LZMA encoder"); } @@ -57,14 +55,12 @@ void lzma_decompress(std::string& blob, std::string& out, size_t dSize) { if ( strm.avail_out > 0 ) { std::cerr <<"ERROR in decompression, expected " << dSize << " bytes but only got " << dSize - strm.avail_out << std::endl; } - blob.clear(); - blob.shrink_to_fit(); } -void decompress_stripe(const objstripe::Compression& setting, std::string& blob, std::string& out, size_t dSize) { +void decompress_stripe(const objstripe::Compression& setting, const std::string& blob, std::string& out, size_t dSize) { switch ( setting.type() ) { case objstripe::CompressionType::kNone: - std::swap(blob, out); + out = blob; break; case objstripe::CompressionType::kZSTD: ::zstd_decompress(blob, out, dSize); @@ -96,7 +92,7 @@ void DelayedProductStripeRetriever::fetch(TaskHolder&& callback) const { offsets_.push_back(nbytes); } assert(offsets_.size() == data_.counts_size() + 1); - ::decompress_stripe(data_.compression(), *data_.mutable_content(), content_, nbytes); + ::decompress_stripe(data_.compression(), data_.content(), content_, nbytes); assert(nbytes == content_.size()); data_.clear_content(); decompressTime_ = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start); @@ -123,6 +119,41 @@ std::string_view DelayedProductStripeRetriever::bufferAt(size_t globalEventIndex return {&content_[bstart], bstop - bstart}; } +ProductStripeGenerator::ProductStripeGenerator(const S3ConnectionRef& conn, const std::string& prefix, unsigned int flushSize, size_t globalIndexStart, size_t globalIndexEnd) : + conn_(conn), prefix_(prefix), flushSize_(flushSize), globalIndexStart_(globalIndexStart), globalIndexEnd_(globalIndexEnd) +{ + auto indexThis = globalIndexStart - 
(globalIndexStart % flushSize_); + auto indexNext = indexThis + flushSize_; + currentStripe_ = std::make_shared(conn_, prefix_ + "/" + std::to_string(indexThis), indexThis); + nextStripe_ = std::make_shared(conn_, prefix_ + "/" + std::to_string(indexNext), indexNext); + prefetch_group_ = std::make_unique(); +} + +std::shared_ptr +ProductStripeGenerator::stripeFor(size_t globalEventIndex) { + assert(globalEventIndex >= globalIndexStart_ and globalEventIndex < globalIndexEnd_); + if ( globalEventIndex == nextStripe_->globalOffset() ) { + auto indexNext = globalEventIndex + flushSize_; + auto new_ps = std::make_shared(conn_, prefix_ + "/" + std::to_string(indexNext), indexNext); + // record decompress time of old stripe + decompressTime_ += currentStripe_->decompressTime(); + // shuffle new_ps -> nextStripe_ -> currentStripe_ + std::swap(nextStripe_, currentStripe_); + std::swap(new_ps, nextStripe_); + } + if ( + currentStripe_->wasFetched() + and ~nextStripe_->wasFetched() + and (globalEventIndex % flushSize_ >= flushSize_ / 2) + and (globalEventIndex + flushSize_ < globalIndexEnd_) + ) + { + // somewhere in the middle of current stripe, prefetch next + nextStripe_->fetch(TaskHolder(*prefetch_group_, make_functor_task([](){}))); + } + return currentStripe_; +} + S3DelayedRetriever::S3DelayedRetriever(objstripe::ObjectStripeIndex const& index, DeserializeStrategy strategy): deserializers_{std::move(strategy)} { @@ -167,7 +198,7 @@ void S3DelayedRetriever::getAsync(DataProductRetriever& product, int index, Task stripes_[index]->fetch(std::move(fetchCallback)); } -S3Source::S3Source(unsigned int iNLanes, std::string iObjPrefix, int iVerbose, unsigned long long iNEvents, S3ConnectionRef conn): +S3Source::S3Source(unsigned int iNLanes, std::string iObjPrefix, int iVerbose, unsigned long long iNEvents, const S3ConnectionRef& conn): SharedSourceBase(iNEvents), objPrefix_(std::move(iObjPrefix)), verbose_(iVerbose), @@ -200,7 +231,10 @@ S3Source::S3Source(unsigned int 
iNLanes, std::string iObjPrefix, int iVerbose, u << index_.totalevents() << " vs. " << iNEvents << ". Will read all available events instead.\n"; } - currentProductStripes_.resize(index_.products_size()); + productRetrievers_.reserve(index_.products_size()); + for(const auto& productInfo : index_.products()) { + productRetrievers_.emplace_back(conn_, objPrefix_ + "/" + productInfo.productname(), productInfo.flushsize(), 0ul, index_.totalevents()); + } laneRetrievers_.reserve(iNLanes); for(unsigned int i = 0; i< iNLanes; ++i) { @@ -242,16 +276,15 @@ void S3Source::readEventAsync(unsigned int iLane, long iEventIndex, OptionalTask // default-constructed currentEventStripe_ will have size zero, so 0, 0 will load first stripe if(nextEventInStripe_ == currentEventStripe_.events_size()) { // Need to read ahead - // TODO: perhaps not the best idea to clobber index_? At least for now we don't need it again - auto* stripeData = index_.mutable_packedeventstripes(nextEventStripe_); + const auto& stripeData = index_.packedeventstripes(nextEventStripe_); if ( index_.has_eventstripecompression() ) { auto dsize = index_.eventstripesizes(nextEventStripe_); std::string decompressedStripe; decompressedStripe.resize(dsize); - ::decompress_stripe(index_.eventstripecompression(), *stripeData, decompressedStripe, dsize); + ::decompress_stripe(index_.eventstripecompression(), stripeData, decompressedStripe, dsize); currentEventStripe_.ParseFromString(decompressedStripe); } else { - currentEventStripe_.ParseFromString(*stripeData); + currentEventStripe_.ParseFromString(stripeData); } nextEventStripe_++; nextEventInStripe_ = 0; @@ -261,26 +294,8 @@ void S3Source::readEventAsync(unsigned int iLane, long iEventIndex, OptionalTask auto& retriever = laneRetrievers_[iLane]; size_t globalEventIndex = event.offset(); - auto productinfo = std::begin(index_.products()); - size_t i{0}; - for (auto& ps : currentProductStripes_) { - const auto& productinfo = index_.products(i); - if ( 
nextEventInStripe_ % productinfo.flushsize() == 0 ) { - auto new_ps = std::make_shared( - conn_, - objPrefix_ + "/" + productinfo.productname() + "/" + std::to_string(globalEventIndex), - globalEventIndex - ); - if ( verbose_ >= 2 ) { - std::cout << "setting lane " << iLane << "to read stripe " << - objPrefix_ + "/" + productinfo.productname() + "/" + std::to_string(globalEventIndex) << "\n"; - } - std::swap(ps, new_ps); - // record decompress time of old stripe - if ( new_ps ) decompressTime_ += new_ps->decompressTime(); - } - retriever.setStripe(i, ps); - i++; + for (size_t i=0; i < productRetrievers_.size(); ++i) { + retriever.setStripe(i, productRetrievers_[i].stripeFor(globalEventIndex)); } retriever.setEvent(globalEventIndex, {event.run(), event.lumi(), event.event()}); @@ -304,7 +319,11 @@ std::chrono::microseconds S3Source::serialReadTime() const { } std::chrono::microseconds S3Source::decompressTime() const { - return decompressTime_; + auto time = std::chrono::microseconds::zero(); + for(auto const& p : productRetrievers_) { + time += p.decompressTime(); + } + return time; } std::chrono::microseconds S3Source::deserializeTime() const { diff --git a/S3Source.h b/S3Source.h index e7a04fe..2441af1 100644 --- a/S3Source.h +++ b/S3Source.h @@ -20,15 +20,17 @@ namespace cce::tf { class DelayedProductStripeRetriever { public: - DelayedProductStripeRetriever(S3ConnectionRef conn, std::string name, size_t globalOffset): + DelayedProductStripeRetriever(const S3ConnectionRef& conn, std::string name, size_t globalOffset): conn_(conn), name_(name), globalOffset_(globalOffset), state_{State::unretrieved} {}; void fetch(TaskHolder&& callback) const; std::string_view bufferAt(size_t globalEventIndex) const; ~DelayedProductStripeRetriever() {}; + size_t globalOffset() const { return globalOffset_; }; + bool wasFetched() const { return state_ != State::unretrieved; }; std::chrono::microseconds decompressTime() const { return decompressTime_; } private: - S3ConnectionRef 
conn_; + const S3ConnectionRef conn_; std::string name_; size_t globalOffset_; @@ -41,6 +43,22 @@ class DelayedProductStripeRetriever { mutable std::chrono::microseconds decompressTime_{0}; }; +class ProductStripeGenerator { + public: + ProductStripeGenerator(const S3ConnectionRef& conn, const std::string& prefix, unsigned int flushSize, size_t globalIndexStart, size_t globalIndexEnd); + std::shared_ptr stripeFor(size_t globalEventIndex); + std::chrono::microseconds decompressTime() const { return decompressTime_; }; + + private: + const S3ConnectionRef conn_; + const std::string prefix_; + const unsigned int flushSize_; + size_t globalIndexStart_, globalIndexEnd_; + std::shared_ptr currentStripe_; + std::shared_ptr nextStripe_; + std::chrono::microseconds decompressTime_{0}; + std::unique_ptr prefetch_group_; +}; class S3DelayedRetriever : public DelayedProductRetriever { public: @@ -75,7 +93,7 @@ class S3DelayedRetriever : public DelayedProductRetriever { class S3Source : public SharedSourceBase { public: - S3Source(unsigned int iNLanes, std::string iObjPrefix, int iVerbose, unsigned long long iNEvents, S3ConnectionRef conn); + S3Source(unsigned int iNLanes, std::string iObjPrefix, int iVerbose, unsigned long long iNEvents, const S3ConnectionRef& conn); S3Source(S3Source&&) = delete; S3Source(S3Source const&) = delete; ~S3Source() = default; @@ -105,10 +123,9 @@ class S3Source : public SharedSourceBase { size_t nextEventStripe_ = 0; size_t nextEventInStripe_ = 0; objstripe::EventStripe currentEventStripe_; - std::vector> currentProductStripes_; + std::vector productRetrievers_; std::vector laneRetrievers_; std::chrono::microseconds readTime_; - std::chrono::microseconds decompressTime_{0}; }; } From 1d78b7e439947bd5f940a1fb4ec0e449e0e806ac Mon Sep 17 00:00:00 2001 From: Nick Smith Date: Mon, 1 May 2023 06:52:58 -0500 Subject: [PATCH 36/43] Request timing log (to remove0 --- S3Common.cc | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git 
a/S3Common.cc b/S3Common.cc index 9584f11..08c02c9 100644 --- a/S3Common.cc +++ b/S3Common.cc @@ -205,6 +205,7 @@ class S3LibWrapper { static void responseCompleteCallback(S3Status status, const S3ErrorDetails *error, void *callbackData) { auto req = static_cast(callbackData); + auto now = std::chrono::steady_clock::now(); if ( S3_status_is_retryable(status) && req->retries_executed < req->req->retries ) { // e.g. S3StatusErrorRequestTimeout or ErrorSlowDown // Run backoff algo, https://aws.amazon.com/blogs/architecture/exponential-backoff-and-jitter/ @@ -214,7 +215,7 @@ class S3LibWrapper { std::cerr << "Got status " << S3_get_status_name(status) << " while running request " << *(req->req) << ", will retry in " << dt.count() << "ms\n"; if ( req->async ) { - req->submit_after = std::chrono::steady_clock::now() + dt; + req->submit_after = now + dt; } else { // TODO: better option? std::this_thread::sleep_for(dt); @@ -233,6 +234,11 @@ class S3LibWrapper { switch ( status ) { case S3StatusOK: req->req->status = S3Request::Status::ok; + std::cerr << ((req->req->type == S3Request::Type::get) ? 
"get: " : "put: ") + + std::to_string(std::chrono::duration_cast(now - req->submit_after).count()) + + " " + std::to_string(req->req->buffer.size()) + + " " + std::to_string(std::chrono::duration_cast(std::chrono::system_clock::now().time_since_epoch()).count()) + + "\n"; break; default: std::cerr << "Got status " << S3_get_status_name(status) << " at end request " << *(req->req) << "\n"; @@ -280,7 +286,7 @@ class S3LibWrapper { private: S3Status initStatus_; - int asyncRequestLimit_{512}; // no more than FD_SETSIZE (1024) + int asyncRequestLimit_{64}; // no more than FD_SETSIZE (1024) int asyncAddRequestLimit_{64}; std::thread loop_; std::atomic running_; From 88298754a82a4e68708384f3fdef6ec2b1bb1a63 Mon Sep 17 00:00:00 2001 From: Nick Smith Date: Tue, 2 May 2023 03:58:01 -0500 Subject: [PATCH 37/43] Implement "fire and forget" event stripe flushing --- S3Common.cc | 49 ++++++++++++++----------------------------------- S3Common.h | 2 +- S3Outputer.cc | 17 ++++++++++++----- S3Outputer.h | 2 ++ S3Source.cc | 4 ++-- 5 files changed, 31 insertions(+), 43 deletions(-) diff --git a/S3Common.cc b/S3Common.cc index 08c02c9..e11eaa2 100644 --- a/S3Common.cc +++ b/S3Common.cc @@ -20,28 +20,21 @@ using namespace cce::tf; class S3RequestWrapper { public: - S3RequestWrapper(std::shared_ptr iReq, const S3BucketContext* iCtx, TaskHolder&& iCallback, bool iAsync): - req{std::move(iReq)}, bucketCtx{iCtx}, callback{std::move(iCallback)}, async{iAsync} + S3RequestWrapper(std::shared_ptr iReq, const S3BucketContext* iCtx, tbb::task_handle&& iCallback): + req{std::move(iReq)}, bucketCtx{iCtx}, callback{std::move(iCallback)} { - if ( async ) { - arena = std::make_unique(tbb::task_arena::attach{}); - } + arena = std::make_unique(tbb::task_arena::attach{}); backoffTimeout = req->timeout.count(); submit_after = std::chrono::steady_clock::now(); }; void done() { - if ( async ) { - arena->enqueue([callback=std::move(callback)]() { }); - } else { - callback.doneWaiting(); - } + 
arena->enqueue(std::move(callback)); }; std::shared_ptr req; const S3BucketContext* bucketCtx; - TaskHolder callback; - const bool async; + tbb::task_handle callback; std::unique_ptr arena; size_t put_offset{0}; int retries_executed{0}; @@ -62,11 +55,7 @@ class S3LibWrapper { bool running() const { return running_; } void submit(S3RequestWrapper* req) { - if ( req->async ) { - requests_.push(req); - } else { - _submit(req, nullptr); - } + requests_.push(req); } private: @@ -158,8 +147,7 @@ class S3LibWrapper { } void _submit(S3RequestWrapper* req, S3RequestContext* ctx) const { - // this function will block if ctx is null - assert(req->async xor ctx == nullptr); + assert(ctx != nullptr); switch ( req->req->type ) { case S3Request::Type::undef: assert(false); // logic error @@ -214,21 +202,11 @@ class S3LibWrapper { const auto dt = std::chrono::milliseconds(dist(rng)); std::cerr << "Got status " << S3_get_status_name(status) << " while running request " << *(req->req) << ", will retry in " << dt.count() << "ms\n"; - if ( req->async ) { - req->submit_after = now + dt; - } else { - // TODO: better option? - std::this_thread::sleep_for(dt); - } + req->submit_after = now + dt; req->put_offset = 0; req->retries_executed++; req->backoffTimeout *= 2; - if ( req->async ) { - instance().requests_.push(req); - } else { - // can libs3 callbacks recurse? probably... - instance()._submit(req, nullptr); - } + instance().requests_.push(req); return; // no delete! } switch ( status ) { @@ -237,7 +215,7 @@ class S3LibWrapper { std::cerr << ((req->req->type == S3Request::Type::get) ? 
"get: " : "put: ") + std::to_string(std::chrono::duration_cast(now - req->submit_after).count()) + " " + std::to_string(req->req->buffer.size()) - + " " + std::to_string(std::chrono::duration_cast(std::chrono::system_clock::now().time_since_epoch()).count()) + + " " + std::to_string(std::chrono::system_clock::now().time_since_epoch() / std::chrono::milliseconds(1)) + "\n"; break; default: @@ -286,7 +264,7 @@ class S3LibWrapper { private: S3Status initStatus_; - int asyncRequestLimit_{64}; // no more than FD_SETSIZE (1024) + int asyncRequestLimit_{32}; // no more than FD_SETSIZE (1024) int asyncAddRequestLimit_{64}; std::thread loop_; std::atomic running_; @@ -396,11 +374,12 @@ S3Connection::S3Connection( }); }; -void S3Connection::submit(std::shared_ptr req, TaskHolder&& callback, bool async) const { +void S3Connection::submit(std::shared_ptr req, TaskHolder&& callback) const { auto start = std::chrono::high_resolution_clock::now(); if ( ctx_ ) { + auto task_handle = callback.group()->defer([cb=std::move(callback)](){}); // start of S3RequestWrapper lifecycle (ends in S3LibWrapper::responseCompleteCallback) - auto wrapper = new S3RequestWrapper(std::move(req), ctx_.get(), std::move(callback), async); + auto wrapper = new S3RequestWrapper(std::move(req), ctx_.get(), std::move(task_handle)); S3LibWrapper::instance().submit(wrapper); } else { if ( req->type == S3Request::Type::put ) { diff --git a/S3Common.h b/S3Common.h index 55ec098..febf701 100644 --- a/S3Common.h +++ b/S3Common.h @@ -47,7 +47,7 @@ class S3Connection { std::string_view iSecurityToken ); - void submit(std::shared_ptr req, TaskHolder&& callback, bool async) const; + void submit(std::shared_ptr req, TaskHolder&& callback) const; std::chrono::microseconds blockingTime() const { return std::chrono::microseconds(blockingTime_.load()); } private: diff --git a/S3Outputer.cc b/S3Outputer.cc index dfa20cb..f659341 100644 --- a/S3Outputer.cc +++ b/S3Outputer.cc @@ -269,13 +269,19 @@ void 
S3Outputer::collateProducts( TaskHolder productsDoneCallback( // make lambda and call, since move assignment is disabled - [this, cb=std::move(iCallback)]() mutable { + // (copy callback so it lasts duration of this scope) + [this, cb=iCallback]() mutable { if ( currentEventStripe_.events_size() == eventFlushSize_ ) { objstripe::EventStripe stripeOut; stripeOut.mutable_events()->Reserve(eventFlushSize_); std::swap(currentEventStripe_, stripeOut); - return TaskHolder(*cb.group(), make_functor_task( - [this, stripeOut=std::move(stripeOut), callback=std::move(cb)]() mutable { + std::cerr << "flush " + std::to_string(numFireAndForgetCollates_) + + " " + std::to_string(std::chrono::system_clock::now().time_since_epoch() / std::chrono::milliseconds(1)) + + "\n"; + auto nextCallback = ( numFireAndForgetCollates_ < maxFireAndForgetCollates_ ) ? + numFireAndForgetCollates_++, TaskHolder(*cb.group(), make_functor_task([this]() mutable {numFireAndForgetCollates_--;})) : std::move(cb); + return TaskHolder(*nextCallback.group(), make_functor_task( + [this, stripeOut=std::move(stripeOut), callback=std::move(nextCallback)]() mutable { if(verbose_ >= 2) { std::cout << "reached event flush size "s + std::to_string(eventFlushSize_) + ", flushing\n"; } flushQueue_.push(*callback.group(), [this, stripeOut=std::move(stripeOut), callback=std::move(callback)]() { flushEventStripe(stripeOut, std::move(callback)); @@ -353,7 +359,7 @@ void S3Outputer::appendProductBuffer( std::cerr << "failed to write product buffer " << *req << std::endl; } })); - conn_->submit(std::move(req), std::move(putDoneTask), true); + conn_->submit(std::move(req), std::move(putDoneTask)); buf.stripe_.clear_counts(); buf.stripe_.clear_content(); @@ -390,9 +396,10 @@ void S3Outputer::flushEventStripe(const objstripe::EventStripe& stripe, TaskHold auto putDoneTask = TaskHolder(*iCallback.group(), make_functor_task([req, callback=std::move(iCallback)]() { if ( req->status != S3Request::Status::ok ) { std::cerr << 
"failed to write product buffer index" << std::endl; + // TODO: if several failures, maybe exit? } })); - conn_->submit(std::move(req), std::move(putDoneTask), true); + conn_->submit(std::move(req), std::move(putDoneTask)); flushTime_ += std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start); } diff --git a/S3Outputer.h b/S3Outputer.h index 7da03bf..046869b 100644 --- a/S3Outputer.h +++ b/S3Outputer.h @@ -113,6 +113,8 @@ class S3Outputer : public OutputerBase { mutable size_t eventGlobalOffset_{0}; mutable objstripe::EventStripe currentEventStripe_{}; mutable std::chrono::microseconds collateTime_; + constexpr static unsigned int maxFireAndForgetCollates_{4}; + mutable std::atomic numFireAndForgetCollates_{0}; // only modified in appendProductBuffer() mutable std::vector buffers_; diff --git a/S3Source.cc b/S3Source.cc index b69bd14..4c8639e 100644 --- a/S3Source.cc +++ b/S3Source.cc @@ -100,7 +100,7 @@ void DelayedProductStripeRetriever::fetch(TaskHolder&& callback) const { waiters_.doneWaiting(); } else { throw std::runtime_error("Could not retrieve ProductStripe for key " + name_); } })); - conn_->submit(std::move(req), std::move(getDoneTask), true); + conn_->submit(std::move(req), std::move(getDoneTask)); } else if (this_state == State::retrieved ) { return; } else { @@ -218,7 +218,7 @@ S3Source::S3Source(unsigned int iNLanes, std::string iObjPrefix, int iVerbose, u } else { throw std::runtime_error("Could not retrieve index in S3Source construction"); } })); - conn_->submit(std::move(req), std::move(getDoneTask), false); + conn_->submit(std::move(req), std::move(getDoneTask)); group.wait(); } From 009a51e2ba1b7ae05a23d7844341772c87f4fd78 Mon Sep 17 00:00:00 2001 From: Nick Smith Date: Wed, 3 May 2023 16:15:10 -0500 Subject: [PATCH 38/43] Implement product groups --- S3Outputer.cc | 124 +++++++++++++++++++++++++++------------------ S3Outputer.h | 8 ++- objectstripe.proto | 10 ++++ 3 files changed, 92 insertions(+), 50 deletions(-) diff 
--git a/S3Outputer.cc b/S3Outputer.cc index f659341..f98052b 100644 --- a/S3Outputer.cc +++ b/S3Outputer.cc @@ -214,16 +214,11 @@ void S3Outputer::printSummary() const { tbb::task_group group; { TaskHolder finalTask(group, make_functor_task([&group, task=group.defer([](){})]() mutable { group.run(std::move(task)); })); - TaskHolder productsDone(group, make_functor_task( - [this, stripeOut=std::move(currentEventStripe_), callback=std::move(finalTask)]() mutable { - flushQueue_.push(*callback.group(), [this, stripeOut=std::move(stripeOut), callback=std::move(callback)]() { - flushEventStripe(stripeOut, std::move(callback), true); - }); - } - )); + SmallBufferMapPtr smallbuffers = std::make_shared(); + auto productsDoneCallback = makeProductsDoneCallback(finalTask, smallbuffers, true); for(auto& buf : buffers_) { - buf.appendQueue_.push(group, [this, &buf, cb=productsDone]() mutable { - appendProductBuffer(buf, {}, std::move(cb), true); + buf.appendQueue_.push(group, [this, &buf, cb=productsDoneCallback, smallbuffers]() mutable { + appendProductBuffer(buf, {}, std::move(cb), true, smallbuffers); }); } } @@ -258,7 +253,6 @@ void S3Outputer::collateProducts( TaskHolder iCallback ) const { - using namespace std::string_literals; auto start = std::chrono::high_resolution_clock::now(); auto sev = currentEventStripe_.add_events(); sev->set_offset(eventGlobalOffset_++); @@ -267,48 +261,76 @@ void S3Outputer::collateProducts( sev->set_event(iEventID.event); if (verbose_ >= 2) { std::cout << sev->DebugString(); } - TaskHolder productsDoneCallback( - // make lambda and call, since move assignment is disabled - // (copy callback so it lasts duration of this scope) - [this, cb=iCallback]() mutable { - if ( currentEventStripe_.events_size() == eventFlushSize_ ) { - objstripe::EventStripe stripeOut; - stripeOut.mutable_events()->Reserve(eventFlushSize_); - std::swap(currentEventStripe_, stripeOut); - std::cerr << "flush " + std::to_string(numFireAndForgetCollates_) - + " " + 
std::to_string(std::chrono::system_clock::now().time_since_epoch() / std::chrono::milliseconds(1)) - + "\n"; - auto nextCallback = ( numFireAndForgetCollates_ < maxFireAndForgetCollates_ ) ? - numFireAndForgetCollates_++, TaskHolder(*cb.group(), make_functor_task([this]() mutable {numFireAndForgetCollates_--;})) : std::move(cb); - return TaskHolder(*nextCallback.group(), make_functor_task( - [this, stripeOut=std::move(stripeOut), callback=std::move(nextCallback)]() mutable { - if(verbose_ >= 2) { std::cout << "reached event flush size "s + std::to_string(eventFlushSize_) + ", flushing\n"; } - flushQueue_.push(*callback.group(), [this, stripeOut=std::move(stripeOut), callback=std::move(callback)]() { - flushEventStripe(stripeOut, std::move(callback)); - }); - } - )); - } - return cb; - }() - ); + SmallBufferMapPtr smallbuffers = std::make_shared(); + // pass a copy of iCallback + auto productsDoneCallback = makeProductsDoneCallback(iCallback, smallbuffers, false); auto buf = std::begin(buffers_); for (const auto& s : iSerializers) { const std::string_view blob(s.blob().data(), s.blob().size()); - buf->appendQueue_.push(*productsDoneCallback.group(), [this, buf, blob, cb=productsDoneCallback]() mutable { - appendProductBuffer(*buf, blob, std::move(cb)); + buf->appendQueue_.push(*productsDoneCallback.group(), [this, buf, blob, cb=productsDoneCallback, smallbuffers]() mutable { + appendProductBuffer(*buf, blob, std::move(cb), false, smallbuffers); }); buf++; } collateTime_ += std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start); } +TaskHolder S3Outputer::makeProductsDoneCallback(TaskHolder iCallback, SmallBufferMapPtr smallbuffers, bool last) const { + using namespace std::string_literals; + if ( (currentEventStripe_.events_size() == eventFlushSize_) || last ) { + objstripe::EventStripe stripeOut; + stripeOut.mutable_events()->Reserve(eventFlushSize_); + std::swap(currentEventStripe_, stripeOut); + std::cerr << "flush " + 
std::to_string(numFireAndForgetCollates_) + " " + std::to_string(std::chrono::system_clock::now().time_since_epoch() / std::chrono::milliseconds(1)) + "\n"; + auto nextCallback = ( (numFireAndForgetCollates_ < maxFireAndForgetCollates_) || last ) ? + numFireAndForgetCollates_++, TaskHolder(*iCallback.group(), make_functor_task([this]() mutable {numFireAndForgetCollates_--;})) : std::move(iCallback); + return TaskHolder(*nextCallback.group(), make_functor_task( + [this, stripeOut=std::move(stripeOut), callback=std::move(nextCallback), smallbuffers]() mutable { + if(verbose_ >= 2) { std::cout << "reached event flush size "s + std::to_string(eventFlushSize_) + ", flushing\n"; } + // merge buffers by greedy algorithm + std::sort(smallbuffers->begin(), smallbuffers->end(), [](const auto &a, const auto &b){ return a.second.content().size() > b.second.content().size(); }); + size_t iGroup{0}; + auto it = smallbuffers->begin(); + objstripe::ProductGroupStripe gout; + while ( it != smallbuffers->end() ) { + size_t nbytes{0}; + auto* group = stripeOut.add_groups(); + while ( (nbytes < productBufferFlushMinBytes_) and (it != smallbuffers->end()) ) { + nbytes += it->second.content().size(); + group->mutable_names()->Add(std::move(it->first)); + gout.mutable_products()->Add(std::move(it->second)); + it++; + } + auto req = std::make_shared(S3Request::Type::put, objPrefix_ + "/group" + std::to_string(iGroup) + "/" + std::to_string(stripeOut.events(0).offset())); + gout.SerializeToString(&req->buffer); + auto putDoneTask = TaskHolder(*callback.group(), make_functor_task([req, callback]() { + if ( req->status != S3Request::Status::ok ) { + std::cerr << "failed to write product buffer " << *req << std::endl; + } + })); + conn_->submit(std::move(req), std::move(putDoneTask)); + iGroup++; + gout.clear_products(); + } + flushQueue_.push(*callback.group(), [this, stripeOut=std::move(stripeOut), callback=std::move(callback)]() { + flushEventStripe(stripeOut, std::move(callback));
+ }); + } + )); + } + return iCallback; +} + + void S3Outputer::appendProductBuffer( ProductOutputBuffer& buf, const std::string_view blob, TaskHolder iCallback, - bool last + bool last, + SmallBufferMapPtr smallbuffers ) const { using namespace std::string_literals; @@ -330,8 +352,8 @@ void S3Outputer::appendProductBuffer( // first flush when we exceed min size and have an even divisor of eventFlushSize_ // subsequent flush when we reach productFlushSize - // always flush when we reach eventFlushSize_ (for buffers that never get big enough) // flush if last call and we have something to write + // for buffers that never get big enough, flush when we reach eventFlushSize_ but to a merge queue if ( ( (buf.info_->flushsize() == 0) @@ -339,8 +361,8 @@ void S3Outputer::appendProductBuffer( && (eventFlushSize_ % bufferNevents == 0) ) || (bufferNevents == buf.info_->flushsize()) - || (bufferNevents == eventFlushSize_) || (last && bufferNevents > 0) + || (bufferNevents == eventFlushSize_) ) { buf.compressor_.flush(*buf.stripe_.mutable_content()); @@ -352,14 +374,20 @@ void S3Outputer::appendProductBuffer( } std::string name = buf.prefix_ + "/" + std::to_string(buf.stripe_.globaloffset()); - auto req = std::make_shared(S3Request::Type::put, name); - buf.stripe_.SerializeToString(&req->buffer); - auto putDoneTask = TaskHolder(*iCallback.group(), make_functor_task([req, callback=std::move(iCallback)]() { - if ( req->status != S3Request::Status::ok ) { - std::cerr << "failed to write product buffer " << *req << std::endl; - } - })); - conn_->submit(std::move(req), std::move(putDoneTask)); + if ( (bufferNevents == eventFlushSize_) && (bufferNbytes <= buf.info_->flushminbytes()) ) { + // too small buffer, put it on a merge queue + smallbuffers->push_back({name, buf.stripe_}); // TODO: swap + // leave iCallback alive til end of function + } else { + auto req = std::make_shared(S3Request::Type::put, name); + buf.stripe_.SerializeToString(&req->buffer); + auto putDoneTask = 
TaskHolder(*iCallback.group(), make_functor_task([req, callback=std::move(iCallback)]() { + if ( req->status != S3Request::Status::ok ) { + std::cerr << "failed to write product buffer " << *req << std::endl; + } + })); + conn_->submit(std::move(req), std::move(putDoneTask)); + } buf.stripe_.clear_counts(); buf.stripe_.clear_content(); diff --git a/S3Outputer.h b/S3Outputer.h index 046869b..98f5f5e 100644 --- a/S3Outputer.h +++ b/S3Outputer.h @@ -11,6 +11,7 @@ #define TBB_PREVIEW_TASK_GROUP_EXTENSIONS 1 // for task_group::defer #include "tbb/task_group.h" +#include "tbb/concurrent_vector.h" #include "OutputerBase.h" #include "EventIdentifier.h" @@ -85,6 +86,8 @@ class S3Outputer : public OutputerBase { SerialTaskQueue appendQueue_{}; std::chrono::microseconds appendTime_{0}; }; + // product buffer name, bytes + typedef std::shared_ptr>> SmallBufferMapPtr; // Plan: // productReadyAsync() is threadsafe because serializers_ is one per lane @@ -94,7 +97,8 @@ class S3Outputer : public OutputerBase { // then collate() calls appendProductBuffer() with the above TaskHolder as callback (or original callback) // printSummary() takes care of the tails by setting last=true in the calls void collateProducts(EventIdentifier const& iEventID, SerializeStrategy const& iSerializers, TaskHolder iCallback) const; - void appendProductBuffer(ProductOutputBuffer& buf, const std::string_view blob, TaskHolder iCallback, bool last=false) const; + void appendProductBuffer(ProductOutputBuffer& buf, const std::string_view blob, TaskHolder iCallback, bool last, SmallBufferMapPtr smallbuffers) const; + TaskHolder makeProductsDoneCallback(TaskHolder iCallback, SmallBufferMapPtr smallbuffers, bool last) const; void flushEventStripe(const objstripe::EventStripe& stripe, TaskHolder iCallback, bool last=false) const; // configuration options @@ -113,7 +117,7 @@ class S3Outputer : public OutputerBase { mutable size_t eventGlobalOffset_{0}; mutable objstripe::EventStripe currentEventStripe_{}; mutable 
std::chrono::microseconds collateTime_; - constexpr static unsigned int maxFireAndForgetCollates_{4}; + constexpr static unsigned int maxFireAndForgetCollates_{0}; mutable std::atomic numFireAndForgetCollates_{0}; // only modified in appendProductBuffer() diff --git a/objectstripe.proto b/objectstripe.proto index dfdb76f..8b3861e 100644 --- a/objectstripe.proto +++ b/objectstripe.proto @@ -46,6 +46,12 @@ message EventStripe { repeated Event events = 1; // TODO: store product flushSize here? + + message ProductGroup { + repeated string names = 1; + } + + repeated ProductGroup groups = 2; } message ProductStripe { @@ -54,3 +60,7 @@ message ProductStripe { optional uint64 globalOffset = 3; optional Compression compression = 4; } + +message ProductGroupStripe { + repeated ProductStripe products = 2; +} From 75f4be00de271f51cdb6df5f4e54d2fa7e5b9bc5 Mon Sep 17 00:00:00 2001 From: Nick Smith Date: Thu, 4 May 2023 18:02:07 -0500 Subject: [PATCH 39/43] Working product group IO --- S3Outputer.cc | 19 ++++++---- S3Outputer.h | 10 ++--- S3Source.cc | 101 ++++++++++++++++++++++++++++++++++++++------------ S3Source.h | 70 ++++++++++++++++++++++++++-------- 4 files changed, 150 insertions(+), 50 deletions(-) diff --git a/S3Outputer.cc b/S3Outputer.cc index f98052b..e7cd121 100644 --- a/S3Outputer.cc +++ b/S3Outputer.cc @@ -211,10 +211,11 @@ void S3Outputer::outputAsync(unsigned int iLaneIndex, EventIdentifier const& iEv void S3Outputer::printSummary() const { { + // drain all buffers tbb::task_group group; { TaskHolder finalTask(group, make_functor_task([&group, task=group.defer([](){})]() mutable { group.run(std::move(task)); })); - SmallBufferMapPtr smallbuffers = std::make_shared(); + SmallBuffers smallbuffers = std::make_shared(); auto productsDoneCallback = makeProductsDoneCallback(finalTask, smallbuffers, true); for(auto& buf : buffers_) { buf.appendQueue_.push(group, [this, &buf, cb=productsDoneCallback, smallbuffers]() mutable { @@ -261,7 +262,7 @@ void 
S3Outputer::collateProducts( sev->set_event(iEventID.event); if (verbose_ >= 2) { std::cout << sev->DebugString(); } - SmallBufferMapPtr smallbuffers = std::make_shared(); + SmallBuffers smallbuffers = std::make_shared(); // pass a copy of iCallback auto productsDoneCallback = makeProductsDoneCallback(iCallback, smallbuffers, false); @@ -276,16 +277,16 @@ void S3Outputer::collateProducts( collateTime_ += std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start); } -TaskHolder S3Outputer::makeProductsDoneCallback(TaskHolder iCallback, SmallBufferMapPtr smallbuffers, bool last) const { +TaskHolder S3Outputer::makeProductsDoneCallback(TaskHolder iCallback, SmallBuffers smallbuffers, bool last) const { using namespace std::string_literals; - if ( (currentEventStripe_.events_size() == eventFlushSize_) || last ) { + if ( (last and currentEventStripe_.events_size() > 0) or (currentEventStripe_.events_size() == eventFlushSize_) ) { objstripe::EventStripe stripeOut; stripeOut.mutable_events()->Reserve(eventFlushSize_); std::swap(currentEventStripe_, stripeOut); std::cerr << "flush " + std::to_string(numFireAndForgetCollates_) + " " + std::to_string(std::chrono::system_clock::now().time_since_epoch() / std::chrono::milliseconds(1)) + "\n"; - auto nextCallback = ( (numFireAndForgetCollates_ < maxFireAndForgetCollates_) || last ) ? + auto nextCallback = ( (numFireAndForgetCollates_ < maxFireAndForgetCollates_) and not last ) ?
numFireAndForgetCollates_++, TaskHolder(*iCallback.group(), make_functor_task([this]() mutable {numFireAndForgetCollates_--;})) : std::move(iCallback); return TaskHolder(*nextCallback.group(), make_functor_task( [this, stripeOut=std::move(stripeOut), callback=std::move(nextCallback), smallbuffers]() mutable { @@ -330,7 +331,7 @@ void S3Outputer::appendProductBuffer( const std::string_view blob, TaskHolder iCallback, bool last, - SmallBufferMapPtr smallbuffers + SmallBuffers smallbuffers ) const { using namespace std::string_literals; @@ -376,7 +377,11 @@ void S3Outputer::appendProductBuffer( std::string name = buf.prefix_ + "/" + std::to_string(buf.stripe_.globaloffset()); if ( (bufferNevents == eventFlushSize_) && (bufferNbytes <= buf.info_->flushminbytes()) ) { // too small buffer, put it on a merge queue - smallbuffers->push_back({name, buf.stripe_}); // TODO: swap + objstripe::ProductStripe out; + out.mutable_content()->reserve(bufferNbytes); + out.mutable_compression()->CopyFrom(buf.stripe_.compression()); + std::swap(out, buf.stripe_); + smallbuffers->push_back({name, std::move(out)}); // leave iCallback alive til end of function } else { auto req = std::make_shared(S3Request::Type::put, name); diff --git a/S3Outputer.h b/S3Outputer.h index 98f5f5e..7ce9a12 100644 --- a/S3Outputer.h +++ b/S3Outputer.h @@ -61,7 +61,7 @@ class S3Outputer : public OutputerBase { index_.set_serializestrategy(objstripe::SerializeStrategy::kRoot); defaultCompression_.set_type(cType); defaultCompression_.set_level(cLevel); - index_.set_allocated_eventstripecompression(new objstripe::Compression(defaultCompression_)); + index_.mutable_eventstripecompression()->CopyFrom(defaultCompression_); eventStripeCompressor_ = StreamCompressor(index_.eventstripecompression()); } @@ -76,7 +76,7 @@ class S3Outputer : public OutputerBase { ProductOutputBuffer(const std::string& prefix, objstripe::ProductInfo* info, const objstripe::Compression& comp) : prefix_{prefix}, info_{info}, 
compressor_{comp} { stripe_.set_content(""); - stripe_.set_allocated_compression(new objstripe::Compression(compressor_.getCompression())); + stripe_.mutable_compression()->CopyFrom(compressor_.getCompression()); }; const std::string prefix_; @@ -87,7 +87,7 @@ class S3Outputer : public OutputerBase { std::chrono::microseconds appendTime_{0}; }; // product buffer name, bytes - typedef std::shared_ptr>> SmallBufferMapPtr; + typedef std::shared_ptr>> SmallBuffers; // Plan: // productReadyAsync() is threadsafe because serializers_ is one per lane @@ -97,8 +97,8 @@ class S3Outputer : public OutputerBase { // then collate() calls appendProductBuffer() with the above TaskHolder as callback (or original callback) // printSummary() takes care of the tails by setting last=true in the calls void collateProducts(EventIdentifier const& iEventID, SerializeStrategy const& iSerializers, TaskHolder iCallback) const; - void appendProductBuffer(ProductOutputBuffer& buf, const std::string_view blob, TaskHolder iCallback, bool last, SmallBufferMapPtr smallbuffers) const; - TaskHolder makeProductsDoneCallback(TaskHolder iCallback, SmallBufferMapPtr smallbuffers, bool last) const; + void appendProductBuffer(ProductOutputBuffer& buf, const std::string_view blob, TaskHolder iCallback, bool last, SmallBuffers smallbuffers) const; + TaskHolder makeProductsDoneCallback(TaskHolder iCallback, SmallBuffers smallbuffers, bool last) const; void flushEventStripe(const objstripe::EventStripe& stripe, TaskHolder iCallback, bool last=false) const; // configuration options diff --git a/S3Source.cc b/S3Source.cc index 4c8639e..c12f668 100644 --- a/S3Source.cc +++ b/S3Source.cc @@ -70,9 +70,25 @@ void decompress_stripe(const objstripe::Compression& setting, const std::string& break; } } + +void parse_productstripe(objstripe::ProductStripe& stripe, std::vector& offsets, std::string& content) { + offsets.reserve(stripe.counts_size() + 1); + size_t nbytes{0}; + offsets.push_back(nbytes); + for (const auto& 
c : stripe.counts()) { + nbytes += c; + offsets.push_back(nbytes); + } + assert(offsets.size() == stripe.counts_size() + 1); + decompress_stripe(stripe.compression(), stripe.content(), content, nbytes); + assert(nbytes == content.size()); + stripe.clear_content(); +} } -void DelayedProductStripeRetriever::fetch(TaskHolder&& callback) const { +WaitableFetch::~WaitableFetch() {} + +void WaitableFetch::fetch(TaskHolder&& callback) { auto this_state{State::unretrieved}; if ( state_.compare_exchange_strong(this_state, State::retrieving) ) { auto req = std::make_shared(S3Request::Type::get, name_); @@ -81,24 +97,11 @@ void DelayedProductStripeRetriever::fetch(TaskHolder&& callback) const { auto getDoneTask = TaskHolder(*group, make_functor_task([this, req]() { if ( req->status == S3Request::Status::ok ) { auto start = std::chrono::high_resolution_clock::now(); - if ( not data_.ParseFromString(req->buffer) ) { - throw std::runtime_error("Could not deserialize ProductStripe for key " + name_); - } - offsets_.reserve(data_.counts_size() + 1); - size_t nbytes{0}; - offsets_.push_back(nbytes); - for (const auto& c : data_.counts()) { - nbytes += c; - offsets_.push_back(nbytes); - } - assert(offsets_.size() == data_.counts_size() + 1); - ::decompress_stripe(data_.compression(), data_.content(), content_, nbytes); - assert(nbytes == content_.size()); - data_.clear_content(); - decompressTime_ = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start); + parse(req->buffer); + parseTime_ = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start); state_ = State::retrieved; waiters_.doneWaiting(); - } else { throw std::runtime_error("Could not retrieve ProductStripe for key " + name_); } + } else { throw std::runtime_error("Could not retrieve key " + name_); } })); conn_->submit(std::move(req), std::move(getDoneTask)); } else if (this_state == State::retrieved ) { @@ -108,17 +111,52 @@ void DelayedProductStripeRetriever::fetch(TaskHolder&& 
callback) const { } } -std::string_view DelayedProductStripeRetriever::bufferAt(size_t globalEventIndex) const { +std::string_view WaitableFetchProductStripe::bufferAt(size_t groupIdx, size_t iOffset) const { assert(state_ == State::retrieved); - assert(globalOffset_ == data_.globaloffset()); - assert(globalOffset_ <= globalEventIndex); - size_t iOffset = globalEventIndex - globalOffset_; - assert(iOffset < data_.counts_size()); + assert(groupIdx == 0); + assert(iOffset < offsets_.size() - 1); size_t bstart = offsets_[iOffset]; size_t bstop = offsets_[iOffset+1]; return {&content_[bstart], bstop - bstart}; } +void WaitableFetchProductStripe::parse(const std::string& buffer) { + if ( not data_.ParseFromString(buffer) ) { + throw std::runtime_error("Could not deserialize key " + name_); + } + ::parse_productstripe(data_, offsets_, content_); +} + +std::string_view WaitableFetchProductGroupStripe::bufferAt(size_t groupIdx, size_t iOffset) const { + assert(state_ == State::retrieved); + assert(groupIdx < offsets_.size()); + assert(iOffset < offsets_[groupIdx].size() - 1); + size_t bstart = offsets_[groupIdx][iOffset]; + size_t bstop = offsets_[groupIdx][iOffset+1]; + return {&content_[groupIdx][bstart], bstop - bstart}; +} + +void WaitableFetchProductGroupStripe::parse(const std::string& buffer) { + if ( not data_.ParseFromString(buffer) ) { + throw std::runtime_error("Could not deserialize key " + name_); + } + offsets_.resize(data_.products_size()); + content_.resize(data_.products_size()); + for(size_t i=0; i< data_.products_size(); ++i) { + ::parse_productstripe(*data_.mutable_products(i), offsets_[i], content_[i]); + } +} + +void DelayedProductStripeRetriever::fetch(TaskHolder&& callback) const { + fetcher_->fetch(std::move(callback)); +} + +std::string_view DelayedProductStripeRetriever::bufferAt(size_t globalEventIndex) const { + assert(globalOffset_ <= globalEventIndex); + size_t iOffset = globalEventIndex - globalOffset_; + return fetcher_->bufferAt(groupIdx_, 
iOffset); +} + ProductStripeGenerator::ProductStripeGenerator(const S3ConnectionRef& conn, const std::string& prefix, unsigned int flushSize, size_t globalIndexStart, size_t globalIndexEnd) : conn_(conn), prefix_(prefix), flushSize_(flushSize), globalIndexStart_(globalIndexStart), globalIndexEnd_(globalIndexEnd) { @@ -288,6 +326,18 @@ void S3Source::readEventAsync(unsigned int iLane, long iEventIndex, OptionalTask } nextEventStripe_++; nextEventInStripe_ = 0; + + productGroupMap_.clear(); + size_t eventStripeStart = currentEventStripe_.events(0).offset(); + for (size_t iGroup=0; iGroup < currentEventStripe_.groups_size(); ++iGroup) { + const auto& group = currentEventStripe_.groups(iGroup); + auto fetcher = std::make_shared( + conn_, objPrefix_ + "/group" + std::to_string(iGroup) + "/" + std::to_string(eventStripeStart) + ); + for (size_t groupIdx=0; groupIdx < group.names_size(); groupIdx++) { + productGroupMap_[group.names(groupIdx)] = std::make_shared(fetcher, groupIdx, eventStripeStart); + } + } } const auto event = currentEventStripe_.events(nextEventInStripe_); if ( verbose_ >= 1 ) std::cout << event.DebugString() << "\n"; @@ -295,7 +345,12 @@ void S3Source::readEventAsync(unsigned int iLane, long iEventIndex, OptionalTask size_t globalEventIndex = event.offset(); for (size_t i=0; i < productRetrievers_.size(); ++i) { - retriever.setStripe(i, productRetrievers_[i].stripeFor(globalEventIndex)); + auto itgroup = productGroupMap_.find(index_.products(i).productname()); + if ( itgroup != productGroupMap_.end() ) { + retriever.setStripe(i, itgroup->second); + } else { + retriever.setStripe(i, productRetrievers_[i].stripeFor(globalEventIndex)); + } } retriever.setEvent(globalEventIndex, {event.run(), event.lumi(), event.event()}); diff --git a/S3Source.h b/S3Source.h index 2441af1..8c1f9d2 100644 --- a/S3Source.h +++ b/S3Source.h @@ -18,29 +18,68 @@ namespace cce::tf { + +class WaitableFetch { + public: + WaitableFetch(const S3ConnectionRef& conn, const 
std::string& name): + conn_(conn), name_(name), state_{State::unretrieved} {}; + virtual ~WaitableFetch(); + void fetch(TaskHolder&& callback); + bool wasFetched() const { return state_ != State::unretrieved; }; + virtual std::string_view bufferAt(size_t groupIdx, size_t iOffset) const = 0; + std::chrono::microseconds parseTime() const { return parseTime_; }; + + protected: + const std::string name_; + enum class State {unretrieved, retrieving, retrieved}; + std::atomic state_; + + private: + virtual void parse(const std::string& buffer) = 0; + WaitingTaskList waiters_{}; + const S3ConnectionRef conn_; + std::chrono::microseconds parseTime_{0}; +}; + +class WaitableFetchProductStripe : public WaitableFetch { + public: + using WaitableFetch::WaitableFetch; + std::string_view bufferAt(size_t groupIdx, size_t iOffset) const override; + private: + void parse(const std::string& buffer) override; + objstripe::ProductStripe data_; + std::vector offsets_{}; + std::string content_{}; +}; + +class WaitableFetchProductGroupStripe : public WaitableFetch { + public: + using WaitableFetch::WaitableFetch; + std::string_view bufferAt(size_t groupIdx, size_t iOffset) const override; + private: + void parse(const std::string& buffer) override; + objstripe::ProductGroupStripe data_; + std::vector> offsets_{}; + std::vector content_{}; +}; + class DelayedProductStripeRetriever { public: - DelayedProductStripeRetriever(const S3ConnectionRef& conn, std::string name, size_t globalOffset): - conn_(conn), name_(name), globalOffset_(globalOffset), state_{State::unretrieved} {}; + // Note: for ProductStripes not in a ProductGroupStripe, groupIdx is ignored + DelayedProductStripeRetriever(const std::shared_ptr& fetcher, size_t groupIdx, size_t globalOffset): + fetcher_(fetcher), groupIdx_(groupIdx), globalOffset_(globalOffset) {}; + DelayedProductStripeRetriever(const S3ConnectionRef& conn, const std::string& name, size_t globalOffset): + fetcher_(std::make_shared(conn, name)), groupIdx_(0), 
globalOffset_(globalOffset) {}; void fetch(TaskHolder&& callback) const; std::string_view bufferAt(size_t globalEventIndex) const; - ~DelayedProductStripeRetriever() {}; size_t globalOffset() const { return globalOffset_; }; - bool wasFetched() const { return state_ != State::unretrieved; }; - std::chrono::microseconds decompressTime() const { return decompressTime_; } + bool wasFetched() const { return fetcher_->wasFetched(); }; + std::chrono::microseconds decompressTime() const { return fetcher_->parseTime(); } private: - const S3ConnectionRef conn_; - std::string name_; + size_t groupIdx_; size_t globalOffset_; - - enum class State {unretrieved, retrieving, retrieved}; - mutable std::atomic state_; - mutable WaitingTaskList waiters_{}; - mutable objstripe::ProductStripe data_{}; - mutable std::vector offsets_{}; - mutable std::string content_{}; - mutable std::chrono::microseconds decompressTime_{0}; + std::shared_ptr fetcher_; }; class ProductStripeGenerator { @@ -123,6 +162,7 @@ class S3Source : public SharedSourceBase { size_t nextEventStripe_ = 0; size_t nextEventInStripe_ = 0; objstripe::EventStripe currentEventStripe_; + std::map> productGroupMap_; std::vector productRetrievers_; std::vector laneRetrievers_; std::chrono::microseconds readTime_; From 7eff47ffd197047421ca1ed56fb9095cd5b299a3 Mon Sep 17 00:00:00 2001 From: Nick Smith Date: Thu, 4 May 2023 18:24:12 -0500 Subject: [PATCH 40/43] Properly wait on fire-and-forget --- S3Outputer.cc | 13 ++++++++----- S3Outputer.h | 5 ++++- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/S3Outputer.cc b/S3Outputer.cc index e7cd121..26a5207 100644 --- a/S3Outputer.cc +++ b/S3Outputer.cc @@ -212,18 +212,17 @@ void S3Outputer::outputAsync(unsigned int iLaneIndex, EventIdentifier const& iEv void S3Outputer::printSummary() const { { // drain all buffers - tbb::task_group group; { - TaskHolder finalTask(group, make_functor_task([&group, task=group.defer([](){})]() mutable { group.run(std::move(task)); })); 
+ TaskHolder finalTask(*tails_group_, make_functor_task([this, task=tails_group_->defer([](){})]() mutable { tails_group_->run(std::move(task)); })); SmallBuffers smallbuffers = std::make_shared(); auto productsDoneCallback = makeProductsDoneCallback(finalTask, smallbuffers, true); for(auto& buf : buffers_) { - buf.appendQueue_.push(group, [this, &buf, cb=productsDoneCallback, smallbuffers]() mutable { + buf.appendQueue_.push(*tails_group_, [this, &buf, cb=productsDoneCallback, smallbuffers]() mutable { appendProductBuffer(buf, {}, std::move(cb), true, smallbuffers); }); } } - group.wait(); + tails_group_->wait(); } if(verbose_ >= 2) { @@ -287,7 +286,11 @@ TaskHolder S3Outputer::makeProductsDoneCallback(TaskHolder iCallback, SmallBuffe + " " + std::to_string(std::chrono::system_clock::now().time_since_epoch() / std::chrono::milliseconds(1)) + "\n"; auto nextCallback = ( (numFireAndForgetCollates_ < maxFireAndForgetCollates_) and ~last ) ? - numFireAndForgetCollates_++, TaskHolder(*iCallback.group(), make_functor_task([this]() mutable {numFireAndForgetCollates_--;})) : std::move(iCallback); + numFireAndForgetCollates_++, + TaskHolder( + *tails_group_, + make_functor_task([this, task=tails_group_->defer([](){})]() mutable {numFireAndForgetCollates_--; tails_group_->run(std::move(task));}) + ) : std::move(iCallback); return TaskHolder(*nextCallback.group(), make_functor_task( [this, stripeOut=std::move(stripeOut), callback=std::move(nextCallback), smallbuffers]() mutable { if(verbose_ >= 2) { std::cout << "reached event flush size "s + std::to_string(eventFlushSize_) + ", flushing\n"; } diff --git a/S3Outputer.h b/S3Outputer.h index 7ce9a12..54b5c73 100644 --- a/S3Outputer.h +++ b/S3Outputer.h @@ -63,6 +63,8 @@ class S3Outputer : public OutputerBase { defaultCompression_.set_level(cLevel); index_.mutable_eventstripecompression()->CopyFrom(defaultCompression_); eventStripeCompressor_ = StreamCompressor(index_.eventstripecompression()); + + tails_group_ = 
std::make_unique(); } void setupForLane(unsigned int iLaneIndex, std::vector const& iDPs) final; @@ -117,8 +119,9 @@ class S3Outputer : public OutputerBase { mutable size_t eventGlobalOffset_{0}; mutable objstripe::EventStripe currentEventStripe_{}; mutable std::chrono::microseconds collateTime_; - constexpr static unsigned int maxFireAndForgetCollates_{0}; + constexpr static unsigned int maxFireAndForgetCollates_{4}; mutable std::atomic numFireAndForgetCollates_{0}; + std::unique_ptr tails_group_; // for fire-and-forget and last flush // only modified in appendProductBuffer() mutable std::vector buffers_; From 3904af94087be8db3da30a795cd0700088cce3a7 Mon Sep 17 00:00:00 2001 From: Nick Smith Date: Fri, 22 Sep 2023 13:42:22 -0500 Subject: [PATCH 41/43] Link libcurl and leave stubs for eventual SSL implementation --- CMakeLists.txt | 3 ++- S3Common.cc | 5 +++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d5061e9..b423600 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -198,6 +198,7 @@ if(ENABLE_S3) if(NOT DEFINED LIBS3_DIR) message(FATAL_ERROR "You must provide LIBS3_DIR variable") endif() + find_package(CURL REQUIRED) find_package(Protobuf REQUIRED) include_directories(${Protobuf_INCLUDE_DIRS}) include_directories(${CMAKE_CURRENT_BINARY_DIR}) @@ -211,6 +212,6 @@ if(ENABLE_S3) ) target_include_directories(threaded_io_test PRIVATE ${LIBS3_DIR}/include) target_link_directories(threaded_io_test PRIVATE ${LIBS3_DIR}/lib) - target_link_libraries(threaded_io_test PRIVATE s3) + target_link_libraries(threaded_io_test PRIVATE s3 curl) # add_test(NAME S3OutputerEmptyTest COMMAND threaded_io_test EmptySource 1 1 0 10 S3Outputer) endif() diff --git a/S3Common.cc b/S3Common.cc index e11eaa2..7ae5dc2 100644 --- a/S3Common.cc +++ b/S3Common.cc @@ -10,6 +10,7 @@ #include #include "libs3.h" +#include #include "tbb/task_arena.h" #include "tbb/concurrent_queue.h" #include "S3Common.h" @@ -81,6 +82,10 @@ class S3LibWrapper { 
int max_fd, activeRequests{0}; int topfds{0}, topreq{0}; S3_create_request_context(&ctx); + // For now we do not enable peer verification because CURL is loading the CA bundle per connection https://github.com/curl/curl/pull/9620 + // S3_set_request_context_verify_peer(ctx, 1); + // auto status = curl_easy_setopt(curl, CURLOPT_CAPATH, "/etc/grid-security/certificates"); + // if ( status != CURLE_OK ) throw std::runtime_error("curle fail"); std::vector to_defer; while(running_) { FD_ZERO(&read_fds); From 5369e3d151d48da25896f6686e82a457a98ab6ab Mon Sep 17 00:00:00 2001 From: Nick Smith Date: Fri, 22 Sep 2023 13:43:25 -0500 Subject: [PATCH 42/43] Fix empty object bug by copying string for output buffer --- S3Outputer.cc | 41 +++++++++++++++++++++++++---------------- S3Outputer.h | 2 +- 2 files changed, 26 insertions(+), 17 deletions(-) diff --git a/S3Outputer.cc b/S3Outputer.cc index 26a5207..1a8cce2 100644 --- a/S3Outputer.cc +++ b/S3Outputer.cc @@ -267,9 +267,9 @@ void S3Outputer::collateProducts( auto buf = std::begin(buffers_); for (const auto& s : iSerializers) { - const std::string_view blob(s.blob().data(), s.blob().size()); - buf->appendQueue_.push(*productsDoneCallback.group(), [this, buf, blob, cb=productsDoneCallback, smallbuffers]() mutable { - appendProductBuffer(*buf, blob, std::move(cb), false, smallbuffers); + std::string blob(s.blob().data(), s.blob().size()); + buf->appendQueue_.push(*productsDoneCallback.group(), [this, buf, blob=std::move(blob), cb=productsDoneCallback, smallbuffers]() mutable { + appendProductBuffer(*buf, std::move(blob), std::move(cb), false, smallbuffers); }); buf++; } @@ -285,27 +285,30 @@ TaskHolder S3Outputer::makeProductsDoneCallback(TaskHolder iCallback, SmallBuffe std::cerr << "flush " + std::to_string(numFireAndForgetCollates_) + " " + std::to_string(std::chrono::system_clock::now().time_since_epoch() / std::chrono::milliseconds(1)) + "\n"; - auto nextCallback = ( (numFireAndForgetCollates_ < maxFireAndForgetCollates_) 
and ~last ) ? + assert(last xor (iCallback.group() != tails_group_.get())); + auto nextCallback = ( (numFireAndForgetCollates_ < maxFireAndForgetCollates_) ) ? numFireAndForgetCollates_++, TaskHolder( - *tails_group_, + *iCallback.group(), make_functor_task([this, task=tails_group_->defer([](){})]() mutable {numFireAndForgetCollates_--; tails_group_->run(std::move(task));}) ) : std::move(iCallback); return TaskHolder(*nextCallback.group(), make_functor_task( - [this, stripeOut=std::move(stripeOut), callback=std::move(nextCallback), smallbuffers]() mutable { - if(verbose_ >= 2) { std::cout << "reached event flush size "s + std::to_string(eventFlushSize_) + ", flushing\n"; } + [this, stripeOut=std::move(stripeOut), callback=std::move(nextCallback), smallbuffers=std::move(smallbuffers), last]() mutable { + if(verbose_ >= 2) { std::cout << "reached event flush size "s + std::to_string(stripeOut.events_size()) + ", flushing\n"; } // merge buffers by greedy algorithm std::sort(smallbuffers->begin(), smallbuffers->end(), [](const auto &a, const auto &b){ return a.second.content().size() > b.second.content().size(); }); size_t iGroup{0}; auto it = smallbuffers->begin(); objstripe::ProductGroupStripe gout; - do { + while ( it != smallbuffers->end() ) { size_t nbytes{0}; auto* group = stripeOut.add_groups(); while ( (nbytes < productBufferFlushMinBytes_) and (it != smallbuffers->end()) ) { nbytes += it->second.content().size(); - group->mutable_names()->Add(std::move(it->first)); - gout.mutable_products()->Add(std::move(it->second)); + auto* name = group->add_names(); + std::swap(*name, it->first); + auto* prod = gout.add_products(); + std::swap(*prod, it->second); it++; } auto req = std::make_shared(S3Request::Type::put, objPrefix_ + "/group" + std::to_string(iGroup) + "/" + std::to_string(stripeOut.events(0).offset())); @@ -318,9 +321,9 @@ TaskHolder S3Outputer::makeProductsDoneCallback(TaskHolder iCallback, SmallBuffe conn_->submit(std::move(req), std::move(putDoneTask)); 
iGroup++; gout.clear_products(); - } while ( it != smallbuffers->end() ); - flushQueue_.push(*callback.group(), [this, stripeOut=std::move(stripeOut), callback=std::move(callback)]() { - flushEventStripe(stripeOut, std::move(callback)); + } + flushQueue_.push(*callback.group(), [this, stripeOut=std::move(stripeOut), callback=std::move(callback), last]() { + flushEventStripe(stripeOut, std::move(callback), last); }); } )); @@ -331,7 +334,7 @@ TaskHolder S3Outputer::makeProductsDoneCallback(TaskHolder iCallback, SmallBuffe void S3Outputer::appendProductBuffer( ProductOutputBuffer& buf, - const std::string_view blob, + std::string&& blob, TaskHolder iCallback, bool last, SmallBuffers smallbuffers @@ -378,13 +381,19 @@ void S3Outputer::appendProductBuffer( } std::string name = buf.prefix_ + "/" + std::to_string(buf.stripe_.globaloffset()); - if ( (bufferNevents == eventFlushSize_) && (bufferNbytes <= buf.info_->flushminbytes()) ) { + if ( (last or (bufferNevents == eventFlushSize_)) && (bufferNbytes <= buf.info_->flushminbytes()) ) { // too small buffer, put it on a merge queue objstripe::ProductStripe out; out.mutable_content()->reserve(bufferNbytes); out.mutable_compression()->CopyFrom(buf.stripe_.compression()); + out.set_globaloffset(buf.stripe_.globaloffset()); + assert(out.content().size() == 0); + assert(buf.stripe_.content().size() == bufferNbytes); std::swap(out, buf.stripe_); - smallbuffers->push_back({name, std::move(out)}); + assert(out.content().size() == bufferNbytes); + assert(buf.stripe_.content().size() == 0); + auto it = smallbuffers->emplace_back(name, std::move(out)); + assert(it->second.content().size() == bufferNbytes); // leave iCallback alive til end of function } else { auto req = std::make_shared(S3Request::Type::put, name); diff --git a/S3Outputer.h b/S3Outputer.h index 54b5c73..427d4bb 100644 --- a/S3Outputer.h +++ b/S3Outputer.h @@ -99,7 +99,7 @@ class S3Outputer : public OutputerBase { // then collate() calls appendProductBuffer() with the 
above TaskHolder as callback (or original callback) // printSummary() takes care of the tails by setting last=true in the calls void collateProducts(EventIdentifier const& iEventID, SerializeStrategy const& iSerializers, TaskHolder iCallback) const; - void appendProductBuffer(ProductOutputBuffer& buf, const std::string_view blob, TaskHolder iCallback, bool last, SmallBuffers smallbuffers) const; + void appendProductBuffer(ProductOutputBuffer& buf, std::string&& blob, TaskHolder iCallback, bool last, SmallBuffers smallbuffers) const; TaskHolder makeProductsDoneCallback(TaskHolder iCallback, SmallBuffers smallbuffers, bool last) const; void flushEventStripe(const objstripe::EventStripe& stripe, TaskHolder iCallback, bool last=false) const; From 3ae81400321fb7b66721d770cdbb2a349f1ea748 Mon Sep 17 00:00:00 2001 From: Nick Smith Date: Fri, 22 Sep 2023 13:44:05 -0500 Subject: [PATCH 43/43] Stubs for prefetching objects on input (disabled) --- S3Source.cc | 47 ++++++++++++++++++++++++++++++++++++++--------- S3Source.h | 16 +++++++++------- 2 files changed, 47 insertions(+), 16 deletions(-) diff --git a/S3Source.cc b/S3Source.cc index c12f668..b45f946 100644 --- a/S3Source.cc +++ b/S3Source.cc @@ -101,7 +101,11 @@ void WaitableFetch::fetch(TaskHolder&& callback) { parseTime_ = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start); state_ = State::retrieved; waiters_.doneWaiting(); - } else { throw std::runtime_error("Could not retrieve key " + name_); } + } else { + // TODO: possible that prefetch is for a stripe that is in a group in the next event batch + // throw std::runtime_error("Could not retrieve key " + name_); + std::cerr << "Could not retrieve key " + name_ + "\n"; + } })); conn_->submit(std::move(req), std::move(getDoneTask)); } else if (this_state == State::retrieved ) { @@ -114,9 +118,14 @@ void WaitableFetch::fetch(TaskHolder&& callback) { std::string_view WaitableFetchProductStripe::bufferAt(size_t groupIdx, size_t iOffset) const { 
assert(state_ == State::retrieved); assert(groupIdx == 0); + if ( iOffset >= offsets_.size() - 1 ) { + std::cerr << name_ << " at " << groupIdx << ", " << iOffset << std::endl; + } assert(iOffset < offsets_.size() - 1); size_t bstart = offsets_[iOffset]; size_t bstop = offsets_[iOffset+1]; + assert(bstop > bstart); + assert(bstop <= content_.size()); return {&content_[bstart], bstop - bstart}; } @@ -133,6 +142,8 @@ std::string_view WaitableFetchProductGroupStripe::bufferAt(size_t groupIdx, size assert(iOffset < offsets_[groupIdx].size() - 1); size_t bstart = offsets_[groupIdx][iOffset]; size_t bstop = offsets_[groupIdx][iOffset+1]; + assert(bstop > bstart); + assert(bstop <= content_[groupIdx].size()); return {&content_[groupIdx][bstart], bstop - bstart}; } @@ -154,6 +165,7 @@ void DelayedProductStripeRetriever::fetch(TaskHolder&& callback) const { std::string_view DelayedProductStripeRetriever::bufferAt(size_t globalEventIndex) const { assert(globalOffset_ <= globalEventIndex); size_t iOffset = globalEventIndex - globalOffset_; + assert(iOffset < flushSize_); return fetcher_->bufferAt(groupIdx_, iOffset); } @@ -162,28 +174,33 @@ ProductStripeGenerator::ProductStripeGenerator(const S3ConnectionRef& conn, cons { auto indexThis = globalIndexStart - (globalIndexStart % flushSize_); auto indexNext = indexThis + flushSize_; - currentStripe_ = std::make_shared(conn_, prefix_ + "/" + std::to_string(indexThis), indexThis); - nextStripe_ = std::make_shared(conn_, prefix_ + "/" + std::to_string(indexNext), indexNext); + currentStripe_ = std::make_shared(conn_, prefix_ + "/" + std::to_string(indexThis), indexThis, flushSize_); + nextStripe_ = std::make_shared(conn_, prefix_ + "/" + std::to_string(indexNext), indexNext, flushSize_); prefetch_group_ = std::make_unique(); } std::shared_ptr ProductStripeGenerator::stripeFor(size_t globalEventIndex) { assert(globalEventIndex >= globalIndexStart_ and globalEventIndex < globalIndexEnd_); + assert(globalEventIndex >= 
currentStripe_->globalOffset()); + assert(globalEventIndex <= nextStripe_->globalOffset()); if ( globalEventIndex == nextStripe_->globalOffset() ) { auto indexNext = globalEventIndex + flushSize_; - auto new_ps = std::make_shared(conn_, prefix_ + "/" + std::to_string(indexNext), indexNext); + auto new_ps = std::make_shared(conn_, prefix_ + "/" + std::to_string(indexNext), indexNext, flushSize_); // record decompress time of old stripe decompressTime_ += currentStripe_->decompressTime(); // shuffle new_ps -> nextStripe_ -> currentStripe_ - std::swap(nextStripe_, currentStripe_); - std::swap(new_ps, nextStripe_); + currentStripe_ = nextStripe_; + nextStripe_ = new_ps; } + assert(globalEventIndex >= currentStripe_->globalOffset()); + assert(globalEventIndex < nextStripe_->globalOffset()); if ( currentStripe_->wasFetched() and ~nextStripe_->wasFetched() and (globalEventIndex % flushSize_ >= flushSize_ / 2) and (globalEventIndex + flushSize_ < globalIndexEnd_) + and false ) { // somewhere in the middle of current stripe, prefetch next @@ -229,6 +246,12 @@ void S3DelayedRetriever::getAsync(DataProductRetriever& product, int index, Task auto start = std::chrono::high_resolution_clock::now(); auto buf = stripes_[index]->bufferAt(globalEventIndex_); auto readSize = deserializers_[index].deserialize(buf.data(), buf.size(), *dataProducts_[index].address()); + if ( readSize != buf.size() ) { + throw std::runtime_error( + "Read fail for event " + std::to_string(globalEventIndex_) + " product " + std::to_string(index) + " (" + dataProducts_[index].name() + ")" + ); + } + assert(readSize == buf.size()); dataProducts_[index].setSize(readSize); deserializeTime_ += std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start); } @@ -329,27 +352,33 @@ void S3Source::readEventAsync(unsigned int iLane, long iEventIndex, OptionalTask productGroupMap_.clear(); size_t eventStripeStart = currentEventStripe_.events(0).offset(); + assert(eventStripeStart == 
(nextEventStripe_-1)* index_.eventstripesize()); for (size_t iGroup=0; iGroup < currentEventStripe_.groups_size(); ++iGroup) { const auto& group = currentEventStripe_.groups(iGroup); auto fetcher = std::make_shared( conn_, objPrefix_ + "/group" + std::to_string(iGroup) + "/" + std::to_string(eventStripeStart) ); + if ( verbose_ >= 2 ) std::cout << "creating fetcher for group " + objPrefix_ + "/group" + std::to_string(iGroup) + "/" + std::to_string(eventStripeStart) + "\n"; for (size_t groupIdx=0; groupIdx < group.names_size(); groupIdx++) { - productGroupMap_[group.names(groupIdx)] = std::make_shared(fetcher, groupIdx, eventStripeStart); + productGroupMap_[group.names(groupIdx)] = std::make_shared(fetcher, groupIdx, eventStripeStart, index_.eventstripesize()); } } } + size_t eventStripeStart = currentEventStripe_.events(0).offset(); const auto event = currentEventStripe_.events(nextEventInStripe_); if ( verbose_ >= 1 ) std::cout << event.DebugString() << "\n"; auto& retriever = laneRetrievers_[iLane]; size_t globalEventIndex = event.offset(); for (size_t i=0; i < productRetrievers_.size(); ++i) { - auto itgroup = productGroupMap_.find(index_.products(i).productname()); + auto itgroup = productGroupMap_.find(objPrefix_ + "/" + index_.products(i).productname() + "/" + std::to_string(eventStripeStart)); + auto pstripe = productRetrievers_[i].stripeFor(globalEventIndex); if ( itgroup != productGroupMap_.end() ) { + if ( verbose_ >= 2) std::cout << "using group for product " + index_.products(i).productname() + "/" + std::to_string(eventStripeStart) + "\n"; retriever.setStripe(i, itgroup->second); } else { - retriever.setStripe(i, productRetrievers_[i].stripeFor(globalEventIndex)); + if (verbose_ >= 2) std::cout << "using individual stripe for product " + index_.products(i).productname() + "\n"; + retriever.setStripe(i, std::move(pstripe)); } } diff --git a/S3Source.h b/S3Source.h index 8c1f9d2..84544f7 100644 --- a/S3Source.h +++ b/S3Source.h @@ -66,10 +66,10 @@ class 
WaitableFetchProductGroupStripe : public WaitableFetch { class DelayedProductStripeRetriever { public: // Note: for ProductStripes not in a ProductGroupStripe, groupIdx is ignored - DelayedProductStripeRetriever(const std::shared_ptr& fetcher, size_t groupIdx, size_t globalOffset): - fetcher_(fetcher), groupIdx_(groupIdx), globalOffset_(globalOffset) {}; - DelayedProductStripeRetriever(const S3ConnectionRef& conn, const std::string& name, size_t globalOffset): - fetcher_(std::make_shared(conn, name)), groupIdx_(0), globalOffset_(globalOffset) {}; + DelayedProductStripeRetriever(const std::shared_ptr& fetcher, size_t groupIdx, size_t globalOffset, size_t flushSize): + fetcher_(fetcher), groupIdx_(groupIdx), globalOffset_(globalOffset), flushSize_(flushSize) {}; + DelayedProductStripeRetriever(const S3ConnectionRef& conn, const std::string& name, size_t globalOffset, size_t flushSize): + fetcher_(std::make_shared(conn, name)), groupIdx_(0), globalOffset_(globalOffset), flushSize_(flushSize) {}; void fetch(TaskHolder&& callback) const; std::string_view bufferAt(size_t globalEventIndex) const; size_t globalOffset() const { return globalOffset_; }; @@ -77,8 +77,9 @@ class DelayedProductStripeRetriever { std::chrono::microseconds decompressTime() const { return fetcher_->parseTime(); } private: - size_t groupIdx_; - size_t globalOffset_; + const size_t groupIdx_; + const size_t globalOffset_; + const size_t flushSize_; std::shared_ptr fetcher_; }; @@ -92,7 +93,8 @@ class ProductStripeGenerator { const S3ConnectionRef conn_; const std::string prefix_; const unsigned int flushSize_; - size_t globalIndexStart_, globalIndexEnd_; + const size_t globalIndexStart_; + const size_t globalIndexEnd_; std::shared_ptr currentStripe_; std::shared_ptr nextStripe_; std::chrono::microseconds decompressTime_{0};