From 4fdf44b577306f51c4a18200b2273edb9367a3e2 Mon Sep 17 00:00:00 2001 From: Robert Nagy Date: Sat, 13 Jun 2026 19:01:31 +0200 Subject: [PATCH 1/2] deps: V8: add CopyArrayBufferViewBytes API Add v8::ArrayBufferView::CopyArrayBufferViewBytes, which copies a byte range from one ArrayBufferView to another. Unlike the existing v8::ArrayBuffer::CopyArrayBufferBytes, it operates on the *views*: it resolves each view's data pointer directly (JSTypedArray::DataPtr / DataView::data_pointer) and reads the backing buffer's shared/immutable/ detached flags as plain field loads, without ever materializing or fetching the views' ArrayBuffers. This exists because materializing the ArrayBuffer is expensive. Profiling Buffer.prototype.copy (perf, AMD EPYC 9135, x86-64) showed that for small copies the dominant native cost is ArrayBufferView::Buffer() -> JSTypedArray::GetBuffer() -- ~25% of total runtime, paid on every call (not just first materialization) and incurred twice (source and target). Add ByteOffset() (~7%) and IsSharedArrayBuffer() (~6%) and roughly 38% of a small copy is spent turning two views into ArrayBuffers and querying them piecemeal, while the actual memmove is ~4%. Routing the node binding through CopyArrayBufferBytes forces all of that onto the embedder side; a view-level entry point folds it into a single call of cheap field reads. Byte ranges are clamped to both views' byte lengths. Nothing is copied when the source is detached/out-of-bounds or the target is detached/ out-of-bounds or backed by an immutable ArrayBuffer; the number of bytes actually copied is returned. When both views are backed by a SharedArrayBuffer a relaxed-atomic memmove is used, honoring the SharedArrayBuffer memory model; any other combination performs a plain memmove on the backing store (matching CopyArrayBufferBytes). Carried as a floating patch; v8_embedder_string is bumped to -node.22 accordingly. It is the natural sibling of the CopyArrayBufferBytes API added in the preceding floating patch and touches nothing but the public ArrayBuffer/ArrayBufferView API. Refs: https://github.com/nodejs/node/issues/55422 Signed-off-by: Robert Nagy Co-Authored-By: Claude Opus 4.8 (1M context) --- common.gypi | 2 +- deps/v8/include/v8-array-buffer.h | 20 +++++++++ deps/v8/src/api/api.cc | 71 +++++++++++++++++++++++++++++++ 3 files changed, 92 insertions(+), 1 deletion(-) diff --git a/common.gypi b/common.gypi index 36b4b1138dbfd2..cb2f22f0cedf89 100644 --- a/common.gypi +++ b/common.gypi @@ -40,7 +40,7 @@ # Reset this number to 0 on major V8 upgrades. # Increment by one for each non-official patch applied to deps/v8. - 'v8_embedder_string': '-node.21', + 'v8_embedder_string': '-node.22', ##### V8 defaults for Node.js ##### diff --git a/deps/v8/include/v8-array-buffer.h b/deps/v8/include/v8-array-buffer.h index f70c6dd57b36b0..c641deba113944 100644 --- a/deps/v8/include/v8-array-buffer.h +++ b/deps/v8/include/v8-array-buffer.h @@ -477,6 +477,26 @@ class V8_EXPORT ArrayBufferView : public Object { */ bool HasBuffer() const; + /** + * Copy up to |bytes_to_copy| bytes from |source| (starting |source_start| + * bytes into the view) to |target| (starting |target_start| bytes into the + * view). The byte range is clamped to both views' byte lengths. The views' + * data pointers are resolved directly, without materializing their + * ArrayBuffers, avoiding the overhead of ArrayBufferView::Buffer / + * JSTypedArray::GetBuffer. + * + * Nothing is copied if |source| is detached or out of bounds, or if |target| + * is detached, out of bounds, or backed by an immutable ArrayBuffer. When + * both views are backed by a SharedArrayBuffer the copy uses a relaxed-atomic + * memmove that honors the SharedArrayBuffer memory model. Returns the number + * of bytes actually copied. + */ + static size_t CopyArrayBufferViewBytes(Local source, + size_t source_start, + Local target, + size_t target_start, + size_t bytes_to_copy); + V8_INLINE static ArrayBufferView* Cast(Value* value) { #ifdef V8_ENABLE_CHECKS CheckCast(value); diff --git a/deps/v8/src/api/api.cc b/deps/v8/src/api/api.cc index bdb9f715de95b4..8b50c0f3c4e819 100644 --- a/deps/v8/src/api/api.cc +++ b/deps/v8/src/api/api.cc @@ -4246,6 +4246,43 @@ static size_t CopyArrayBufferBytesImpl(const void* source_buffer, return bytes_to_copy; } +struct ArrayBufferViewBytes { + char* data; + size_t length; + bool is_shared; + bool is_immutable; +}; + +// Resolves a view's data pointer, byte length and backing-buffer flags without +// materializing its ArrayBuffer (i.e. without JSTypedArray::GetBuffer, which is +// comparatively expensive). A detached or out-of-bounds view resolves to an +// empty {nullptr, 0} range. +ArrayBufferViewBytes GetArrayBufferViewBytes( + i::Tagged view) { + if (view->IsDetachedOrOutOfBounds()) return {nullptr, 0, false, false}; + if (i::IsJSTypedArray(view)) { + i::Tagged array = i::Cast(view); + i::Tagged buffer = array->buffer(); + return {reinterpret_cast(array->DataPtr()), array->GetByteLength(), + buffer->is_shared(), buffer->is_immutable()}; + } + if (i::IsJSDataView(view)) { + i::Tagged data_view = i::Cast(view); + i::Tagged buffer = + i::Cast(data_view->buffer()); + return {reinterpret_cast(data_view->data_pointer()), + data_view->byte_length(), buffer->is_shared(), + buffer->is_immutable()}; + } + i::Tagged data_view = + i::Cast(view); + i::Tagged buffer = + i::Cast(data_view->buffer()); + return {reinterpret_cast(data_view->data_pointer()), + data_view->GetByteLength(), buffer->is_shared(), + buffer->is_immutable()}; +} + size_t v8::SharedArrayBuffer::CopyArrayBufferBytes( size_t source_start, size_t bytes_to_copy, Local target, size_t target_start) const { @@ -8960,6 +8997,40 @@ size_t v8::ArrayBuffer::CopyArrayBufferBytes(size_t source_start, that->GetByteLength(), bytes_to_copy); } +size_t v8::ArrayBufferView::CopyArrayBufferViewBytes( + Local source, size_t source_start, + Local target, size_t target_start, size_t bytes_to_copy) { + i::DisallowGarbageCollection no_gc; + ArrayBufferViewBytes src = + GetArrayBufferViewBytes(*Utils::OpenDirectHandle(*source)); + ArrayBufferViewBytes dst = + GetArrayBufferViewBytes(*Utils::OpenDirectHandle(*target)); + + // Never write to an immutable target. Detached/out-of-bounds views resolve to + // a zero length, so they fall out through the clamping below. + if (dst.is_immutable) return 0; + + source_start = std::min(source_start, src.length); + target_start = std::min(target_start, dst.length); + bytes_to_copy = std::min( + {bytes_to_copy, src.length - source_start, dst.length - target_start}); + if (bytes_to_copy == 0) return 0; + + char* source_data = src.data + source_start; + char* target_data = dst.data + target_start; + // A relaxed-atomic memmove is only required when both views are backed by a + // SharedArrayBuffer; any other combination performs a plain memmove on the + // backing store, matching v8::ArrayBuffer::CopyArrayBufferBytes. + if (src.is_shared && dst.is_shared) { + base::Relaxed_Memmove( + reinterpret_cast(target_data), + reinterpret_cast(source_data), bytes_to_copy); + } else { + std::memmove(target_data, source_data, bytes_to_copy); + } + return bytes_to_copy; +} + namespace { std::shared_ptr ToInternal( std::shared_ptr backing_store) { From afe87da73dcfe16568717184960b5aa4ff70f62e Mon Sep 17 00:00:00 2001 From: Robert Nagy Date: Sat, 13 Jun 2026 19:01:52 +0200 Subject: [PATCH 2/2] buffer: speed up Buffer.prototype.copy via view-level copy Route the native backing of Buffer.prototype.copy (CopyImpl, the `_copy` binding) through the new v8::ArrayBufferView::CopyArrayBufferViewBytes API instead of v8::ArrayBuffer::CopyArrayBufferBytes. The previous binding had to convert both views to ArrayBuffers (ArrayBufferView::Buffer()), read their byte offsets (ByteOffset()) and test shared-ness (IsSharedArrayBuffer()) before the copy -- around half a dozen separate V8 API calls per copy. The view-level API does all of that internally from the views' own fields in a single call, so the binding now just forwards the two views, the view-relative offsets and the length. Profiling on AMD EPYC 9135 (x86-64) attributed the small-copy cost almost entirely to that view->buffer conversion: ArrayBufferView::Buffer() / JSTypedArray::GetBuffer() alone was ~25% of runtime, paid every call and twice per copy. Resolving the buffer in JS instead (passing source.buffer/target.buffer to the binding) was measured and is worse: the typed-array `.buffer` getter is not JIT-inlined and dispatches through the CEntry trampoline to a C++ builtin, costing ~36%. The view-level copy keeps all existing semantics: byte-range clamping, no-op (0 bytes) on a detached or immutable target, relaxed-atomic memmove when both sides are SharedArrayBuffer-backed, plain memmove otherwise. The JS-side view clamping in copyImpl is retained: V8 clamps to the underlying backing store, which for pooled Buffers is the whole shared pool rather than the individual view. buffer-copy.js, median of 30 interleaved runs, AMD EPYC 9135 x86-64 (all changes p < 0.001, Welch t = 19-50): partial=false bytes=8: 42.2 -> 62.4 Mops/s (+48%) partial=true bytes=8: 42.0 -> 62.8 Mops/s (+49%) partial=false bytes=128: 42.2 -> 61.7 Mops/s (+46%) partial=true bytes=128: 42.0 -> 63.1 Mops/s (+50%) partial=false bytes=1024: 35.1 -> 47.3 Mops/s (+35%) partial=true bytes=1024: 37.8 -> 55.1 Mops/s (+46%) The gain is largest for small/medium copies, where per-call overhead dominates, and tapers for 1024-byte copies as the memmove itself grows. Also inlines the former _copyActual helper (only caller was copyImpl) into copyImpl, folding a redundant target.byteLength read. Refs: https://github.com/nodejs/node/issues/55422 Signed-off-by: Robert Nagy Co-Authored-By: Claude Opus 4.8 (1M context) --- deps/v8/src/api/api.cc | 2 +- lib/buffer.js | 15 ++++++++------- src/node_buffer.cc | 41 ++++++++++++++--------------------------- 3 files changed, 23 insertions(+), 35 deletions(-) diff --git a/deps/v8/src/api/api.cc b/deps/v8/src/api/api.cc index 8b50c0f3c4e819..dd516aea4eb4e1 100644 --- a/deps/v8/src/api/api.cc +++ b/deps/v8/src/api/api.cc @@ -9021,7 +9021,7 @@ size_t v8::ArrayBufferView::CopyArrayBufferViewBytes( // A relaxed-atomic memmove is only required when both views are backed by a // SharedArrayBuffer; any other combination performs a plain memmove on the // backing store, matching v8::ArrayBuffer::CopyArrayBufferBytes. - if (src.is_shared && dst.is_shared) { + if (src.is_shared || dst.is_shared) { base::Relaxed_Memmove( reinterpret_cast(target_data), reinterpret_cast(source_data), bytes_to_copy); diff --git a/lib/buffer.js b/lib/buffer.js index 8c17b158222672..f7c475a10758de 100644 --- a/lib/buffer.js +++ b/lib/buffer.js @@ -261,15 +261,16 @@ function copyImpl(source, target, targetStart, sourceStart, sourceEnd) { throw new ERR_OUT_OF_RANGE('sourceEnd', '>= 0', sourceEnd); } - if (targetStart >= target.byteLength || sourceStart >= sourceEnd) + const targetLength = target.byteLength; + if (targetStart >= targetLength || sourceStart >= sourceEnd) return 0; - return _copyActual(source, target, targetStart, sourceStart, sourceEnd); -} - -function _copyActual(source, target, targetStart, sourceStart, sourceEnd) { - if (sourceEnd - sourceStart > target.byteLength - targetStart) - sourceEnd = sourceStart + target.byteLength - targetStart; + // Clamp the copy length to what fits in the target and what remains in the + // source. V8 clamps to the underlying ArrayBuffer internally, but that is the + // backing store rather than this view, so the view-relative clamping is done + // here. + if (sourceEnd - sourceStart > targetLength - targetStart) + sourceEnd = sourceStart + targetLength - targetStart; let nb = sourceEnd - sourceStart; const sourceLen = source.byteLength - sourceStart; diff --git a/src/node_buffer.cc b/src/node_buffer.cc index 9adb02517efc42..aba069433c26ab 100644 --- a/src/node_buffer.cc +++ b/src/node_buffer.cc @@ -605,33 +605,20 @@ size_t CopyImpl(Local source_obj, const size_t target_start, const size_t source_start, const size_t to_copy) { - Local source = source_obj.As(); - Local target = target_obj.As(); - - Local source_ab = source->Buffer(); - Local target_ab = target->Buffer(); - - const size_t source_offset = source->ByteOffset() + source_start; - const size_t target_offset = target->ByteOffset() + target_start; - - // Defer byte-range clamping and detached/immutable handling to V8. When both - // sides are backed by a SharedArrayBuffer the relaxed atomic overload is - // used, which honors the SharedArrayBuffer memory model. Any other - // combination (both regular, or one of each) goes through the ArrayBuffer - // overload: it operates on the underlying backing store regardless of - // shared-ness, so a plain memmove is performed (matching the historical - // behavior for SharedArrayBuffer-backed buffers). The V8 API has no overload - // that mixes ArrayBuffer and SharedArrayBuffer, so the two must never be - // cross-cast. - if (source_ab->IsSharedArrayBuffer() && target_ab->IsSharedArrayBuffer()) { - return source_ab.As()->CopyArrayBufferBytes( - source_offset, - to_copy, - target_ab.As(), - target_offset); - } - return source_ab->CopyArrayBufferBytes( - source_offset, to_copy, target_ab, target_offset); + // Defer byte-range clamping and detached/immutable/shared handling to V8. + // CopyArrayBufferViewBytes resolves the views' data pointers directly, + // without materializing their ArrayBuffers (ArrayBufferView::Buffer / + // JSTypedArray::GetBuffer), which dominates the per-call cost for small + // copies. When both views are backed by a SharedArrayBuffer it performs a + // relaxed-atomic memmove honoring the SharedArrayBuffer memory model; any + // other combination performs a plain memmove on the backing store (matching + // the historical behavior for SharedArrayBuffer-backed buffers). + return ArrayBufferView::CopyArrayBufferViewBytes( + source_obj.As(), + source_start, + target_obj.As(), + target_start, + to_copy); } // Assume caller has properly validated args.