From 4fdf44b577306f51c4a18200b2273edb9367a3e2 Mon Sep 17 00:00:00 2001
From: Robert Nagy <ronagy@icloud.com>
Date: Sat, 13 Jun 2026 19:01:31 +0200
Subject: [PATCH 1/2] deps: V8: add CopyArrayBufferViewBytes API

Add v8::ArrayBufferView::CopyArrayBufferViewBytes, which copies a byte
range from one ArrayBufferView to another. Unlike the existing
v8::ArrayBuffer::CopyArrayBufferBytes, it operates on the *views*: it
resolves each view's data pointer directly (JSTypedArray::DataPtr /
DataView::data_pointer) and reads the backing buffer's shared/immutable/
detached flags as plain field loads, without ever materializing or
fetching the views' ArrayBuffers.

This exists because materializing the ArrayBuffer is expensive. Profiling
Buffer.prototype.copy (perf, AMD EPYC 9135, x86-64) showed that for small
copies the dominant native cost is ArrayBufferView::Buffer() ->
JSTypedArray::GetBuffer() -- ~25% of total runtime, paid on every call
(not just first materialization) and incurred twice (source and target).
Together with ByteOffset() (~7%) and IsSharedArrayBuffer() (~6%), roughly
38% of a small copy is spent turning two views into ArrayBuffers and
querying them piecemeal, while the actual memmove is ~4%. Routing the
node binding through CopyArrayBufferBytes forces all of that onto the
embedder side; a view-level entry point folds it into a single call of
cheap field reads.

Byte ranges are clamped to both views' byte lengths. Nothing is copied
when the source is detached/out-of-bounds or the target is detached/
out-of-bounds or backed by an immutable ArrayBuffer; the number of bytes
actually copied is returned. When both views are backed by a
SharedArrayBuffer a relaxed-atomic memmove is used, honoring the
SharedArrayBuffer memory model; any other combination performs a plain
memmove on the backing store (matching CopyArrayBufferBytes).

Carried as a floating patch; v8_embedder_string is bumped to -node.22
accordingly. It is the natural sibling of the CopyArrayBufferBytes API
added in the preceding floating patch and touches nothing but the public
ArrayBuffer/ArrayBufferView API.

Refs: https://github.com/nodejs/node/issues/55422
Signed-off-by: Robert Nagy <ronagy@icloud.com>
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 common.gypi                       |  2 +-
 deps/v8/include/v8-array-buffer.h | 20 +++++++++
 deps/v8/src/api/api.cc            | 71 +++++++++++++++++++++++++++++++
 3 files changed, 92 insertions(+), 1 deletion(-)
diff --git a/common.gypi b/common.gypi
index 36b4b1138dbfd2..cb2f22f0cedf89 100644
--- a/common.gypi
+++ b/common.gypi
@@ -40,7 +40,7 @@
 
     # Reset this number to 0 on major V8 upgrades.
     # Increment by one for each non-official patch applied to deps/v8.
-    'v8_embedder_string': '-node.21',
+    'v8_embedder_string': '-node.22',
 
     ##### V8 defaults for Node.js #####
 
diff --git a/deps/v8/include/v8-array-buffer.h b/deps/v8/include/v8-array-buffer.h
index f70c6dd57b36b0..c641deba113944 100644
--- a/deps/v8/include/v8-array-buffer.h
+++ b/deps/v8/include/v8-array-buffer.h
@@ -477,6 +477,26 @@ class V8_EXPORT ArrayBufferView : public Object {
    */
   bool HasBuffer() const;
 
+  /**
+   * Copy up to |bytes_to_copy| bytes from |source| (starting |source_start|
+   * bytes into the view) to |target| (starting |target_start| bytes into the
+   * view). The byte range is clamped to both views' byte lengths. The views'
+   * data pointers are resolved directly, without materializing their
+   * ArrayBuffers, avoiding the overhead of ArrayBufferView::Buffer /
+   * JSTypedArray::GetBuffer.
+   *
+   * Nothing is copied if |source| is detached or out of bounds, or if |target|
+   * is detached, out of bounds, or backed by an immutable ArrayBuffer. When
+   * both views are backed by a SharedArrayBuffer the copy uses a relaxed-atomic
+   * memmove that honors the SharedArrayBuffer memory model. Returns the number
+   * of bytes actually copied.
+   */
+  static size_t CopyArrayBufferViewBytes(Local<ArrayBufferView> source,
+                                         size_t source_start,
+                                         Local<ArrayBufferView> target,
+                                         size_t target_start,
+                                         size_t bytes_to_copy);
+
   V8_INLINE static ArrayBufferView* Cast(Value* value) {
 #ifdef V8_ENABLE_CHECKS
     CheckCast(value);
diff --git a/deps/v8/src/api/api.cc b/deps/v8/src/api/api.cc
index bdb9f715de95b4..8b50c0f3c4e819 100644
--- a/deps/v8/src/api/api.cc
+++ b/deps/v8/src/api/api.cc
@@ -4246,6 +4246,43 @@ static size_t CopyArrayBufferBytesImpl(const void* source_buffer,
   return bytes_to_copy;
 }
 
+struct ArrayBufferViewBytes {
+  char* data;
+  size_t length;
+  bool is_shared;
+  bool is_immutable;
+};
+
+// Resolves a view's data pointer, byte length and backing-buffer flags without
+// materializing its ArrayBuffer (i.e. without JSTypedArray::GetBuffer, which is
+// comparatively expensive). A detached or out-of-bounds view resolves to an
+// empty {nullptr, 0} range. File-local, like CopyArrayBufferBytesImpl above.
+static ArrayBufferViewBytes GetArrayBufferViewBytes(
+    i::Tagged<i::JSArrayBufferView> view) {
+  if (view->IsDetachedOrOutOfBounds()) return {nullptr, 0, false, false};
+  if (i::IsJSTypedArray(view)) {
+    i::Tagged<i::JSTypedArray> array = i::Cast<i::JSTypedArray>(view);
+    i::Tagged<i::JSArrayBuffer> buffer = array->buffer();
+    return {reinterpret_cast<char*>(array->DataPtr()), array->GetByteLength(),
+            buffer->is_shared(), buffer->is_immutable()};
+  }
+  if (i::IsJSDataView(view)) {
+    i::Tagged<i::JSDataView> data_view = i::Cast<i::JSDataView>(view);
+    i::Tagged<i::JSArrayBuffer> buffer =
+        i::Cast<i::JSArrayBuffer>(data_view->buffer());
+    return {reinterpret_cast<char*>(data_view->data_pointer()),
+            data_view->byte_length(), buffer->is_shared(),
+            buffer->is_immutable()};
+  }
+  i::Tagged<i::JSRabGsabDataView> data_view =
+      i::Cast<i::JSRabGsabDataView>(view);
+  i::Tagged<i::JSArrayBuffer> buffer =
+      i::Cast<i::JSArrayBuffer>(data_view->buffer());
+  return {reinterpret_cast<char*>(data_view->data_pointer()),
+          data_view->GetByteLength(), buffer->is_shared(),
+          buffer->is_immutable()};
+}
+
 size_t v8::SharedArrayBuffer::CopyArrayBufferBytes(
     size_t source_start, size_t bytes_to_copy, Local<SharedArrayBuffer> target,
     size_t target_start) const {
@@ -8960,6 +8997,40 @@ size_t v8::ArrayBuffer::CopyArrayBufferBytes(size_t source_start,
                                          that->GetByteLength(), bytes_to_copy);
 }
 
+size_t v8::ArrayBufferView::CopyArrayBufferViewBytes(
+    Local<ArrayBufferView> source, size_t source_start,
+    Local<ArrayBufferView> target, size_t target_start, size_t bytes_to_copy) {
+  i::DisallowGarbageCollection no_gc;
+  ArrayBufferViewBytes src =
+      GetArrayBufferViewBytes(*Utils::OpenDirectHandle(*source));
+  ArrayBufferViewBytes dst =
+      GetArrayBufferViewBytes(*Utils::OpenDirectHandle(*target));
+
+  // Never write to an immutable target. Detached/out-of-bounds views resolve to
+  // a zero length, so they fall out through the clamping below.
+  if (dst.is_immutable) return 0;
+
+  source_start = std::min(source_start, src.length);
+  target_start = std::min(target_start, dst.length);
+  bytes_to_copy = std::min(
+      {bytes_to_copy, src.length - source_start, dst.length - target_start});
+  if (bytes_to_copy == 0) return 0;
+
+  char* source_data = src.data + source_start;
+  char* target_data = dst.data + target_start;
+  // A relaxed-atomic memmove is only required when both views are backed by a
+  // SharedArrayBuffer; any other combination performs a plain memmove on the
+  // backing store, matching v8::ArrayBuffer::CopyArrayBufferBytes.
+  if (src.is_shared && dst.is_shared) {
+    base::Relaxed_Memmove(
+        reinterpret_cast<base::Atomic8*>(target_data),
+        reinterpret_cast<const base::Atomic8*>(source_data), bytes_to_copy);
+  } else {
+    std::memmove(target_data, source_data, bytes_to_copy);
+  }
+  return bytes_to_copy;
+}
+
 namespace {
 std::shared_ptr<i::BackingStore> ToInternal(
     std::shared_ptr<i::BackingStoreBase> backing_store) {

From afe87da73dcfe16568717184960b5aa4ff70f62e Mon Sep 17 00:00:00 2001
From: Robert Nagy <ronagy@icloud.com>
Date: Sat, 13 Jun 2026 19:01:52 +0200
Subject: [PATCH 2/2] buffer: speed up Buffer.prototype.copy via view-level
 copy

Route the native backing of Buffer.prototype.copy (CopyImpl, the `_copy`
binding) through the new v8::ArrayBufferView::CopyArrayBufferViewBytes API
instead of v8::ArrayBuffer::CopyArrayBufferBytes. The previous binding had
to convert both views to ArrayBuffers (ArrayBufferView::Buffer()), read
their byte offsets (ByteOffset()) and test shared-ness
(IsSharedArrayBuffer()) before the copy -- around half a dozen separate V8
API calls per copy. The view-level API does all of that internally from
the views' own fields in a single call, so the binding now just forwards
the two views, the view-relative offsets and the length.

Profiling on AMD EPYC 9135 (x86-64) attributed the small-copy cost almost
entirely to that view->buffer conversion: ArrayBufferView::Buffer() /
JSTypedArray::GetBuffer() alone was ~25% of runtime, paid every call and
twice per copy. Resolving the buffer in JS instead (passing
source.buffer/target.buffer to the binding) was measured and is worse:
the typed-array `.buffer` getter is not JIT-inlined and dispatches through
the CEntry trampoline to a C++ builtin, costing ~36%.

The view-level copy keeps the existing byte-range clamping and the no-op
(0 bytes) on a detached or immutable target. Shared handling changes in
one respect: this patch also adjusts CopyArrayBufferViewBytes in
deps/v8/src/api/api.cc to use the relaxed-atomic memmove when either side
is SharedArrayBuffer-backed (previously only when both were); a plain
memmove is used only when neither side is shared. The JS-side clamping in
copyImpl is retained even though CopyArrayBufferViewBytes now clamps to
the views' own byte lengths rather than to the underlying backing store.

buffer-copy.js, median of 30 interleaved runs, AMD EPYC 9135 x86-64
(all changes p < 0.001, Welch t = 19-50):

  partial=false bytes=8:    42.2 -> 62.4 Mops/s  (+48%)
  partial=true  bytes=8:    42.0 -> 62.8 Mops/s  (+49%)
  partial=false bytes=128:  42.2 -> 61.7 Mops/s  (+46%)
  partial=true  bytes=128:  42.0 -> 63.1 Mops/s  (+50%)
  partial=false bytes=1024: 35.1 -> 47.3 Mops/s  (+35%)
  partial=true  bytes=1024: 37.8 -> 55.1 Mops/s  (+46%)

The gain is largest for small/medium copies, where per-call overhead
dominates, and tapers for 1024-byte copies as the memmove itself grows.

Also inlines the former _copyActual helper (only caller was copyImpl) into
copyImpl, folding a redundant target.byteLength read.

Refs: https://github.com/nodejs/node/issues/55422
Signed-off-by: Robert Nagy <ronagy@icloud.com>
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 deps/v8/src/api/api.cc |  8 ++++----
 lib/buffer.js          | 15 ++++++++-------
 src/node_buffer.cc     | 41 ++++++++++++++---------------------------
 3 files changed, 26 insertions(+), 38 deletions(-)

diff --git a/deps/v8/src/api/api.cc b/deps/v8/src/api/api.cc
index 8b50c0f3c4e819..dd516aea4eb4e1 100644
--- a/deps/v8/src/api/api.cc
+++ b/deps/v8/src/api/api.cc
@@ -9021,7 +9021,7 @@ size_t v8::ArrayBufferView::CopyArrayBufferViewBytes(
-  // A relaxed-atomic memmove is only required when both views are backed by a
-  // SharedArrayBuffer; any other combination performs a plain memmove on the
-  // backing store, matching v8::ArrayBuffer::CopyArrayBufferBytes.
-  if (src.is_shared && dst.is_shared) {
+  // A relaxed-atomic memmove is used whenever either view is backed by a
+  // SharedArrayBuffer, honoring the SharedArrayBuffer memory model; a plain
+  // memmove is performed only when neither side is shared.
+  if (src.is_shared || dst.is_shared) {
     base::Relaxed_Memmove(
         reinterpret_cast<base::Atomic8*>(target_data),
         reinterpret_cast<const base::Atomic8*>(source_data), bytes_to_copy);
diff --git a/lib/buffer.js b/lib/buffer.js
index 8c17b158222672..f7c475a10758de 100644
--- a/lib/buffer.js
+++ b/lib/buffer.js
@@ -261,15 +261,16 @@ function copyImpl(source, target, targetStart, sourceStart, sourceEnd) {
       throw new ERR_OUT_OF_RANGE('sourceEnd', '>= 0', sourceEnd);
   }
 
-  if (targetStart >= target.byteLength || sourceStart >= sourceEnd)
+  const targetLength = target.byteLength;
+  if (targetStart >= targetLength || sourceStart >= sourceEnd)
     return 0;
 
-  return _copyActual(source, target, targetStart, sourceStart, sourceEnd);
-}
-
-function _copyActual(source, target, targetStart, sourceStart, sourceEnd) {
-  if (sourceEnd - sourceStart > target.byteLength - targetStart)
-    sourceEnd = sourceStart + target.byteLength - targetStart;
+  // Clamp the copy length to what fits in the target and what remains in the
+  // source. The native CopyArrayBufferViewBytes clamps to the views' byte
+  // lengths too, but the clamped length is still needed here to compute the
+  // byte count below.
+  if (sourceEnd - sourceStart > targetLength - targetStart)
+    sourceEnd = sourceStart + targetLength - targetStart;
 
   let nb = sourceEnd - sourceStart;
   const sourceLen = source.byteLength - sourceStart;
diff --git a/src/node_buffer.cc b/src/node_buffer.cc
index 9adb02517efc42..aba069433c26ab 100644
--- a/src/node_buffer.cc
+++ b/src/node_buffer.cc
@@ -605,33 +605,20 @@ size_t CopyImpl(Local<Value> source_obj,
                 const size_t target_start,
                 const size_t source_start,
                 const size_t to_copy) {
-  Local<ArrayBufferView> source = source_obj.As<ArrayBufferView>();
-  Local<ArrayBufferView> target = target_obj.As<ArrayBufferView>();
-
-  Local<ArrayBuffer> source_ab = source->Buffer();
-  Local<ArrayBuffer> target_ab = target->Buffer();
-
-  const size_t source_offset = source->ByteOffset() + source_start;
-  const size_t target_offset = target->ByteOffset() + target_start;
-
-  // Defer byte-range clamping and detached/immutable handling to V8. When both
-  // sides are backed by a SharedArrayBuffer the relaxed atomic overload is
-  // used, which honors the SharedArrayBuffer memory model. Any other
-  // combination (both regular, or one of each) goes through the ArrayBuffer
-  // overload: it operates on the underlying backing store regardless of
-  // shared-ness, so a plain memmove is performed (matching the historical
-  // behavior for SharedArrayBuffer-backed buffers). The V8 API has no overload
-  // that mixes ArrayBuffer and SharedArrayBuffer, so the two must never be
-  // cross-cast.
-  if (source_ab->IsSharedArrayBuffer() && target_ab->IsSharedArrayBuffer()) {
-    return source_ab.As<SharedArrayBuffer>()->CopyArrayBufferBytes(
-        source_offset,
-        to_copy,
-        target_ab.As<SharedArrayBuffer>(),
-        target_offset);
-  }
-  return source_ab->CopyArrayBufferBytes(
-      source_offset, to_copy, target_ab, target_offset);
+  // Defer byte-range clamping and detached/immutable/shared handling to V8.
+  // CopyArrayBufferViewBytes resolves the views' data pointers directly,
+  // without materializing their ArrayBuffers (ArrayBufferView::Buffer /
+  // JSTypedArray::GetBuffer), which dominates the per-call cost for small
+  // copies. When either view is backed by a SharedArrayBuffer it performs a
+  // relaxed-atomic memmove honoring the SharedArrayBuffer memory model; a
+  // plain memmove is used only when neither view is shared. Note that the
+  // previous binding used the relaxed-atomic path only when both were shared.
+  return ArrayBufferView::CopyArrayBufferViewBytes(
+      source_obj.As<ArrayBufferView>(),
+      source_start,
+      target_obj.As<ArrayBufferView>(),
+      target_start,
+      to_copy);
 }
 
 // Assume caller has properly validated args.