src: avoid copying source string in TextEncoder.encode

anonrig · anonrig · commit a886029baf77 · 2026-06-13T18:24:31.000-04:00
`EncodeUtf8String`, which backs `TextEncoder.prototype.encode()`, copied
the entire source string out of the V8 heap into a `MaybeStackBuffer`
(via `WriteOneByteV2`/`WriteV2`) before encoding, allocating on the heap
for strings larger than the stack buffer. `EncodeInto` already avoids
this by reading the flat content directly through `v8::String::ValueView`.

Read the flat content via `ValueView` instead. Because `ValueView` holds
a `DisallowGarbageCollection` scope, the backing store cannot be
allocated while a view is alive, so the view is acquired in two short
scopes: one to validate and compute the exact UTF-8 length, and one to
encode directly into the backing store after allocation. Flattening is
cached on the string, so re-acquiring the view is cheap. The rare
unpaired-surrogate path still copies into a mutable buffer so that
`to_well_formed_utf16` can repair the string in place.

benchmark/util/text-encoder.js (op=encode, n=1e6, 12 runs each):

                       len=256  len=1024  len=8192
  ascii                 +14.1%    +23.5%    +43.9%
  one-byte (latin1)     +14.4%    +22.2%    +12.3%
  two-byte (utf-16)     +16.7%    +20.4%    +15.5%

len=32 takes the unchanged small-string path, so its results are within
measurement noise. The untouched encodeInto path stayed flat
(-2.0%..+0.5%) across all configurations.
diff --git a/src/encoding_binding.cc b/src/encoding_binding.cc
@@ -340,53 +340,43 @@ void BindingData::EncodeUtf8String(const FunctionCallbackInfo<Value>& args) {
 
   size_t length = source->Length();
   size_t utf8_length = 0;
-  bool is_one_byte = source->IsOneByte();
-
-  if (is_one_byte) {
-    // One-byte string (Latin1) - copy to buffer first, then process
-    MaybeStackBuffer<uint8_t, MAX_SIZE_FOR_STACK_ALLOC> latin1_buffer(length);
-    source->WriteOneByteV2(isolate, 0, length, latin1_buffer.out());
-
-    auto data = reinterpret_cast<const char*>(latin1_buffer.out());
-
-    // Check if it's pure ASCII - if so, we can just copy
-    simdutf::result result = simdutf::validate_ascii_with_errors(data, length);
-    if (result.error == simdutf::SUCCESS) {
-      // Pure ASCII - direct copy
-      std::unique_ptr<BackingStore> bs = ArrayBuffer::NewBackingStore(
-          isolate, length, BackingStoreInitializationMode::kUninitialized);
-      CHECK(bs);
-      memcpy(bs->Data(), data, length);
-      Local<ArrayBuffer> ab = ArrayBuffer::New(isolate, std::move(bs));
-      args.GetReturnValue().Set(Uint8Array::New(ab, 0, length));
-      return;
-    }
 
-    // Latin1 with non-ASCII characters - need conversion
-    utf8_length = simdutf::utf8_length_from_latin1(data, length);
-    std::unique_ptr<BackingStore> bs = ArrayBuffer::NewBackingStore(
-        isolate, utf8_length, BackingStoreInitializationMode::kUninitialized);
-    CHECK(bs);
-    [[maybe_unused]] size_t written = simdutf::convert_latin1_to_utf8(
-        data, length, static_cast<char*>(bs->Data()));
-    DCHECK_EQ(written, utf8_length);
-    Local<ArrayBuffer> ab = ArrayBuffer::New(isolate, std::move(bs));
-    args.GetReturnValue().Set(Uint8Array::New(ab, 0, utf8_length));
-    return;
+  // Inspect the string's flat content directly to determine the encoding and
+  // the exact UTF-8 output size, without copying it out of the V8 heap.
+  //
+  // v8::String::ValueView holds a DisallowGarbageCollection scope, so it must
+  // be released before allocating the backing store below. Flattening is cached
+  // on the string, so re-acquiring the view for the conversion pass is cheap.
+  bool is_one_byte;
+  bool is_ascii = false;
+  bool is_well_formed = true;
+  {
+    v8::String::ValueView view(isolate, source);
+    is_one_byte = view.is_one_byte();
+    if (is_one_byte) {
+      auto data = reinterpret_cast<const char*>(view.data8());
+      is_ascii = simdutf::validate_ascii_with_errors(data, length).error ==
+                 simdutf::SUCCESS;
+      utf8_length =
+          is_ascii ? length : simdutf::utf8_length_from_latin1(data, length);
+    } else {
+      auto data = reinterpret_cast<const char16_t*>(view.data16());
+      is_well_formed = simdutf::validate_utf16_with_errors(data, length).error ==
+                       simdutf::SUCCESS;
+      if (is_well_formed) {
+        utf8_length = simdutf::utf8_length_from_utf16(data, length);
+      }
+    }
   }
 
-  // Two-byte string (UTF-16) - copy to buffer first
-  MaybeStackBuffer<uint16_t, MAX_SIZE_FOR_STACK_ALLOC> utf16_buffer(length);
-  source->WriteV2(isolate, 0, length, utf16_buffer.out());
-
-  auto data = reinterpret_cast<char16_t*>(utf16_buffer.out());
-
-  // Check for unpaired surrogates
-  simdutf::result validation_result =
-      simdutf::validate_utf16_with_errors(data, length);
+  // Rare path: two-byte string with unpaired surrogates. Copy into a mutable
+  // buffer, make it well-formed, then encode.
+  if (!is_well_formed) {
+    MaybeStackBuffer<uint16_t, MAX_SIZE_FOR_STACK_ALLOC> utf16_buffer(length);
+    source->WriteV2(isolate, 0, length, utf16_buffer.out());
+    auto data = reinterpret_cast<char16_t*>(utf16_buffer.out());
+    simdutf::to_well_formed_utf16(data, length, data);
 
-  if (validation_result.error == simdutf::SUCCESS) {
-    // Valid UTF-16 - use the fast path
     utf8_length = simdutf::utf8_length_from_utf16(data, length);
     std::unique_ptr<BackingStore> bs = ArrayBuffer::NewBackingStore(
         isolate, utf8_length, BackingStoreInitializationMode::kUninitialized);
@@ -399,16 +389,30 @@ void BindingData::EncodeUtf8String(const FunctionCallbackInfo<Value>& args) {
     return;
   }
 
-  // Invalid UTF-16 with unpaired surrogates - convert to well-formed in place
-  simdutf::to_well_formed_utf16(data, length, data);
-
-  utf8_length = simdutf::utf8_length_from_utf16(data, length);
+  // Common path: allocate the exact-size output, then re-acquire the flat
+  // content and encode directly into the backing store.
   std::unique_ptr<BackingStore> bs = ArrayBuffer::NewBackingStore(
       isolate, utf8_length, BackingStoreInitializationMode::kUninitialized);
   CHECK(bs);
-  [[maybe_unused]] size_t written = simdutf::convert_utf16_to_utf8(
-      data, length, static_cast<char*>(bs->Data()));
-  DCHECK_EQ(written, utf8_length);
+  char* out = static_cast<char*>(bs->Data());
+  {
+    v8::String::ValueView view(isolate, source);
+    if (is_one_byte) {
+      auto data = reinterpret_cast<const char*>(view.data8());
+      if (is_ascii) {
+        memcpy(out, data, length);
+      } else {
+        [[maybe_unused]] size_t written =
+            simdutf::convert_latin1_to_utf8(data, length, out);
+        DCHECK_EQ(written, utf8_length);
+      }
+    } else {
+      auto data = reinterpret_cast<const char16_t*>(view.data16());
+      [[maybe_unused]] size_t written =
+          simdutf::convert_utf16_to_utf8(data, length, out);
+      DCHECK_EQ(written, utf8_length);
+    }
+  }
   Local<ArrayBuffer> ab = ArrayBuffer::New(isolate, std::move(bs));
   args.GetReturnValue().Set(Uint8Array::New(ab, 0, utf8_length));
 }