Skip to content

Commit a886029

Browse files
committed
src: avoid copying source string in TextEncoder.encode
`EncodeUtf8String`, which backs `TextEncoder.prototype.encode()`, copied the entire source string out of the V8 heap into a `MaybeStackBuffer` (via `WriteOneByteV2`/`WriteV2`) before encoding, allocating on the heap for strings larger than the stack buffer. `EncodeInto` already avoids this copy by reading the flat content directly through `v8::String::ValueView`.

Do the same in `EncodeUtf8String`: read the flat content via `ValueView` instead of copying it. Because `ValueView` holds a `DisallowGarbageCollection` scope, the backing store cannot be allocated while the view is alive, so the view is used in two short scopes: one to validate the string and compute the exact UTF-8 length, and one to encode directly into the backing store after allocation. Flattening is cached on the string, so re-acquiring the view is cheap. The rare unpaired-surrogate path still copies into a mutable buffer for in-place `to_well_formed_utf16`.

benchmark/util/text-encoder.js (op=encode, n=1e6, 12 runs each):

                        len=256   len=1024   len=8192
    ascii                +14.1%     +23.5%     +43.9%
    one-byte (latin1)    +14.4%     +22.2%     +12.3%
    two-byte (utf-16)    +16.7%     +20.4%     +15.5%

len=32 uses the unchanged small-string path, so its results are within noise. The untouched encodeInto path stayed flat (-2.0%..+0.5%) across all configurations.
1 parent d097de8 commit a886029

1 file changed

Lines changed: 54 additions & 50 deletions

File tree

src/encoding_binding.cc

Lines changed: 54 additions & 50 deletions
Original file line number | Diff line number | Diff line change
@@ -340,53 +340,43 @@ void BindingData::EncodeUtf8String(const FunctionCallbackInfo<Value>& args) {
340340

341341
size_t length = source->Length();
342342
size_t utf8_length = 0;
343-
bool is_one_byte = source->IsOneByte();
344-
345-
if (is_one_byte) {
346-
// One-byte string (Latin1) - copy to buffer first, then process
347-
MaybeStackBuffer<uint8_t, MAX_SIZE_FOR_STACK_ALLOC> latin1_buffer(length);
348-
source->WriteOneByteV2(isolate, 0, length, latin1_buffer.out());
349-
350-
auto data = reinterpret_cast<const char*>(latin1_buffer.out());
351-
352-
// Check if it's pure ASCII - if so, we can just copy
353-
simdutf::result result = simdutf::validate_ascii_with_errors(data, length);
354-
if (result.error == simdutf::SUCCESS) {
355-
// Pure ASCII - direct copy
356-
std::unique_ptr<BackingStore> bs = ArrayBuffer::NewBackingStore(
357-
isolate, length, BackingStoreInitializationMode::kUninitialized);
358-
CHECK(bs);
359-
memcpy(bs->Data(), data, length);
360-
Local<ArrayBuffer> ab = ArrayBuffer::New(isolate, std::move(bs));
361-
args.GetReturnValue().Set(Uint8Array::New(ab, 0, length));
362-
return;
363-
}
364343

365-
// Latin1 with non-ASCII characters - need conversion
366-
utf8_length = simdutf::utf8_length_from_latin1(data, length);
367-
std::unique_ptr<BackingStore> bs = ArrayBuffer::NewBackingStore(
368-
isolate, utf8_length, BackingStoreInitializationMode::kUninitialized);
369-
CHECK(bs);
370-
[[maybe_unused]] size_t written = simdutf::convert_latin1_to_utf8(
371-
data, length, static_cast<char*>(bs->Data()));
372-
DCHECK_EQ(written, utf8_length);
373-
Local<ArrayBuffer> ab = ArrayBuffer::New(isolate, std::move(bs));
374-
args.GetReturnValue().Set(Uint8Array::New(ab, 0, utf8_length));
375-
return;
344+
// Inspect the string's flat content directly to determine the encoding and
345+
// the exact UTF-8 output size, without copying it out of the V8 heap.
346+
//
347+
// v8::String::ValueView holds a DisallowGarbageCollection scope, so it must
348+
// be released before allocating the backing store below. Flattening is cached
349+
// on the string, so re-acquiring the view for the conversion pass is cheap.
350+
bool is_one_byte;
351+
bool is_ascii = false;
352+
bool is_well_formed = true;
353+
{
354+
v8::String::ValueView view(isolate, source);
355+
is_one_byte = view.is_one_byte();
356+
if (is_one_byte) {
357+
auto data = reinterpret_cast<const char*>(view.data8());
358+
is_ascii = simdutf::validate_ascii_with_errors(data, length).error ==
359+
simdutf::SUCCESS;
360+
utf8_length =
361+
is_ascii ? length : simdutf::utf8_length_from_latin1(data, length);
362+
} else {
363+
auto data = reinterpret_cast<const char16_t*>(view.data16());
364+
is_well_formed = simdutf::validate_utf16_with_errors(data, length).error ==
365+
simdutf::SUCCESS;
366+
if (is_well_formed) {
367+
utf8_length = simdutf::utf8_length_from_utf16(data, length);
368+
}
369+
}
376370
}
377371

378-
// Two-byte string (UTF-16) - copy to buffer first
379-
MaybeStackBuffer<uint16_t, MAX_SIZE_FOR_STACK_ALLOC> utf16_buffer(length);
380-
source->WriteV2(isolate, 0, length, utf16_buffer.out());
381-
382-
auto data = reinterpret_cast<char16_t*>(utf16_buffer.out());
383-
384-
// Check for unpaired surrogates
385-
simdutf::result validation_result =
386-
simdutf::validate_utf16_with_errors(data, length);
372+
// Rare path: two-byte string with unpaired surrogates. Copy into a mutable
373+
// buffer, make it well-formed, then encode.
374+
if (!is_well_formed) {
375+
MaybeStackBuffer<uint16_t, MAX_SIZE_FOR_STACK_ALLOC> utf16_buffer(length);
376+
source->WriteV2(isolate, 0, length, utf16_buffer.out());
377+
auto data = reinterpret_cast<char16_t*>(utf16_buffer.out());
378+
simdutf::to_well_formed_utf16(data, length, data);
387379

388-
if (validation_result.error == simdutf::SUCCESS) {
389-
// Valid UTF-16 - use the fast path
390380
utf8_length = simdutf::utf8_length_from_utf16(data, length);
391381
std::unique_ptr<BackingStore> bs = ArrayBuffer::NewBackingStore(
392382
isolate, utf8_length, BackingStoreInitializationMode::kUninitialized);
@@ -399,16 +389,30 @@ void BindingData::EncodeUtf8String(const FunctionCallbackInfo<Value>& args) {
399389
return;
400390
}
401391

402-
// Invalid UTF-16 with unpaired surrogates - convert to well-formed in place
403-
simdutf::to_well_formed_utf16(data, length, data);
404-
405-
utf8_length = simdutf::utf8_length_from_utf16(data, length);
392+
// Common path: allocate the exact-size output, then re-acquire the flat
393+
// content and encode directly into the backing store.
406394
std::unique_ptr<BackingStore> bs = ArrayBuffer::NewBackingStore(
407395
isolate, utf8_length, BackingStoreInitializationMode::kUninitialized);
408396
CHECK(bs);
409-
[[maybe_unused]] size_t written = simdutf::convert_utf16_to_utf8(
410-
data, length, static_cast<char*>(bs->Data()));
411-
DCHECK_EQ(written, utf8_length);
397+
char* out = static_cast<char*>(bs->Data());
398+
{
399+
v8::String::ValueView view(isolate, source);
400+
if (is_one_byte) {
401+
auto data = reinterpret_cast<const char*>(view.data8());
402+
if (is_ascii) {
403+
memcpy(out, data, length);
404+
} else {
405+
[[maybe_unused]] size_t written =
406+
simdutf::convert_latin1_to_utf8(data, length, out);
407+
DCHECK_EQ(written, utf8_length);
408+
}
409+
} else {
410+
auto data = reinterpret_cast<const char16_t*>(view.data16());
411+
[[maybe_unused]] size_t written =
412+
simdutf::convert_utf16_to_utf8(data, length, out);
413+
DCHECK_EQ(written, utf8_length);
414+
}
415+
}
412416
Local<ArrayBuffer> ab = ArrayBuffer::New(isolate, std::move(bs));
413417
args.GetReturnValue().Set(Uint8Array::New(ab, 0, utf8_length));
414418
}

0 commit comments

Comments
 (0)