sciencesakura · sciencesakura · Jun 28, 2025 · Jun 28, 2025
diff --git a/packages/mutf-8/src/index.bench.mts b/packages/mutf-8/src/index.bench.mts
@@ -3,12 +3,17 @@
 import { bench, describe } from "vitest";
 import { MUtf8Decoder, MUtf8Encoder } from "./index.js";
 
+const asciiText = "The quick brown fox jumps over the lazy dog."; // ASCII-only sample
+const latinText = "Victor jagt zwölf Boxkämpfer quer über den großen Sylter Deich."; // Latin sample containing non-ASCII letters (ö, ä, ü, ß)
+const asianText = "あのイーハトーヴォのすきとおった風、夏でも底に冷たさをもつ青いそら"; // Japanese sample (kana and kanji)
+const emojiText = "😀😃😄😁😆😅😂🤣😊😇🙂🙃😉😌😍🥰😘😗😙😚"; // emoji sample
+const mixedText = `${asciiText} ${latinText} ${asianText} ${emojiText}`; // the four samples above joined by single spaces
+
 // Codec instances shared by the benchmark suites below.
 const encoder = new MUtf8Encoder();
 const decoder = new MUtf8Decoder();
 
 describe("MUtf8Decoder.decode", () => {
   describe("ASCII text", () => {
-    const asciiText = "The quick brown fox jumps over the lazy dog.";
     const asciiBytes500 = encoder.encode(stretch(asciiText, 500));
     const asciiBytes5K = encoder.encode(stretch(asciiText, 5000));
     const asciiBytes50K = encoder.encode(stretch(asciiText, 50000));
@@ -27,7 +32,6 @@ describe("MUtf8Decoder.decode", () => {
   });
 
   describe("Latin-1 text", () => {
-    const latinText = "Victor jagt zwölf Boxkämpfer quer über den großen Sylter Deich.";
     const latin1Bytes500 = encoder.encode(stretch(latinText, 500));
     const latin1Bytes5K = encoder.encode(stretch(latinText, 5000));
     const latin1Bytes50K = encoder.encode(stretch(latinText, 50000));
@@ -46,7 +50,6 @@ describe("MUtf8Decoder.decode", () => {
   });
 
   describe("Asian text", () => {
-    const asianText = "あのイーハトーヴォのすきとおった風、夏でも底に冷たさをもつ青いそら";
     const asianBytes500 = encoder.encode(stretch(asianText, 500));
     const asianBytes5K = encoder.encode(stretch(asianText, 5000));
     const asianBytes50K = encoder.encode(stretch(asianText, 50000));
@@ -65,7 +68,6 @@ describe("MUtf8Decoder.decode", () => {
   });
 
   describe("Emoji text", () => {
-    const emojiText = "😀😃😄😁😆😅😂🤣😊😇🙂🙃😉😌😍🥰😘😗😙😚";
     const emojiBytes500 = encoder.encode(stretch(emojiText, 500));
     const emojiBytes5K = encoder.encode(stretch(emojiText, 5000));
     const emojiBytes50K = encoder.encode(stretch(emojiText, 50000));
@@ -84,8 +86,6 @@ describe("MUtf8Decoder.decode", () => {
   });
 
   describe("Mixed text", () => {
-    const mixedText =
-      "The quick brown fox jumps over the lazy dog. Victor jagt zwölf Boxkämpfer quer über den großen Sylter Deich. あのイーハトーヴォのすきとおった風、夏でも底に冷たさをもつ青いそら 😀😃😄😁😆😅😂🤣😊😇🙂🙃😉😌😍🥰😘😗😙😚";
     const mixedBytes500 = encoder.encode(stretch(mixedText, 500));
     const mixedBytes5K = encoder.encode(stretch(mixedText, 5000));
     const mixedBytes50K = encoder.encode(stretch(mixedText, 50000));
@@ -104,6 +104,192 @@ describe("MUtf8Decoder.decode", () => {
   });
 });
 
+describe("MUtf8Encoder.encode", () => {
   // Benchmarks `encoder.encode` on each sample text at three input sizes:
   // 500, 5000 and 50000 UTF-16 code units (the unit `stretch` cuts on).
   // Inputs are built once per group, outside the `bench` callbacks, so each
   // callback contains nothing but the `encode` call.
+  describe("ASCII text", () => {
+    const asciiText500 = stretch(asciiText, 500);
+    const asciiText5K = stretch(asciiText, 5000);
+    const asciiText50K = stretch(asciiText, 50000);
+
+    bench("encode 500 length string", () => {
+      encoder.encode(asciiText500);
+    });
+
+    bench("encode 5K length string", () => {
+      encoder.encode(asciiText5K);
+    });
+
+    bench("encode 50K length string", () => {
+      encoder.encode(asciiText50K);
+    });
+  });
+
+  describe("Latin-1 text", () => {
+    const latinText500 = stretch(latinText, 500);
+    const latinText5K = stretch(latinText, 5000);
+    const latinText50K = stretch(latinText, 50000);
+
+    bench("encode 500 length string", () => {
+      encoder.encode(latinText500);
+    });
+
+    bench("encode 5K length string", () => {
+      encoder.encode(latinText5K);
+    });
+
+    bench("encode 50K length string", () => {
+      encoder.encode(latinText50K);
+    });
+  });
+
+  describe("Asian text", () => {
+    const asianText500 = stretch(asianText, 500);
+    const asianText5K = stretch(asianText, 5000);
+    const asianText50K = stretch(asianText, 50000);
+
+    bench("encode 500 length string", () => {
+      encoder.encode(asianText500);
+    });
+
+    bench("encode 5K length string", () => {
+      encoder.encode(asianText5K);
+    });
+
+    bench("encode 50K length string", () => {
+      encoder.encode(asianText50K);
+    });
+  });
+
+  describe("Emoji text", () => {
+    const emojiText500 = stretch(emojiText, 500);
+    const emojiText5K = stretch(emojiText, 5000);
+    const emojiText50K = stretch(emojiText, 50000);
+
+    bench("encode 500 length string", () => {
+      encoder.encode(emojiText500);
+    });
+
+    bench("encode 5K length string", () => {
+      encoder.encode(emojiText5K);
+    });
+
+    bench("encode 50K length string", () => {
+      encoder.encode(emojiText50K);
+    });
+  });
+
+  describe("Mixed text", () => {
+    const mixedText500 = stretch(mixedText, 500);
+    const mixedText5K = stretch(mixedText, 5000);
+    const mixedText50K = stretch(mixedText, 50000);
+
+    bench("encode 500 length string", () => {
+      encoder.encode(mixedText500);
+    });
+
+    bench("encode 5K length string", () => {
+      encoder.encode(mixedText5K);
+    });
+
+    bench("encode 50K length string", () => {
+      encoder.encode(mixedText50K);
+    });
+  });
+});
+
+describe("MUtf8Encoder.encodeInto", () => {
   // Benchmarks `encoder.encodeInto` on each sample text at 500, 5000 and 50000
   // UTF-16 code units, with inputs built once per group outside the callbacks.
   //
   // A single destination buffer is shared by every case; `encodeInto` starts
   // writing at offset 0 on each call, so earlier output is simply overwritten.
   // 800000 bytes is more than the largest case can need: the encoder emits at
   // most 3 bytes per UTF-16 code unit, i.e. 150000 bytes for 50000 code units.
+  const buffer = new Uint8Array(800000);
+
+  describe("ASCII text", () => {
+    const asciiText500 = stretch(asciiText, 500);
+    const asciiText5K = stretch(asciiText, 5000);
+    const asciiText50K = stretch(asciiText, 50000);
+
+    bench("encode 500 length string", () => {
+      encoder.encodeInto(asciiText500, buffer);
+    });
+
+    bench("encode 5K length string", () => {
+      encoder.encodeInto(asciiText5K, buffer);
+    });
+
+    bench("encode 50K length string", () => {
+      encoder.encodeInto(asciiText50K, buffer);
+    });
+  });
+
+  describe("Latin-1 text", () => {
+    const latinText500 = stretch(latinText, 500);
+    const latinText5K = stretch(latinText, 5000);
+    const latinText50K = stretch(latinText, 50000);
+
+    bench("encode 500 length string", () => {
+      encoder.encodeInto(latinText500, buffer);
+    });
+
+    bench("encode 5K length string", () => {
+      encoder.encodeInto(latinText5K, buffer);
+    });
+
+    bench("encode 50K length string", () => {
+      encoder.encodeInto(latinText50K, buffer);
+    });
+  });
+
+  describe("Asian text", () => {
+    const asianText500 = stretch(asianText, 500);
+    const asianText5K = stretch(asianText, 5000);
+    const asianText50K = stretch(asianText, 50000);
+
+    bench("encode 500 length string", () => {
+      encoder.encodeInto(asianText500, buffer);
+    });
+
+    bench("encode 5K length string", () => {
+      encoder.encodeInto(asianText5K, buffer);
+    });
+
+    bench("encode 50K length string", () => {
+      encoder.encodeInto(asianText50K, buffer);
+    });
+  });
+
+  describe("Emoji text", () => {
+    const emojiText500 = stretch(emojiText, 500);
+    const emojiText5K = stretch(emojiText, 5000);
+    const emojiText50K = stretch(emojiText, 50000);
+
+    bench("encode 500 length string", () => {
+      encoder.encodeInto(emojiText500, buffer);
+    });
+
+    bench("encode 5K length string", () => {
+      encoder.encodeInto(emojiText5K, buffer);
+    });
+
+    bench("encode 50K length string", () => {
+      encoder.encodeInto(emojiText50K, buffer);
+    });
+  });
+
+  describe("Mixed text", () => {
+    const mixedText500 = stretch(mixedText, 500);
+    const mixedText5K = stretch(mixedText, 5000);
+    const mixedText50K = stretch(mixedText, 50000);
+
+    bench("encode 500 length string", () => {
+      encoder.encodeInto(mixedText500, buffer);
+    });
+
+    bench("encode 5K length string", () => {
+      encoder.encodeInto(mixedText5K, buffer);
+    });
+
+    bench("encode 50K length string", () => {
+      encoder.encodeInto(mixedText50K, buffer);
+    });
+  });
+});
+
 /**
  * Builds a benchmark input by repeating `text` until it is `length` UTF-16
  * code units long, truncating the final repetition if necessary.
  *
  * An empty `text` or a non-positive `length` yields an empty string instead
  * of a `RangeError` from `String.prototype.repeat`.
  * The cut is made on code units, so it can fall inside a surrogate pair.
  *
  * @param text - Pattern to repeat.
  * @param length - Desired length of the result in UTF-16 code units.
  * @returns `text` repeated and truncated to `length` code units.
  */
 function stretch(text: string, length: number): string {
   // `padEnd` repeats the filler, truncates it to fit, and tolerates an empty filler.
   return "".padEnd(length, text);
 }
diff --git a/packages/mutf-8/src/index.test.mts b/packages/mutf-8/src/index.test.mts
@@ -96,8 +96,8 @@ describe("MUtf8Encoder.encode()", () => {
   });
 
   // For every `testdata` row, encoding `text` must produce exactly the row's `binary`.
   test.each(testdata)("encode the text: $text", ({ text, binary }) => {
-    const decoder = new MUtf8Decoder();
-    expect(decoder.decode(binary)).toBe(text);
+    const encoder = new MUtf8Encoder();
+    expect(encoder.encode(text)).toEqual(binary);
   });
 });
 

diff --git a/packages/mutf-8/src/index.ts b/packages/mutf-8/src/index.ts
@@ -325,29 +325,31 @@ export class MUtf8Encoder {
    * @returns A new `Uint8Array` containing the Modified UTF-8 encoded bytes
    */
   encode(input = ""): Uint8Array {
-    const bin: number[] = [];
-    for (const c of input) {
     // Allocate the output at its final size up front so bytes are stored by index
     // rather than pushed onto a growing array.
+    const bytes = new Uint8Array(this.#estimateByteLength(input));
     // `bp` is the next write offset in `bytes`; `cp` is a UTF-16 code unit index into `input`.
+    let bp = 0;
+    for (let cp = 0; cp < input.length; cp++) {
       // biome-ignore lint/style/noNonNullAssertion: `cp` is always within bounds, so a code point is returned.
-      const code = c.codePointAt(0)!;
+      const code = input.codePointAt(cp)!;
       if (0x0001 <= code && code <= 0x007f) {
-        bin.push(code);
+        bytes[bp++] = code;
       } else if (code <= 0x07ff) {
         // Two bytes. U+0000 also lands here because it fails the 0x0001 lower bound above,
         // so it is written as 0xC0 0x80 rather than a single zero byte.
-        bin.push(0xc0 | (code >>> 6));
-        bin.push(0x80 | (0x3f & code));
+        bytes[bp++] = 0xc0 | (code >>> 6);
+        bytes[bp++] = 0x80 | (0x3f & code);
       } else if (code <= 0xffff) {
         // Three bytes for every remaining code point up to U+FFFF.
-        bin.push(0xe0 | (code >>> 12));
-        bin.push(0x80 | (0x3f & (code >>> 6)));
-        bin.push(0x80 | (0x3f & code));
+        bytes[bp++] = 0xe0 | (code >>> 12);
+        bytes[bp++] = 0x80 | (0x3f & (code >>> 6));
+        bytes[bp++] = 0x80 | (0x3f & code);
       } else {
         // Above U+FFFF: six bytes, written as two three-byte groups that each start with 0xED.
-        bin.push(0xed);
-        bin.push(0xa0 | ((code >>> 16) - 1));
-        bin.push(0x80 | (0x3f & (code >>> 10)));
-        bin.push(0xed);
-        bin.push(0xb0 | (0x0f & (code >>> 6)));
-        bin.push(0x80 | (0x3f & code));
+        bytes[bp++] = 0xed;
+        bytes[bp++] = 0xa0 | ((code >>> 16) - 1);
+        bytes[bp++] = 0x80 | (0x3f & (code >>> 10));
+        bytes[bp++] = 0xed;
+        bytes[bp++] = 0xb0 | (0x0f & (code >>> 6));
+        bytes[bp++] = 0x80 | (0x3f & code);
         // This code point occupies two code units in `input`; skip the second one.
+        cp++;
       }
     }
-    return new Uint8Array(bin);
+    return bytes;
   }
 
   /**
@@ -365,36 +367,57 @@ export class MUtf8Encoder {
    * @returns An object indicating how many characters were read and bytes written
    */
   encodeInto(source: string, destination: Uint8Array): TextEncoderEncodeIntoResult {
-    const destLen = destination.length;
-    let i = 0;
-    let read = 0;
-    for (const c of source) {
     // `bp` is the next write offset in `destination`; `cp` is a UTF-16 code unit index into `source`.
+    const capacity = destination.length;
+    let bp = 0;
+    let cp = 0;
+    while (cp < source.length) {
       // biome-ignore lint/style/noNonNullAssertion: `cp` is always within bounds, so a code point is returned.
-      const code = c.codePointAt(0)!;
+      const code = source.codePointAt(cp)!;
       // Each branch checks that the whole sequence fits before writing anything,
       // so encoding stops cleanly without leaving a partial sequence in `destination`.
+      if (0x0001 <= code && code <= 0x007f) {
+        if (capacity <= bp) break;
+        destination[bp++] = code;
+        cp++;
+      } else if (code <= 0x07ff) {
         // Two bytes; U+0000 takes this branch because it fails the 0x0001 lower bound above.
+        if (capacity <= bp + 1) break;
+        destination[bp++] = 0xc0 | (code >>> 6);
+        destination[bp++] = 0x80 | (0x3f & code);
+        cp++;
+      } else if (code <= 0xffff) {
+        if (capacity <= bp + 2) break;
+        destination[bp++] = 0xe0 | (code >>> 12);
+        destination[bp++] = 0x80 | (0x3f & (code >>> 6));
+        destination[bp++] = 0x80 | (0x3f & code);
+        cp++;
+      } else {
         // Above U+FFFF: six bytes are required, and two code units of `source` are consumed.
+        if (capacity <= bp + 5) break;
+        destination[bp++] = 0xed;
+        destination[bp++] = 0xa0 | ((code >>> 16) - 1);
+        destination[bp++] = 0x80 | (0x3f & (code >>> 10));
+        destination[bp++] = 0xed;
+        destination[bp++] = 0xb0 | (0x0f & (code >>> 6));
+        destination[bp++] = 0x80 | (0x3f & code);
+        cp += 2;
+      }
+    }
     // `read` is reported in UTF-16 code units and `written` in bytes.
+    return { read: cp, written: bp };
+  }
+
+  #estimateByteLength(source: string): number {
     // Sums the encoded size of `source` by walking it code point by code point:
     // 1 byte for U+0001..U+007F, 2 bytes up to U+07FF (including U+0000),
     // 3 bytes up to U+FFFF and 6 bytes above that.
+    let length = 0;
+    for (let cp = 0; cp < source.length; cp++) {
+      // biome-ignore lint/style/noNonNullAssertion: `cp` is always within bounds, so a code point is returned.
+      const code = source.codePointAt(cp)!;
       if (0x0001 <= code && code <= 0x007f) {
-        if (destLen <= i) break;
-        destination[i++] = code;
+        length += 1;
       } else if (code <= 0x07ff) {
-        if (destLen <= i + 1) break;
-        destination[i++] = 0xc0 | (code >>> 6);
-        destination[i++] = 0x80 | (0x3f & code);
+        length += 2;
       } else if (code <= 0xffff) {
-        if (destLen <= i + 2) break;
-        destination[i++] = 0xe0 | (code >>> 12);
-        destination[i++] = 0x80 | (0x3f & (code >>> 6));
-        destination[i++] = 0x80 | (0x3f & code);
+        length += 3;
       } else {
-        if (destLen <= i + 5) break;
-        destination[i++] = 0xed;
-        destination[i++] = 0xa0 | ((code >>> 16) - 1);
-        destination[i++] = 0x80 | (0x3f & (code >>> 10));
-        destination[i++] = 0xed;
-        destination[i++] = 0xb0 | (0x0f & (code >>> 6));
-        destination[i++] = 0x80 | (0x3f & code);
-        read++;
+        length += 6;
         // A code point above U+FFFF occupies two code units; skip the second one.
+        cp++;
       }
-      read++;
     }
-    return { read, written: i };
+    return length;
   }
 }