Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
198 changes: 192 additions & 6 deletions packages/mutf-8/src/index.bench.mts
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,17 @@
import { bench, describe } from "vitest";
import { MUtf8Decoder, MUtf8Encoder } from "./index.js";

// Base sample texts for the benchmarks, one per character class.
// English pangram consisting only of ASCII characters.
const asciiText = "The quick brown fox jumps over the lazy dog.";
// German pangram: mostly ASCII plus a few Latin-1 letters (ö, ä, ü, ß).
const latinText = "Victor jagt zwölf Boxkämpfer quer über den großen Sylter Deich.";
// Japanese sentence mixing hiragana, katakana and kanji.
const asianText = "あのイーハトーヴォのすきとおった風、夏でも底に冷たさをもつ青いそら";
// Emoji only; every character lies outside the Basic Multilingual Plane,
// so each one occupies two UTF-16 code units in a JavaScript string.
const emojiText = "😀😃😄😁😆😅😂🤣😊😇🙂🙃😉😌😍🥰😘😗😙😚";
// All four samples above joined with single spaces.
const mixedText = `${asciiText} ${latinText} ${asianText} ${emojiText}`;

// Encoder and decoder instances created once at module level.
const encoder = new MUtf8Encoder();
const decoder = new MUtf8Decoder();

describe("MUtf8Decoder.decode", () => {
describe("ASCII text", () => {
const asciiText = "The quick brown fox jumps over the lazy dog.";
const asciiBytes500 = encoder.encode(stretch(asciiText, 500));
const asciiBytes5K = encoder.encode(stretch(asciiText, 5000));
const asciiBytes50K = encoder.encode(stretch(asciiText, 50000));
Expand All @@ -27,7 +32,6 @@ describe("MUtf8Decoder.decode", () => {
});

describe("Latin-1 text", () => {
const latinText = "Victor jagt zwölf Boxkämpfer quer über den großen Sylter Deich.";
const latin1Bytes500 = encoder.encode(stretch(latinText, 500));
const latin1Bytes5K = encoder.encode(stretch(latinText, 5000));
const latin1Bytes50K = encoder.encode(stretch(latinText, 50000));
Expand All @@ -46,7 +50,6 @@ describe("MUtf8Decoder.decode", () => {
});

describe("Asian text", () => {
const asianText = "あのイーハトーヴォのすきとおった風、夏でも底に冷たさをもつ青いそら";
const asianBytes500 = encoder.encode(stretch(asianText, 500));
const asianBytes5K = encoder.encode(stretch(asianText, 5000));
const asianBytes50K = encoder.encode(stretch(asianText, 50000));
Expand All @@ -65,7 +68,6 @@ describe("MUtf8Decoder.decode", () => {
});

describe("Emoji text", () => {
const emojiText = "😀😃😄😁😆😅😂🤣😊😇🙂🙃😉😌😍🥰😘😗😙😚";
const emojiBytes500 = encoder.encode(stretch(emojiText, 500));
const emojiBytes5K = encoder.encode(stretch(emojiText, 5000));
const emojiBytes50K = encoder.encode(stretch(emojiText, 50000));
Expand All @@ -84,8 +86,6 @@ describe("MUtf8Decoder.decode", () => {
});

describe("Mixed text", () => {
const mixedText =
"The quick brown fox jumps over the lazy dog. Victor jagt zwölf Boxkämpfer quer über den großen Sylter Deich. あのイーハトーヴォのすきとおった風、夏でも底に冷たさをもつ青いそら 😀😃😄😁😆😅😂🤣😊😇🙂🙃😉😌😍🥰😘😗😙😚";
const mixedBytes500 = encoder.encode(stretch(mixedText, 500));
const mixedBytes5K = encoder.encode(stretch(mixedText, 5000));
const mixedBytes50K = encoder.encode(stretch(mixedText, 50000));
Expand All @@ -104,6 +104,192 @@ describe("MUtf8Decoder.decode", () => {
});
});

// Benchmarks `MUtf8Encoder.encode` for every sample text at three input lengths.
describe("MUtf8Encoder.encode", () => {
  // Suite title paired with the sample text that is stretched to each size.
  const samples = [
    ["ASCII text", asciiText],
    ["Latin-1 text", latinText],
    ["Asian text", asianText],
    ["Emoji text", emojiText],
    ["Mixed text", mixedText],
  ] as const;

  // Benchmark title paired with the input length in UTF-16 code units.
  const cases = [
    ["encode 500 length string", 500],
    ["encode 5K length string", 5000],
    ["encode 50K length string", 50000],
  ] as const;

  for (const [suiteTitle, sample] of samples) {
    describe(suiteTitle, () => {
      for (const [benchTitle, size] of cases) {
        // Build the input while the suite is collected so that only the
        // encode call itself is measured.
        const input = stretch(sample, size);

        bench(benchTitle, () => {
          encoder.encode(input);
        });
      }
    });
  }
});

// Benchmarks `MUtf8Encoder.encodeInto` for every sample text at three input lengths.
describe("MUtf8Encoder.encodeInto", () => {
  // One destination buffer reused by every benchmark in this suite.
  const buffer = new Uint8Array(800000);

  // Suite title paired with the sample text that is stretched to each size.
  const samples = [
    ["ASCII text", asciiText],
    ["Latin-1 text", latinText],
    ["Asian text", asianText],
    ["Emoji text", emojiText],
    ["Mixed text", mixedText],
  ] as const;

  // Benchmark title paired with the input length in UTF-16 code units.
  const cases = [
    ["encode 500 length string", 500],
    ["encode 5K length string", 5000],
    ["encode 50K length string", 50000],
  ] as const;

  for (const [suiteTitle, sample] of samples) {
    describe(suiteTitle, () => {
      for (const [benchTitle, size] of cases) {
        // Build the input while the suite is collected so that only the
        // encodeInto call itself is measured.
        const input = stretch(sample, size);

        bench(benchTitle, () => {
          encoder.encodeInto(input, buffer);
        });
      }
    });
  }
});

/**
 * Builds a string of exactly `length` UTF-16 code units by repeating `text`
 * and cutting off the excess.
 *
 * The cut is made on code units, so a surrogate pair at the boundary may be
 * split in half.
 *
 * @param text - The pattern to repeat.
 * @param length - The desired result length in UTF-16 code units.
 * @returns The repeated pattern truncated to `length`, or `""` when `length`
 *   is zero or negative.
 * @throws RangeError if `text` is empty while `length` is positive, because no
 *   amount of repetition can reach the requested length.
 */
function stretch(text: string, length: number): string {
  // Nothing to build; also keeps a negative repeat count away from `repeat()`.
  if (length <= 0) {
    return "";
  }
  // Without this guard the repeat count below would be `Infinity`, which
  // fails with an unhelpful "Invalid count value" error.
  if (text.length === 0) {
    throw new RangeError("stretch: text must not be empty when length is positive");
  }
  return text.repeat(Math.ceil(length / text.length)).slice(0, length);
}
4 changes: 2 additions & 2 deletions packages/mutf-8/src/index.test.mts
Original file line number Diff line number Diff line change
Expand Up @@ -96,8 +96,8 @@ describe("MUtf8Encoder.encode()", () => {
});

test.each(testdata)("encode the text: $text", ({ text, binary }) => {
const decoder = new MUtf8Decoder();
expect(decoder.decode(binary)).toBe(text);
const encoder = new MUtf8Encoder();
expect(encoder.encode(text)).toEqual(binary);
});
});

Expand Down
103 changes: 63 additions & 40 deletions packages/mutf-8/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -325,29 +325,31 @@ export class MUtf8Encoder {
* @returns A new `Uint8Array` containing the Modified UTF-8 encoded bytes
*/
encode(input = ""): Uint8Array {
const bin: number[] = [];
for (const c of input) {
const bytes = new Uint8Array(this.#estimateByteLength(input));
let bp = 0;
for (let cp = 0; cp < input.length; cp++) {
// biome-ignore lint/style/noNonNullAssertion: `c` is always a non-empty string.
const code = c.codePointAt(0)!;
const code = input.codePointAt(cp)!;
if (0x0001 <= code && code <= 0x007f) {
bin.push(code);
bytes[bp++] = code;
} else if (code <= 0x07ff) {
bin.push(0xc0 | (code >>> 6));
bin.push(0x80 | (0x3f & code));
bytes[bp++] = 0xc0 | (code >>> 6);
bytes[bp++] = 0x80 | (0x3f & code);
} else if (code <= 0xffff) {
bin.push(0xe0 | (code >>> 12));
bin.push(0x80 | (0x3f & (code >>> 6)));
bin.push(0x80 | (0x3f & code));
bytes[bp++] = 0xe0 | (code >>> 12);
bytes[bp++] = 0x80 | (0x3f & (code >>> 6));
bytes[bp++] = 0x80 | (0x3f & code);
} else {
bin.push(0xed);
bin.push(0xa0 | ((code >>> 16) - 1));
bin.push(0x80 | (0x3f & (code >>> 10)));
bin.push(0xed);
bin.push(0xb0 | (0x0f & (code >>> 6)));
bin.push(0x80 | (0x3f & code));
bytes[bp++] = 0xed;
bytes[bp++] = 0xa0 | ((code >>> 16) - 1);
bytes[bp++] = 0x80 | (0x3f & (code >>> 10));
bytes[bp++] = 0xed;
bytes[bp++] = 0xb0 | (0x0f & (code >>> 6));
bytes[bp++] = 0x80 | (0x3f & code);
cp++;
}
}
return new Uint8Array(bin);
return bytes;
}

/**
Expand All @@ -365,36 +367,57 @@ export class MUtf8Encoder {
* @returns An object indicating how many characters were read and bytes written
*/
encodeInto(source: string, destination: Uint8Array): TextEncoderEncodeIntoResult {
const destLen = destination.length;
let i = 0;
let read = 0;
for (const c of source) {
const capacity = destination.length;
let bp = 0;
let cp = 0;
while (cp < source.length) {
// biome-ignore lint/style/noNonNullAssertion: `c` is always a non-empty string.
const code = c.codePointAt(0)!;
const code = source.codePointAt(cp)!;
if (0x0001 <= code && code <= 0x007f) {
if (capacity <= bp) break;
destination[bp++] = code;
cp++;
} else if (code <= 0x07ff) {
if (capacity <= bp + 1) break;
destination[bp++] = 0xc0 | (code >>> 6);
destination[bp++] = 0x80 | (0x3f & code);
cp++;
} else if (code <= 0xffff) {
if (capacity <= bp + 2) break;
destination[bp++] = 0xe0 | (code >>> 12);
destination[bp++] = 0x80 | (0x3f & (code >>> 6));
destination[bp++] = 0x80 | (0x3f & code);
cp++;
} else {
if (capacity <= bp + 5) break;
destination[bp++] = 0xed;
destination[bp++] = 0xa0 | ((code >>> 16) - 1);
destination[bp++] = 0x80 | (0x3f & (code >>> 10));
destination[bp++] = 0xed;
destination[bp++] = 0xb0 | (0x0f & (code >>> 6));
destination[bp++] = 0x80 | (0x3f & code);
cp += 2;
}
}
return { read: cp, written: bp };
}

#estimateByteLength(source: string): number {
let length = 0;
for (let cp = 0; cp < source.length; cp++) {
// biome-ignore lint/style/noNonNullAssertion: `source` is always a non-empty string.
const code = source.codePointAt(cp)!;
if (0x0001 <= code && code <= 0x007f) {
if (destLen <= i) break;
destination[i++] = code;
length += 1;
} else if (code <= 0x07ff) {
if (destLen <= i + 1) break;
destination[i++] = 0xc0 | (code >>> 6);
destination[i++] = 0x80 | (0x3f & code);
length += 2;
} else if (code <= 0xffff) {
if (destLen <= i + 2) break;
destination[i++] = 0xe0 | (code >>> 12);
destination[i++] = 0x80 | (0x3f & (code >>> 6));
destination[i++] = 0x80 | (0x3f & code);
length += 3;
} else {
if (destLen <= i + 5) break;
destination[i++] = 0xed;
destination[i++] = 0xa0 | ((code >>> 16) - 1);
destination[i++] = 0x80 | (0x3f & (code >>> 10));
destination[i++] = 0xed;
destination[i++] = 0xb0 | (0x0f & (code >>> 6));
destination[i++] = 0x80 | (0x3f & code);
read++;
length += 6;
cp++;
}
read++;
}
return { read, written: i };
return length;
}
}