From 99945463be26949f970632d9fab11290cd71c13f Mon Sep 17 00:00:00 2001 From: evermake Date: Wed, 25 Mar 2026 21:40:14 +0500 Subject: [PATCH 1/7] chore: remove unused `deno-lint-ignore` --- test/jsx.test.tsx | 1 - 1 file changed, 1 deletion(-) diff --git a/test/jsx.test.tsx b/test/jsx.test.tsx index 3273ba0..dc22bc1 100644 --- a/test/jsx.test.tsx +++ b/test/jsx.test.tsx @@ -2,7 +2,6 @@ import { describe, expect, it } from 'vitest' describe('jsx', () => { it('should transform fragments', () => { - // deno-lint-ignore jsx-no-useless-fragment expect(<>).toEqual({ type: 'fragment', subelements: [], From 6795a6a3d5efeefcf4ac3c30ae45fb720eada0e3 Mon Sep 17 00:00:00 2001 From: evermake Date: Wed, 25 Mar 2026 23:22:34 +0500 Subject: [PATCH 2/7] chore: change some eslint rules --- eslint.config.js | 5 ++++- src/jsx.ts | 11 +++++++---- src/render.ts | 3 ++- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/eslint.config.js b/eslint.config.js index c8452e0..e089a0a 100644 --- a/eslint.config.js +++ b/eslint.config.js @@ -5,7 +5,7 @@ export default antfu({ indent: 2, semi: false, quotes: 'single', - jsx: true, + jsx: false, }, formatters: true, typescript: true, @@ -15,5 +15,8 @@ export default antfu({ rules: { 'ts/no-empty-object-type': 'off', 'ts/no-namespace': 'off', + 'style/brace-style': ['error', '1tbs', { allowSingleLine: false }], + 'style/arrow-parens': ['error', 'always'], + 'curly': ['error', 'all'], }, }) diff --git a/src/jsx.ts b/src/jsx.ts index 9fd6bdb..ab6dfb1 100644 --- a/src/jsx.ts +++ b/src/jsx.ts @@ -28,8 +28,9 @@ export function createElement( }) } - if (typeof type === 'function') + if (typeof type === 'function') { return type({ ...props, children }) + } throw new Error(`Invalid JSX component: ${type}.`) } @@ -49,11 +50,13 @@ function elementsFromNode(node: TgxNode): TgxElement[] { return [{ type: 'plain', value: node }] } - if (node == null) + if (node == null) { return [{ type: 'plain', value: node }] + } - if (Array.isArray(node)) - return node.flatMap(child => elementsFromNode(child)) + if (Array.isArray(node)) { + return node.flatMap((child) => elementsFromNode(child)) + } return [node] } diff --git a/src/render.ts b/src/render.ts index ec4a4ea..1c4c898 100644 --- a/src/render.ts +++ b/src/render.ts @@ -53,8 +53,9 @@ function renderTextElement(el: TgxElementText): string { } function renderPlainElement({ value }: TgxElementPlain): string { - if (value == null || typeof value === 'boolean') + if (value == null || typeof value === 'boolean') { return '' + } return sanitize(String(value)) } From d886cf68cc802b4f2884e4e197e4e18c02badd07 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Wed, 25 Mar 2026 15:16:15 +0000 Subject: [PATCH 3/7] feat: add telegram message entities parser Co-authored-by: Vladislav Deryabkin --- src/index.ts | 1 + src/parser.ts | 197 ++++++++++++++++++++++++++++++++++++++++++++ test/parser.test.ts | 119 ++++++++++++++++++++++++++ 3 files changed, 317 insertions(+) create mode 100644 src/parser.ts create mode 100644 test/parser.test.ts diff --git a/src/index.ts b/src/index.ts index 1ceffe9..ee26a56 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,4 +1,5 @@ export * from './jsx.ts' export { Line } from './Line.ts' +export * from './parser.ts' export * from './render.ts' export * from './types.ts' diff --git a/src/parser.ts b/src/parser.ts new file mode 100644 index 0000000..219d593 --- /dev/null +++ b/src/parser.ts @@ -0,0 +1,197 @@ +import type { TextEntity, TgxElement, TgxElementText } from './types.ts' + +export type TelegramMessageEntityType + = | 'mention' + | 'hashtag' + | 'cashtag' + | 'bot_command' + | 'url' + | 'email' + | 'phone_number' + | 'bold' + | 'italic' + | 'underline' + | 'strikethrough' + | 'spoiler' + | 'code' + | 'pre' + | 'blockquote' + | 'expandable_blockquote' + | 'text_link' + | 'text_mention' + | 'custom_emoji' + +interface TelegramMessageEntityBase { + type: TelegramMessageEntityType | (string & {}) + offset: number + length: number +} + +interface TelegramMessageEntityTextLink extends TelegramMessageEntityBase { + type: 'text_link' + url: string +} + +interface TelegramMessageEntityPre extends TelegramMessageEntityBase { + type: 'pre' + language?: string +} + +interface TelegramMessageEntityCustomEmoji extends TelegramMessageEntityBase { + type: 'custom_emoji' + custom_emoji_id: string +} + +interface TelegramMessageEntityFallback extends TelegramMessageEntityBase { + [key: string]: unknown +} + +export type TelegramMessageEntity + = | TelegramMessageEntityTextLink + | TelegramMessageEntityPre + | TelegramMessageEntityCustomEmoji + | TelegramMessageEntityFallback + +interface ParsedEntity { + start: number + end: number + entity: TextEntity +} + +interface OpenEntity { + end: number + node: TgxElementText +} + +/** + * Converts Telegram message `text` + `entities` into a TGX tree. + * + * Offsets/lengths follow Telegram's UTF-16 indexing model. + * Invalid and crossing entities are ignored. + */ +export function parseMessageEntities( + text: string, + entities: readonly TelegramMessageEntity[] = [], +): TgxElement { + const parsedEntities = collectValidEntities(text, entities) + const root: TgxElement[] = [] + const openStack: OpenEntity[] = [] + const boundaries = buildBoundaries(parsedEntities, text.length) + + let cursor = 0 + for (const boundary of boundaries) { + const chunk = text.slice(cursor, boundary) + if (chunk) { + const target = openStack.length > 0 ? openStack.at(-1)!.node.subelements : root + target.push({ type: 'plain', value: chunk }) + } + + while (openStack.length > 0 && openStack.at(-1)!.end === boundary) + openStack.pop() + + for (const item of parsedEntities) { + if (item.start !== boundary) + continue + + const node: TgxElementText = { type: 'text', entity: item.entity, subelements: [] } + const target = openStack.length > 0 ? openStack.at(-1)!.node.subelements : root + target.push(node) + openStack.push({ end: item.end, node }) + } + + cursor = boundary + } + + return { type: 'fragment', subelements: root } +} + +function buildBoundaries(entities: readonly ParsedEntity[], textLength: number): number[] { + const boundaries = new Set([textLength]) + for (const entity of entities) { + boundaries.add(entity.start) + boundaries.add(entity.end) + } + return [...boundaries].sort((a, b) => a - b) +} + +function collectValidEntities( + text: string, + entities: readonly TelegramMessageEntity[], +): ParsedEntity[] { + const sorted = [...entities] + .sort((a, b) => { + if (a.offset !== b.offset) + return a.offset - b.offset + return b.length - a.length + }) + + const valid: ParsedEntity[] = [] + const stack: number[] = [] + + for (const entity of sorted) { + const start = entity.offset + const end = entity.offset + entity.length + + if (!isValidRange(start, end, text.length)) + continue + + const mapped = mapEntity(text, entity, start, end) + if (!mapped) + continue + + while (stack.length > 0 && start >= stack.at(-1)!) + stack.pop() + + if (stack.length > 0 && end > stack.at(-1)!) + continue + + valid.push({ start, end, entity: mapped }) + stack.push(end) + } + + return valid +} + +function isValidRange(start: number, end: number, textLength: number): boolean { + if (!Number.isInteger(start) || !Number.isInteger(end)) + return false + if (start < 0 || end < 0 || end <= start) + return false + return end <= textLength +} + +function mapEntity( + text: string, + entity: TelegramMessageEntity, + start: number, + end: number, +): TextEntity | null { + switch (entity.type) { + case 'bold': + return { type: 'bold' } + case 'italic': + return { type: 'italic' } + case 'underline': + return { type: 'underline' } + case 'strikethrough': + return { type: 'strikethrough' } + case 'spoiler': + return { type: 'spoiler' } + case 'code': + return { type: 'code' } + case 'pre': + return { type: 'codeblock', language: entity.language } + case 'text_link': + return entity.url ? { type: 'link', url: entity.url } : null + case 'custom_emoji': + return entity.custom_emoji_id + ? { type: 'custom-emoji', id: entity.custom_emoji_id, alt: text.slice(start, end) } + : null + case 'blockquote': + return { type: 'blockquote', expandable: false } + case 'expandable_blockquote': + return { type: 'blockquote', expandable: true } + default: + return null + } +} diff --git a/test/parser.test.ts b/test/parser.test.ts new file mode 100644 index 0000000..f289d73 --- /dev/null +++ b/test/parser.test.ts @@ -0,0 +1,119 @@ +import type { TelegramMessageEntity } from '../src/parser.ts' +import { describe, expect, it } from 'vitest' +import { parseMessageEntities } from '../src/parser.ts' + +describe('parseMessageEntities', () => { + it('returns plain fragment when entities are missing', () => { + expect(parseMessageEntities('hello')).toEqual({ + type: 'fragment', + subelements: [{ type: 'plain', value: 'hello' }], + }) + }) + + it('parses nested entities with stable structure', () => { + const text = 'hello world' + const entities: TelegramMessageEntity[] = [ + { type: 'bold', offset: 0, length: 11 }, + { type: 'italic', offset: 6, length: 5 }, + ] + + expect(parseMessageEntities(text, entities)).toEqual({ + type: 'fragment', + subelements: [{ + type: 'text', + entity: { type: 'bold' }, + subelements: [ + { type: 'plain', value: 'hello ' }, + { + type: 'text', + entity: { type: 'italic' }, + subelements: [{ type: 'plain', value: 'world' }], + }, + ], + }], + }) + }) + + it('parses adjacent entities correctly', () => { + const text = 'abcd' + const entities: TelegramMessageEntity[] = [ + { type: 'bold', offset: 0, length: 2 }, + { type: 'italic', offset: 2, length: 2 }, + ] + + expect(parseMessageEntities(text, entities)).toEqual({ + type: 'fragment', + subelements: [ + { + type: 'text', + entity: { type: 'bold' }, + subelements: [{ type: 'plain', value: 'ab' }], + }, + { + type: 'text', + entity: { type: 'italic' }, + subelements: [{ type: 'plain', value: 'cd' }], + }, + ], + }) + }) + + it('maps link, codeblock and custom emoji entities', () => { + const text = 'ab๐Ÿ™‚de' + const entities: TelegramMessageEntity[] = [ + { type: 'text_link', offset: 0, length: 2, url: 'https://example.com' }, + { type: 'custom_emoji', offset: 2, length: 2, custom_emoji_id: '42' }, + { type: 'pre', offset: 4, length: 1, language: 'ts' }, + ] + + expect(parseMessageEntities(text, entities)).toEqual({ + type: 'fragment', + subelements: [ + { + type: 'text', + entity: { type: 'link', url: 'https://example.com' }, + subelements: [{ type: 'plain', value: 'ab' }], + }, + { + type: 'text', + entity: { type: 'custom-emoji', id: '42', alt: '๐Ÿ™‚' }, + subelements: [], + }, + { + type: 'text', + entity: { type: 'codeblock', language: 'ts' }, + subelements: [{ type: 'plain', value: 'd' }], + }, + { type: 'plain', value: 'e' }, + ], + }) + }) + + it('ignores invalid and crossing entities', () => { + const text = 'abcdef' + const entities: TelegramMessageEntity[] = [ + { type: 'bold', offset: -1, length: 1 }, + { type: 'italic', offset: 0, length: 0 }, + { type: 'underline', offset: 0, length: 4 }, + { type: 'strikethrough', offset: 2, length: 4 }, // crossing with underline, should be skipped + { type: 'code', offset: 4, length: 2 }, + { type: 'mention', offset: 0, length: 2 }, // unsupported, ignored + ] + + expect(parseMessageEntities(text, entities)).toEqual({ + type: 'fragment', + subelements: [ + { + type: 'text', + entity: { type: 'underline' }, + subelements: [{ type: 'plain', value: 'abcd' }], + }, + { + type: 'text', + entity: { type: 'code' }, + subelements: [{ type: 'plain', value: 'ef' }], + }, + ], + }) + }) +}) From ea36497db49ba258f2889f4ee035d88747ffa262 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Wed, 25 Mar 2026 15:18:24 +0000 Subject: [PATCH 4/7] fix: keep custom emoji parser nodes leaf-only Co-authored-by: Vladislav Deryabkin --- src/parser.ts | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/parser.ts b/src/parser.ts index 219d593..e1d0cc9 100644 --- a/src/parser.ts +++ b/src/parser.ts @@ -81,7 +81,7 @@ export function parseMessageEntities( let cursor = 0 for (const boundary of boundaries) { const chunk = text.slice(cursor, boundary) - if (chunk) { + if (chunk && !isSuppressedChunk(openStack)) { const target = openStack.length > 0 ? openStack.at(-1)!.node.subelements : root target.push({ type: 'plain', value: chunk }) } @@ -105,6 +105,10 @@ export function parseMessageEntities( return { type: 'fragment', subelements: root } } +function isSuppressedChunk(openStack: readonly OpenEntity[]): boolean { + return openStack.at(-1)?.node.entity.type === 'custom-emoji' +} + function buildBoundaries(entities: readonly ParsedEntity[], textLength: number): number[] { const boundaries = new Set([textLength]) for (const entity of entities) { From a8a24dae9a759e060da722341a629d510131e9ae Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Wed, 25 Mar 2026 15:32:33 +0000 Subject: [PATCH 5/7] refactor: unify parser entity type and add utf16 tests Co-authored-by: Vladislav Deryabkin --- src/parser.ts | 40 +++++-------------- test/parser.test.ts | 93 ++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 97 insertions(+), 36 deletions(-) diff --git a/src/parser.ts b/src/parser.ts index e1d0cc9..f09b6c7 100644 --- a/src/parser.ts +++ b/src/parser.ts @@ -1,7 +1,8 @@ import type { TextEntity, TgxElement, TgxElementText } from './types.ts' -export type TelegramMessageEntityType - = | 'mention' +export interface MessageEntity { + type: + | 'mention' | 'hashtag' | 'cashtag' | 'bot_command' @@ -20,38 +21,15 @@ export type TelegramMessageEntityType | 'text_link' | 'text_mention' | 'custom_emoji' - -interface TelegramMessageEntityBase { - type: TelegramMessageEntityType | (string & {}) + | (string & {}) offset: number length: number -} - -interface TelegramMessageEntityTextLink extends TelegramMessageEntityBase { - type: 'text_link' - url: string -} - -interface TelegramMessageEntityPre extends TelegramMessageEntityBase { - type: 'pre' + url?: string language?: string -} - -interface TelegramMessageEntityCustomEmoji extends TelegramMessageEntityBase { - type: 'custom_emoji' - custom_emoji_id: string -} - -interface TelegramMessageEntityFallback extends TelegramMessageEntityBase { + custom_emoji_id?: string [key: string]: unknown } -export type TelegramMessageEntity - = | TelegramMessageEntityTextLink - | TelegramMessageEntityPre - | TelegramMessageEntityCustomEmoji - | TelegramMessageEntityFallback - interface ParsedEntity { start: number end: number @@ -71,7 +49,7 @@ interface OpenEntity { */ export function parseMessageEntities( text: string, - entities: readonly TelegramMessageEntity[] = [], + entities: readonly MessageEntity[] = [], ): TgxElement { const parsedEntities = collectValidEntities(text, entities) const root: TgxElement[] = [] @@ -120,7 +98,7 @@ function buildBoundaries(entities: readonly ParsedEntity[], textLength: number): function collectValidEntities( text: string, - entities: readonly TelegramMessageEntity[], + entities: readonly MessageEntity[], ): ParsedEntity[] { const sorted = [...entities] .sort((a, b) => { @@ -166,7 +144,7 @@ function isValidRange(start: number, end: number, textLength: number): boolean { function mapEntity( text: string, - entity: TelegramMessageEntity, + entity: MessageEntity, start: number, end: number, ): TextEntity | null { diff --git a/test/parser.test.ts b/test/parser.test.ts index f289d73..6216b82 100644 --- a/test/parser.test.ts +++ b/test/parser.test.ts @@ -1,7 +1,11 @@ -import type { TelegramMessageEntity } from '../src/parser.ts' +import type { MessageEntity } from '../src/parser.ts' import { describe, expect, it } from 'vitest' import { parseMessageEntities } from '../src/parser.ts' +function cu(value: string): number { + return value.length +} + describe('parseMessageEntities', () => { it('returns plain fragment when entities are missing', () => { expect(parseMessageEntities('hello')).toEqual({ @@ -12,7 +16,7 @@ describe('parseMessageEntities', () => { it('parses nested entities with stable structure', () => { const text = 'hello world' - const entities: TelegramMessageEntity[] = [ + const entities: MessageEntity[] = [ { type: 'bold', offset: 0, length: 11 }, { type: 'italic', offset: 6, length: 5 }, ] @@ -36,7 +40,7 @@ describe('parseMessageEntities', () => { it('parses adjacent entities correctly', () => { const text = 'abcd' - const entities: TelegramMessageEntity[] = [ + const entities: MessageEntity[] = [ { type: 'bold', offset: 0, length: 2 }, { type: 'italic', offset: 2, length: 2 }, ] @@ -60,7 +64,7 @@ describe('parseMessageEntities', () => { it('maps link, codeblock and custom emoji entities', () => { const text = 'ab๐Ÿ™‚de' - const entities: TelegramMessageEntity[] = [ + const entities: MessageEntity[] = [ { type: 'text_link', offset: 0, length: 2, url: 'https://example.com' }, { type: 'custom_emoji', offset: 2, length: 2, custom_emoji_id: '42' }, { type: 'pre', offset: 4, length: 1, language: 'ts' }, @@ -91,7 +95,7 @@ describe('parseMessageEntities', () => { it('ignores invalid and crossing entities', () => { const text = 'abcdef' - const entities: TelegramMessageEntity[] = [ + const entities: MessageEntity[] = [ { type: 'bold', offset: -1, length: 1 }, { type: 'italic', offset: 0, length: 0 }, { type: 'underline', offset: 0, length: 4 }, @@ -116,4 +120,83 @@ describe('parseMessageEntities', () => { ], }) }) + + it('handles UTF-16 offsets for surrogate pairs and combining marks', () => { + const text = 'A๐Ÿ™‚e\u0301ะ–' + const emojiOffset = cu('A') + const emojiLength = cu('๐Ÿ™‚') + const combiningOffset = cu('A๐Ÿ™‚') + const combiningLength = cu('e\u0301') + const cyrillicOffset = cu('A๐Ÿ™‚e\u0301') + const cyrillicLength = cu('ะ–') + + expect(emojiOffset).toBe(1) + expect(emojiLength).toBe(2) + expect(combiningOffset).toBe(3) + expect(combiningLength).toBe(2) + expect(cyrillicOffset).toBe(5) + expect(cyrillicLength).toBe(1) + + const entities: MessageEntity[] = [ + { type: 'bold', offset: emojiOffset, length: emojiLength }, // ๐Ÿ™‚ + { type: 'italic', offset: combiningOffset, length: combiningLength }, // e + combining acute + { type: 'underline', offset: cyrillicOffset, length: cyrillicLength }, // ะ– + ] + + expect(parseMessageEntities(text, entities)).toEqual({ + type: 'fragment', + subelements: [ + { type: 'plain', value: 'A' }, + { + type: 'text', + entity: { type: 'bold' }, + subelements: [{ type: 'plain', value: '๐Ÿ™‚' }], + }, + { + type: 'text', + entity: { type: 'italic' }, + subelements: [{ type: 'plain', value: 'e\u0301' }], + }, + { + type: 'text', + entity: { type: 'underline' }, + subelements: [{ type: 'plain', value: 'ะ–' }], + }, + ], + }) + }) + + it('handles UTF-16 offsets for ZWJ emoji sequences', () => { + const family = '๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆ' + const text = `x${family}y` + const familyOffset = cu('x') + const familyLength = cu(family) + const tailOffset = cu(`x${family}`) + + expect(familyOffset).toBe(1) + expect(familyLength).toBe(11) + expect(tailOffset).toBe(12) + + const entities: MessageEntity[] = [ + { type: 'custom_emoji', offset: familyOffset, length: familyLength, custom_emoji_id: 'family-1' }, + { type: 'code', offset: tailOffset, length: cu('y') }, + ] + + expect(parseMessageEntities(text, entities)).toEqual({ + type: 'fragment', + subelements: [ + { type: 'plain', value: 'x' }, + { + type: 'text', + entity: { type: 'custom-emoji', id: 'family-1', alt: family }, + subelements: [], + }, + { + type: 'text', + entity: { type: 'code' }, + subelements: [{ type: 'plain', value: 'y' }], + }, + ], + }) + }) }) From 04dd510cb2c61d34786010fcdbb4e2083745ead8 Mon Sep 17 00:00:00 2001 From: evermake Date: Wed, 25 Mar 2026 23:21:54 +0500 Subject: [PATCH 6/7] improve tests & implementation --- src/index.ts | 2 +- src/parse.ts | 260 ++++++++++++++++++++++++++++++++++++++++++++ src/parser.ts | 179 ------------------------------ test/parse.test.tsx | 170 +++++++++++++++++++++++++++++ test/parser.test.ts | 202 ---------------------------------- tsconfig.lib.json | 2 +- 6 files changed, 432 insertions(+), 383 deletions(-) create mode 100644 src/parse.ts delete mode 100644 src/parser.ts create mode 100644 test/parse.test.tsx delete mode 100644 test/parser.test.ts diff --git a/src/index.ts b/src/index.ts index ee26a56..c8f32ea 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,5 +1,5 @@ export * from './jsx.ts' export { Line } from './Line.ts' -export * from './parser.ts' +export * from './parse.ts' export * from './render.ts' export * from './types.ts' diff --git a/src/parse.ts b/src/parse.ts new file mode 100644 index 0000000..5363e34 --- /dev/null +++ b/src/parse.ts @@ -0,0 +1,260 @@ +import type { TextEntity, TgxElement, TgxElementText } from './types.ts' + +/** + * @see https://core.telegram.org/bots/api#messageentity + */ +export interface MessageEntity { + type: + | 'mention' + | 'hashtag' + | 'cashtag' + | 'bot_command' + | 'url' + | 'email' + | 'phone_number' + | 'bold' + | 'italic' + | 'underline' + | 'strikethrough' + | 'spoiler' + | 'code' + | 'pre' + | 'blockquote' + | 'expandable_blockquote' + | 'text_link' + | 'text_mention' + | 'custom_emoji' + | 'date_time' + offset: number + length: number + url?: string + language?: string + custom_emoji_id?: string + unix_time?: number + date_time_format?: string +} + +/** + * Converts formatted Telegram text with message entities to a {@link TgxElement}. + */ +export function parseEntities( + text: string, + entities: ReadonlyArray = [], +): TgxElement { + const parsedEntities = mergeAdjacentEntities(collectValidEntities(text, entities)) + const root: TgxElement[] = [] + const openStack: OpenEntity[] = [] + const boundaries = buildBoundaries(parsedEntities, text.length) + + let cursor = 0 + for (const boundary of boundaries) { + const chunk = text.slice(cursor, boundary) + if (chunk && !isSuppressedChunk(openStack)) { + const target = openStack.length > 0 ? openStack.at(-1)!.node.subelements : root + target.push({ type: 'plain', value: chunk }) + } + + while (openStack.length > 0 && openStack.at(-1)!.end === boundary) { + openStack.pop() + } + + for (const item of parsedEntities) { + if (item.start !== boundary) { + continue + } + + const node: TgxElementText = { type: 'text', entity: item.entity, subelements: [] } + const target = openStack.length > 0 ? openStack.at(-1)!.node.subelements : root + target.push(node) + openStack.push({ end: item.end, node }) + } + + cursor = boundary + } + + return { type: 'fragment', subelements: root } +} + +interface ParsedEntity { + start: number + end: number + entity: TextEntity +} + +interface OpenEntity { + end: number + node: TgxElementText +} + +/** + * Merges adjacent entities of the same type that exist at the same nesting + * level. Two entities A and B (A.end === B.start) can be merged when no other + * entity C creates a nesting boundary exactly between them: + * - Condition 1: no C with C.start โ‰ค A.start and C.end === A.end + * (C ends exactly at the boundary โ€” merged M would escape C) + * - Condition 2: no C with C.start === B.start and C.end > B.end + * (C starts at the boundary and extends further โ€” merged M would cross C) + * + * The process is repeated until no more merges are possible. + */ +function mergeAdjacentEntities(entities: ParsedEntity[]): ParsedEntity[] { + let result = entities.slice() + let pair = findMergePair(result) + while (pair !== null) { + const [i, j] = pair + const A = result[i]! + const B = result[j]! + const merged: ParsedEntity = { start: A.start, end: B.end, entity: A.entity } + result = result.filter((_, k) => k !== i && k !== j) + result.push(merged) + result.sort((a, b) => (a.start !== b.start ? a.start - b.start : b.end - a.end)) + pair = findMergePair(result) + } + return result +} + +function findMergePair(entities: ParsedEntity[]): [number, number] | null { + for (let i = 0; i < entities.length; i++) { + for (let j = 0; j < entities.length; j++) { + if (i === j) { + continue + } + const A = entities[i]! + const B = entities[j]! + if (A.end !== B.start || !entitiesDeepEqual(A.entity, B.entity)) { + continue + } + if (canMergeAdjacentEntities(A, B, entities)) { + return [i, j] + } + } + } + return null +} + +function canMergeAdjacentEntities( + A: ParsedEntity, + B: ParsedEntity, + all: ParsedEntity[], +): boolean { + for (const C of all) { + if (C === A || C === B) { + continue + } + if (C.start <= A.start && C.end === A.end) { + return false + } + if (C.start === B.start && C.end > B.end) { + return false + } + } + return true +} + +function entitiesDeepEqual(a: TextEntity, b: TextEntity): boolean { + return JSON.stringify(a) === JSON.stringify(b) +} + +function isSuppressedChunk(openStack: Array): boolean { + return openStack.at(-1)?.node.entity.type === 'custom-emoji' +} + +function buildBoundaries(entities: Array, textLength: number): number[] { + const boundaries = new Set([textLength]) + for (const entity of entities) { + boundaries.add(entity.start) + boundaries.add(entity.end) + } + return Array.from(boundaries).sort((a, b) => a - b) +} + +function collectValidEntities( + text: string, + entities: readonly MessageEntity[], +): ParsedEntity[] { + const sorted = entities.toSorted((a, b) => { + if (a.offset !== b.offset) { + return a.offset - b.offset + } + return b.length - a.length + }) + + const valid: ParsedEntity[] = [] + const stack: number[] = [] + + for (const entity of sorted) { + const start = entity.offset + const end = entity.offset + entity.length + + if (!isValidRange(start, end, text.length)) { + continue + } + + const mapped = mapEntity(text.slice(start, end), entity) + if (!mapped) { + continue + } + + while (stack.length > 0 && start >= stack.at(-1)!) { + stack.pop() + } + + if (stack.length > 0 && end > stack.at(-1)!) { + continue + } + + valid.push({ start, end, entity: mapped }) + stack.push(end) + } + + return valid +} + +function isValidRange(start: number, end: number, textLength: number): boolean { + if (!Number.isSafeInteger(start) || !Number.isSafeInteger(end)) { + return false + } + if (start < 0 || end < 0 || end <= start) { + return false + } + return end <= textLength +} + +function mapEntity( + content: string, + entity: MessageEntity, +): TextEntity | null { + switch (entity.type) { + case 'bold': + return { type: 'bold' } + case 'italic': + return { type: 'italic' } + case 'underline': + return { type: 'underline' } + case 'strikethrough': + return { type: 'strikethrough' } + case 'spoiler': + return { type: 'spoiler' } + case 'code': + return { type: 'code' } + case 'pre': + return { type: 'codeblock', language: entity.language } + case 'text_link': + return entity.url + ? { type: 'link', url: entity.url } + : null + case 'custom_emoji': + return entity.custom_emoji_id + ? { type: 'custom-emoji', id: entity.custom_emoji_id, alt: content } + : null + case 'blockquote': + return { type: 'blockquote', expandable: false } + case 'expandable_blockquote': + return { type: 'blockquote', expandable: true } + case 'date_time': + return entity.unix_time != null + ? { type: 'date-time', unix: entity.unix_time, format: entity.date_time_format } + : null + } + return null +} diff --git a/src/parser.ts b/src/parser.ts deleted file mode 100644 index f09b6c7..0000000 --- a/src/parser.ts +++ /dev/null @@ -1,179 +0,0 @@ -import type { TextEntity, TgxElement, TgxElementText } from './types.ts' - -export interface MessageEntity { - type: - | 'mention' - | 'hashtag' - | 'cashtag' - | 'bot_command' - | 'url' - | 'email' - | 'phone_number' - | 'bold' - | 'italic' - | 'underline' - | 'strikethrough' - | 'spoiler' - | 'code' - | 'pre' - | 'blockquote' - | 'expandable_blockquote' - | 'text_link' - | 'text_mention' - | 'custom_emoji' - | (string & {}) - offset: number - length: number - url?: string - language?: string - custom_emoji_id?: string - [key: string]: unknown -} - -interface ParsedEntity { - start: number - end: number - entity: TextEntity -} - -interface OpenEntity { - end: number - node: TgxElementText -} - -/** - * Converts Telegram message `text` + `entities` into a TGX tree. - * - * Offsets/lengths follow Telegram's UTF-16 indexing model. - * Invalid and crossing entities are ignored. - */ -export function parseMessageEntities( - text: string, - entities: readonly MessageEntity[] = [], -): TgxElement { - const parsedEntities = collectValidEntities(text, entities) - const root: TgxElement[] = [] - const openStack: OpenEntity[] = [] - const boundaries = buildBoundaries(parsedEntities, text.length) - - let cursor = 0 - for (const boundary of boundaries) { - const chunk = text.slice(cursor, boundary) - if (chunk && !isSuppressedChunk(openStack)) { - const target = openStack.length > 0 ? openStack.at(-1)!.node.subelements : root - target.push({ type: 'plain', value: chunk }) - } - - while (openStack.length > 0 && openStack.at(-1)!.end === boundary) - openStack.pop() - - for (const item of parsedEntities) { - if (item.start !== boundary) - continue - - const node: TgxElementText = { type: 'text', entity: item.entity, subelements: [] } - const target = openStack.length > 0 ? openStack.at(-1)!.node.subelements : root - target.push(node) - openStack.push({ end: item.end, node }) - } - - cursor = boundary - } - - return { type: 'fragment', subelements: root } -} - -function isSuppressedChunk(openStack: readonly OpenEntity[]): boolean { - return openStack.at(-1)?.node.entity.type === 'custom-emoji' -} - -function buildBoundaries(entities: readonly ParsedEntity[], textLength: number): number[] { - const boundaries = new Set([textLength]) - for (const entity of entities) { - boundaries.add(entity.start) - boundaries.add(entity.end) - } - return [...boundaries].sort((a, b) => a - b) -} - -function collectValidEntities( - text: string, - entities: readonly MessageEntity[], -): ParsedEntity[] { - const sorted = [...entities] - .sort((a, b) => { - if (a.offset !== b.offset) - return a.offset - b.offset - return b.length - a.length - }) - - const valid: ParsedEntity[] = [] - const stack: number[] = [] - - for (const entity of sorted) { - const start = entity.offset - const end = entity.offset + entity.length - - if (!isValidRange(start, end, text.length)) - continue - - const mapped = mapEntity(text, entity, start, end) - if (!mapped) - continue - - while (stack.length > 0 && start >= stack.at(-1)!) - stack.pop() - - if (stack.length > 0 && end > stack.at(-1)!) - continue - - valid.push({ start, end, entity: mapped }) - stack.push(end) - } - - return valid -} - -function isValidRange(start: number, end: number, textLength: number): boolean { - if (!Number.isInteger(start) || !Number.isInteger(end)) - return false - if (start < 0 || end < 0 || end <= start) - return false - return end <= textLength -} - -function mapEntity( - text: string, - entity: MessageEntity, - start: number, - end: number, -): TextEntity | null { - switch (entity.type) { - case 'bold': - return { type: 'bold' } - case 'italic': - return { type: 'italic' } - case 'underline': - return { type: 'underline' } - case 'strikethrough': - return { type: 'strikethrough' } - case 'spoiler': - return { type: 'spoiler' } - case 'code': - return { type: 'code' } - case 'pre': - return { type: 'codeblock', language: entity.language } - case 'text_link': - return entity.url ? { type: 'link', url: entity.url } : null - case 'custom_emoji': - return entity.custom_emoji_id - ? { type: 'custom-emoji', id: entity.custom_emoji_id, alt: text.slice(start, end) } - : null - case 'blockquote': - return { type: 'blockquote', expandable: false } - case 'expandable_blockquote': - return { type: 'blockquote', expandable: true } - default: - return null - } -} diff --git a/test/parse.test.tsx b/test/parse.test.tsx new file mode 100644 index 0000000..4398b47 --- /dev/null +++ b/test/parse.test.tsx @@ -0,0 +1,170 @@ +import type { MessageEntity, TgxElement } from '../src/index.ts' +import { describe, expect, it } from 'vitest' +import { parseEntities } from '../src/index.ts' + +describe('parseEntities', () => { + const TEST_CASES: Array<{ + name: string + text: string + entities?: Array + expected: TgxElement + }> = [ + { + name: 'plain text w/o entities', + text: 'hello', + entities: [], + expected: <>{'hello'}, + }, + { + name: 'nested entities with stable structure', + text: 'hello world', + entities: [ + { type: 'bold', offset: 0, length: 11 }, + { type: 'italic', offset: 6, length: 5 }, + ], + expected: <>{'hello '}{'world'}, + }, + { + name: 'adjacent entities', + text: 'abcd', + entities: [ + { type: 'bold', offset: 0, length: 2 }, + { type: 'italic', offset: 2, length: 2 }, + ], + expected: <>{'ab'}{'cd'}, + }, + { + name: 'link, codeblock and custom emoji', + text: 'ab๐Ÿ™‚de', + entities: [ + { type: 'text_link', offset: 0, length: 2, url: 'https://example.com' }, + { type: 'custom_emoji', offset: 2, length: 2, custom_emoji_id: '42' }, + { type: 'pre', offset: 4, length: 1, language: 'ts' }, + ], + expected: ( + <> + {'ab'} + + {'d'} + {'e'} + + ), + }, + { + name: 'ignoring invalid and crossing entities', + text: 'abcdef', + entities: [ + { type: 'bold', offset: -1, length: 1 }, + { type: 'italic', offset: 0, length: 0 }, + { type: 'underline', offset: 0, length: 4 }, + { type: 'strikethrough', offset: 2, length: 4 }, // crossing with underline, should be skipped + { type: 'code', offset: 4, length: 2 }, + { type: 'mention', offset: 0, length: 2 }, // unsupported, ignored + ], + expected: ( + <> + {'abcd'} + {'ef'} + + ), + }, + { + name: 'UTF-16 offsets for ZWJ emoji sequences', + text: `x${'๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆ'}y`, + entities: [ + { type: 'custom_emoji', offset: 1, length: 11, custom_emoji_id: 'family-1' }, + { type: 'code', offset: 12, length: 1 }, + ], + expected: <>{'x'}{'y'}, + }, + { + name: 'surrogate pairs', + text: 'A๐Ÿ™‚e\u0301ะ–', + entities: [ + { offset: 1, length: 2, type: 'bold' }, + { offset: 3, length: 2, type: 'italic' }, + { offset: 5, length: 1, type: 'underline' }, + ], + expected: <>{'A'}{'๐Ÿ™‚'}{'e\u0301'}{'ะ–'}, + }, + { + name: 'complex example', + text: '๐Ÿ˜ฎโ€๐Ÿ’จะŸะ ะ˜ะœะ•ะ ๐Ÿšซ\n\n๐Ÿ‡ท๐Ÿ‡บ - ไฟ„็ฝ—ๆ–ฏ\n๐Ÿ‡บ๐Ÿ‡ธ โ€“ ะะผะตั€ะธะบะฐ\n๐Ÿ‡จ๐Ÿ‡ณ โ€” China\n\nะผะพะฝะพัˆะธั€ะธะฝะฝั‹ะน โ†”๏ธ text\n\n\nconsole.log("๐Ÿ‘‹ Hi there!")\n\n\nCopy this code โ†’ ุงู„ุฑู…ุฒ ุงู„ุชุฑูˆูŠุฌูŠ โ† for nothing.\n\nExpandable blockquote with bold, italic, spoiler, strikethrough, underline, and bold italic spoiler strikethrough underline text!\n\nTomorrow at 12:34.', + entities: [ + { offset: 0, length: 5, type: 'custom_emoji', custom_emoji_id: '5192886773948107844' }, + { offset: 5, length: 6, type: 'bold' }, + { offset: 11, length: 2, type: 'custom_emoji', custom_emoji_id: '5776064103483184336' }, + { offset: 15, length: 4, type: 'text_link', url: 'https://en.wikipedia.org/wiki/Russia' }, + { offset: 26, length: 4, type: 'text_link', url: 'https://en.wikipedia.org/wiki/United_States' }, + { offset: 41, length: 4, type: 'text_link', url: 'https://en.wikipedia.org/wiki/China' }, + { offset: 55, length: 20, type: 'code' }, + { offset: 77, length: 29, type: 'pre', language: 'js' }, + { offset: 125, length: 14, type: 'code' }, + { offset: 156, length: 129, type: 'expandable_blockquote' }, + { offset: 183, length: 4, type: 'bold' }, + { offset: 189, length: 6, type: 'italic' }, + { offset: 197, length: 7, type: 'spoiler' }, + { offset: 206, length: 13, type: 'strikethrough' }, + { offset: 221, length: 9, type: 'underline' }, + { offset: 236, length: 5, type: 'bold' }, + { offset: 241, length: 7, type: 'bold' }, + { offset: 241, length: 7, type: 'italic' }, + { offset: 248, length: 8, type: 'bold' }, + { offset: 248, length: 8, type: 'italic' }, + { offset: 248, length: 8, type: 'spoiler' }, + { offset: 256, length: 14, type: 'bold' }, + { offset: 256, length: 14, type: 'italic' }, + { offset: 256, length: 14, type: 'strikethrough' }, + { offset: 256, length: 14, type: 'spoiler' }, + { offset: 270, length: 14, type: 'bold' }, + { offset: 270, length: 14, type: 'italic' }, + { offset: 270, length: 14, type: 'underline' }, + { offset: 270, length: 14, type: 'strikethrough' }, + { offset: 270, length: 14, type: 'spoiler' }, + { offset: 287, length: 17, type: 'date_time', unix_time: 1774510496, date_time_format: 'dt' }, + ], + expected: ( + <> + + ะŸะ ะ˜ะœะ•ะ  + + {'\n\n'} + ๐Ÿ‡ท๐Ÿ‡บ + {' - ไฟ„็ฝ—ๆ–ฏ\n'} + ๐Ÿ‡บ๐Ÿ‡ธ + {' โ€“ ะะผะตั€ะธะบะฐ\n'} + ๐Ÿ‡จ๐Ÿ‡ณ + {' โ€” China\n\n'} + ะผะพะฝะพัˆะธั€ะธะฝะฝั‹ะน โ†”๏ธ text + {'\n\n'} + {'\nconsole.log("๐Ÿ‘‹ Hi there!")\n'} + {'\n\nCopy this code โ†’ '} + {'ุงู„ุฑู…ุฒ ุงู„ุชุฑูˆูŠุฌูŠ'} + {' โ† for nothing.\n\n'} +
+ {'Expandable blockquote with '} + bold + {', '} + italic + {', '} + spoiler + {', '} + strikethrough + {', '} + underline + {', and '} + bold italic spoiler strikethrough underline text + {'!'} +
+ {'\n\n'} + + {'.'} + + ), + }, + ] + + it.each(TEST_CASES)('should parse $name', ({ text, entities, expected }) => { + expect(parseEntities(text, entities)).toEqual(expected) + }) +}) diff --git a/test/parser.test.ts b/test/parser.test.ts deleted file mode 100644 index 6216b82..0000000 --- a/test/parser.test.ts +++ /dev/null @@ -1,202 +0,0 @@ -import type { MessageEntity } from '../src/parser.ts' -import { describe, expect, it } from 'vitest' -import { parseMessageEntities } from '../src/parser.ts' - -function cu(value: string): number { - return value.length -} - -describe('parseMessageEntities', () => { - it('returns plain fragment when entities are missing', () => { - expect(parseMessageEntities('hello')).toEqual({ - type: 'fragment', - subelements: [{ type: 'plain', value: 'hello' }], - }) - }) - - it('parses nested entities with stable structure', () => { - const text = 'hello world' - const entities: MessageEntity[] = [ - { type: 'bold', offset: 0, length: 11 }, - { type: 'italic', offset: 6, length: 5 }, - ] - - expect(parseMessageEntities(text, entities)).toEqual({ - type: 'fragment', - subelements: [{ - type: 'text', - entity: { type: 'bold' }, - subelements: [ - { type: 'plain', value: 'hello ' }, - { - type: 'text', - entity: { type: 'italic' }, - subelements: [{ type: 'plain', value: 'world' }], - }, - ], - }], - }) - }) - - it('parses adjacent entities correctly', () => { - const text = 'abcd' - const entities: MessageEntity[] = [ - { type: 'bold', offset: 0, length: 2 }, - { type: 'italic', offset: 2, length: 2 }, - ] - - expect(parseMessageEntities(text, entities)).toEqual({ - type: 'fragment', - subelements: [ - { - type: 'text', - entity: { type: 'bold' }, - subelements: [{ type: 'plain', value: 'ab' }], - }, - { - type: 'text', - entity: { type: 'italic' }, - subelements: [{ type: 'plain', value: 'cd' }], - }, - ], - }) - }) - - it('maps link, codeblock and custom emoji entities', () => { - const text = 'ab๐Ÿ™‚de' - const entities: MessageEntity[] = [ - { type: 'text_link', offset: 0, length: 2, url: 'https://example.com' }, - { type: 'custom_emoji', offset: 2, length: 2, custom_emoji_id: '42' }, - { type: 'pre', offset: 4, length: 1, language: 'ts' }, - ] - - expect(parseMessageEntities(text, entities)).toEqual({ - type: 'fragment', - subelements: [ - { - type: 'text', - entity: { type: 'link', url: 'https://example.com' }, - subelements: [{ type: 'plain', value: 'ab' }], - }, - { - type: 'text', - entity: { type: 'custom-emoji', id: '42', alt: '๐Ÿ™‚' }, - subelements: [], - }, - { - type: 'text', - entity: { type: 'codeblock', language: 'ts' }, - subelements: [{ type: 'plain', value: 'd' }], - }, - { type: 'plain', value: 'e' }, - ], - }) - }) - - it('ignores invalid and crossing entities', () => { - const text = 'abcdef' - const entities: MessageEntity[] = [ - { type: 'bold', offset: -1, length: 1 }, - { type: 'italic', offset: 0, length: 0 }, - { type: 'underline', offset: 0, length: 4 }, - { type: 'strikethrough', offset: 2, length: 4 }, // crossing with underline, should be skipped - { type: 'code', offset: 4, length: 2 }, - { type: 'mention', offset: 0, length: 2 }, // unsupported, ignored - ] - - expect(parseMessageEntities(text, entities)).toEqual({ - type: 'fragment', - subelements: [ - { - type: 'text', - entity: { type: 'underline' }, - subelements: [{ type: 'plain', value: 'abcd' }], - }, - { - type: 'text', - entity: { type: 'code' }, - subelements: [{ type: 'plain', value: 'ef' }], - }, - ], - }) - }) - - it('handles UTF-16 offsets for surrogate pairs and combining marks', () => { - const text = 'A๐Ÿ™‚e\u0301ะ–' - const emojiOffset = cu('A') - const emojiLength = cu('๐Ÿ™‚') - const combiningOffset = cu('A๐Ÿ™‚') - const combiningLength = cu('e\u0301') - const cyrillicOffset = cu('A๐Ÿ™‚e\u0301') - const cyrillicLength = cu('ะ–') - - expect(emojiOffset).toBe(1) - expect(emojiLength).toBe(2) - expect(combiningOffset).toBe(3) - expect(combiningLength).toBe(2) - expect(cyrillicOffset).toBe(5) - expect(cyrillicLength).toBe(1) - - const entities: MessageEntity[] = [ - { type: 'bold', offset: emojiOffset, length: emojiLength }, // ๐Ÿ™‚ - { type: 'italic', offset: combiningOffset, length: combiningLength }, // e + combining acute - { type: 'underline', offset: cyrillicOffset, length: cyrillicLength }, // ะ– - ] - - expect(parseMessageEntities(text, entities)).toEqual({ - type: 'fragment', - subelements: [ - { type: 'plain', value: 'A' }, - { - type: 'text', - entity: { type: 'bold' }, - subelements: [{ type: 'plain', value: '๐Ÿ™‚' }], - }, - { - type: 'text', - entity: { type: 'italic' }, - subelements: [{ type: 'plain', value: 'e\u0301' }], - }, - { - type: 'text', - entity: { type: 'underline' }, - subelements: [{ type: 'plain', value: 'ะ–' }], - }, - ], - }) - }) - - it('handles UTF-16 offsets for ZWJ emoji sequences', () => { - const family = '๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆ' - const text = `x${family}y` - const familyOffset = cu('x') - const familyLength = cu(family) - const tailOffset = cu(`x${family}`) - - expect(familyOffset).toBe(1) - expect(familyLength).toBe(11) - expect(tailOffset).toBe(12) - - const entities: MessageEntity[] = [ - { type: 'custom_emoji', offset: familyOffset, length: familyLength, custom_emoji_id: 'family-1' }, - { type: 'code', offset: tailOffset, length: cu('y') }, - ] - - expect(parseMessageEntities(text, entities)).toEqual({ - type: 'fragment', - subelements: [ - { type: 'plain', value: 'x' }, - { - type: 'text', - entity: { type: 'custom-emoji', id: 'family-1', alt: family }, - subelements: [], - }, - { - type: 'text', - entity: { type: 'code' }, - subelements: [{ type: 'plain', value: 'y' }], - }, - ], - }) - }) -}) diff --git a/tsconfig.lib.json b/tsconfig.lib.json index ac9e52c..13e334c 100644 --- a/tsconfig.lib.json +++ b/tsconfig.lib.json @@ -2,7 +2,7 @@ "compilerOptions": { "tsBuildInfoFile": "./node_modules/.tmp/tsconfig.lib.tsbuildinfo", "target": "es2022", - "lib": ["ES2022"], + "lib": ["ESNext"], "moduleDetection": "force", "rootDir": "./src/", "module": "node20", From acef61f80d4bd8c2bdcc2ff59f18d2c2f6d799a4 Mon Sep 17 00:00:00 2001 From: evermake Date: Wed, 25 Mar 2026 23:26:45 +0500 Subject: [PATCH 7/7] .