Skip to content

Commit ca8ad44

Browse files
cursoragent and evermake committed
refactor: unify parser entity type and add utf16 tests
Co-authored-by: Vladislav Deryabkin <evermake@users.noreply.github.com>
1 parent 62cfc80 commit ca8ad44

2 files changed

Lines changed: 97 additions & 36 deletions

File tree

src/parser.ts

Lines changed: 9 additions & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,8 @@
11
import type { TextEntity, TgxElement, TgxElementText } from './types.ts'
22

3-
export type TelegramMessageEntityType
4-
= | 'mention'
3+
export interface MessageEntity {
4+
type:
5+
| 'mention'
56
| 'hashtag'
67
| 'cashtag'
78
| 'bot_command'
@@ -20,38 +21,15 @@ export type TelegramMessageEntityType
2021
| 'text_link'
2122
| 'text_mention'
2223
| 'custom_emoji'
23-
24-
interface TelegramMessageEntityBase {
25-
type: TelegramMessageEntityType | (string & {})
24+
| (string & {})
2625
offset: number
2726
length: number
28-
}
29-
30-
interface TelegramMessageEntityTextLink extends TelegramMessageEntityBase {
31-
type: 'text_link'
32-
url: string
33-
}
34-
35-
interface TelegramMessageEntityPre extends TelegramMessageEntityBase {
36-
type: 'pre'
27+
url?: string
3728
language?: string
38-
}
39-
40-
interface TelegramMessageEntityCustomEmoji extends TelegramMessageEntityBase {
41-
type: 'custom_emoji'
42-
custom_emoji_id: string
43-
}
44-
45-
interface TelegramMessageEntityFallback extends TelegramMessageEntityBase {
29+
custom_emoji_id?: string
4630
[key: string]: unknown
4731
}
4832

49-
export type TelegramMessageEntity
50-
= | TelegramMessageEntityTextLink
51-
| TelegramMessageEntityPre
52-
| TelegramMessageEntityCustomEmoji
53-
| TelegramMessageEntityFallback
54-
5533
interface ParsedEntity {
5634
start: number
5735
end: number
@@ -71,7 +49,7 @@ interface OpenEntity {
7149
*/
7250
export function parseMessageEntities(
7351
text: string,
74-
entities: readonly TelegramMessageEntity[] = [],
52+
entities: readonly MessageEntity[] = [],
7553
): TgxElement {
7654
const parsedEntities = collectValidEntities(text, entities)
7755
const root: TgxElement[] = []
@@ -120,7 +98,7 @@ function buildBoundaries(entities: readonly ParsedEntity[], textLength: number):
12098

12199
function collectValidEntities(
122100
text: string,
123-
entities: readonly TelegramMessageEntity[],
101+
entities: readonly MessageEntity[],
124102
): ParsedEntity[] {
125103
const sorted = [...entities]
126104
.sort((a, b) => {
@@ -166,7 +144,7 @@ function isValidRange(start: number, end: number, textLength: number): boolean {
166144

167145
function mapEntity(
168146
text: string,
169-
entity: TelegramMessageEntity,
147+
entity: MessageEntity,
170148
start: number,
171149
end: number,
172150
): TextEntity | null {

test/parser.test.ts

Lines changed: 88 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,11 @@
1-
import type { TelegramMessageEntity } from '../src/parser.ts'
1+
import type { MessageEntity } from '../src/parser.ts'
22
import { describe, expect, it } from 'vitest'
33
import { parseMessageEntities } from '../src/parser.ts'
44

5+
/**
 * Returns the length of `value` in UTF-16 code units (the `cu` in the name),
 * i.e. JavaScript's `String.prototype.length`.
 *
 * The tests in this file use it to derive entity `offset`/`length` values from
 * the text itself instead of hard-coding numbers, so astral characters
 * (surrogate pairs) count as 2 and combining marks count separately.
 *
 * @param value - The string to measure.
 * @returns The number of UTF-16 code units in `value`.
 */
function cu(value: string): number {
6+
return value.length
7+
}
8+
59
describe('parseMessageEntities', () => {
610
it('returns plain fragment when entities are missing', () => {
711
expect(parseMessageEntities('hello')).toEqual({
@@ -12,7 +16,7 @@ describe('parseMessageEntities', () => {
1216

1317
it('parses nested entities with stable structure', () => {
1418
const text = 'hello world'
15-
const entities: TelegramMessageEntity[] = [
19+
const entities: MessageEntity[] = [
1620
{ type: 'bold', offset: 0, length: 11 },
1721
{ type: 'italic', offset: 6, length: 5 },
1822
]
@@ -36,7 +40,7 @@ describe('parseMessageEntities', () => {
3640

3741
it('parses adjacent entities correctly', () => {
3842
const text = 'abcd'
39-
const entities: TelegramMessageEntity[] = [
43+
const entities: MessageEntity[] = [
4044
{ type: 'bold', offset: 0, length: 2 },
4145
{ type: 'italic', offset: 2, length: 2 },
4246
]
@@ -60,7 +64,7 @@ describe('parseMessageEntities', () => {
6064

6165
it('maps link, codeblock and custom emoji entities', () => {
6266
const text = 'ab🙂de'
63-
const entities: TelegramMessageEntity[] = [
67+
const entities: MessageEntity[] = [
6468
{ type: 'text_link', offset: 0, length: 2, url: 'https://example.com' },
6569
{ type: 'custom_emoji', offset: 2, length: 2, custom_emoji_id: '42' },
6670
{ type: 'pre', offset: 4, length: 1, language: 'ts' },
@@ -91,7 +95,7 @@ describe('parseMessageEntities', () => {
9195

9296
it('ignores invalid and crossing entities', () => {
9397
const text = 'abcdef'
94-
const entities: TelegramMessageEntity[] = [
98+
const entities: MessageEntity[] = [
9599
{ type: 'bold', offset: -1, length: 1 },
96100
{ type: 'italic', offset: 0, length: 0 },
97101
{ type: 'underline', offset: 0, length: 4 },
@@ -116,4 +120,83 @@ describe('parseMessageEntities', () => {
116120
],
117121
})
118122
})
123+
124+
it('handles UTF-16 offsets for surrogate pairs and combining marks', () => {
125+
const text = 'A🙂e\u0301Ж'
126+
const emojiOffset = cu('A')
127+
const emojiLength = cu('🙂')
128+
const combiningOffset = cu('A🙂')
129+
const combiningLength = cu('e\u0301')
130+
const cyrillicOffset = cu('A🙂e\u0301')
131+
const cyrillicLength = cu('Ж')
132+
133+
expect(emojiOffset).toBe(1)
134+
expect(emojiLength).toBe(2)
135+
expect(combiningOffset).toBe(3)
136+
expect(combiningLength).toBe(2)
137+
expect(cyrillicOffset).toBe(5)
138+
expect(cyrillicLength).toBe(1)
139+
140+
const entities: MessageEntity[] = [
141+
{ type: 'bold', offset: emojiOffset, length: emojiLength }, // 🙂
142+
{ type: 'italic', offset: combiningOffset, length: combiningLength }, // e + combining acute
143+
{ type: 'underline', offset: cyrillicOffset, length: cyrillicLength }, // Ж
144+
]
145+
146+
expect(parseMessageEntities(text, entities)).toEqual({
147+
type: 'fragment',
148+
subelements: [
149+
{ type: 'plain', value: 'A' },
150+
{
151+
type: 'text',
152+
entity: { type: 'bold' },
153+
subelements: [{ type: 'plain', value: '🙂' }],
154+
},
155+
{
156+
type: 'text',
157+
entity: { type: 'italic' },
158+
subelements: [{ type: 'plain', value: 'e\u0301' }],
159+
},
160+
{
161+
type: 'text',
162+
entity: { type: 'underline' },
163+
subelements: [{ type: 'plain', value: 'Ж' }],
164+
},
165+
],
166+
})
167+
})
168+
169+
it('handles UTF-16 offsets for ZWJ emoji sequences', () => {
170+
const family = '👨‍👩‍👧‍👦'
171+
const text = `x${family}y`
172+
const familyOffset = cu('x')
173+
const familyLength = cu(family)
174+
const tailOffset = cu(`x${family}`)
175+
176+
expect(familyOffset).toBe(1)
177+
expect(familyLength).toBe(11)
178+
expect(tailOffset).toBe(12)
179+
180+
const entities: MessageEntity[] = [
181+
{ type: 'custom_emoji', offset: familyOffset, length: familyLength, custom_emoji_id: 'family-1' },
182+
{ type: 'code', offset: tailOffset, length: cu('y') },
183+
]
184+
185+
expect(parseMessageEntities(text, entities)).toEqual({
186+
type: 'fragment',
187+
subelements: [
188+
{ type: 'plain', value: 'x' },
189+
{
190+
type: 'text',
191+
entity: { type: 'custom-emoji', id: 'family-1', alt: family },
192+
subelements: [],
193+
},
194+
{
195+
type: 'text',
196+
entity: { type: 'code' },
197+
subelements: [{ type: 'plain', value: 'y' }],
198+
},
199+
],
200+
})
201+
})
119202
})

0 commit comments

Comments
 (0)