diff --git a/lua/ninetyfive/delta.lua b/lua/ninetyfive/delta.lua
index d731b57..a0a6672 100644
--- a/lua/ninetyfive/delta.lua
+++ b/lua/ninetyfive/delta.lua
@@ -8,23 +8,43 @@ local M = {}
 ---@return number end_pos byte offset in old_text where change ends
 ---@return string insert_text the text to insert at start
 function M.compute_delta(old_text, new_text)
-    -- Find common prefix
+    -- Find common prefix (by byte)
     local i = 1
     while i <= #old_text and i <= #new_text and old_text:sub(i, i) == new_text:sub(i, i) do
         i = i + 1
     end
-    local prefix_len = i - 1
 
-    -- Find common suffix (but don't overlap with prefix)
+    -- Find common suffix (by byte, don't overlap with prefix)
     local j = 0
-    while j < #old_text - prefix_len and j < #new_text - prefix_len
+    while j < #old_text - (i - 1) and j < #new_text - (i - 1)
         and old_text:sub(#old_text - j, #old_text - j) == new_text:sub(#new_text - j, #new_text - j) do
         j = j + 1
     end
 
-    local start = prefix_len
+    -- Shrink the common prefix back to a UTF-8 character boundary: if the first
+    -- differing byte is a continuation byte (10xxxxxx, 0x80-0xBF), the lead byte
+    -- before it belongs to the changed character too.
+    local start = i - 1
+    local b = old_text:byte(start + 1)
+    while start > 0 and b and b >= 0x80 and b <= 0xBF do
+        start = start - 1
+        b = old_text:byte(start + 1)
+    end
+
+    -- Shrink the common suffix forward to a UTF-8 character boundary: if the
+    -- suffix starts on a continuation byte, that byte belongs to the changed
+    -- character. Moving end_pos backward instead would treat differing bytes as
+    -- common and drop the change.
     local end_pos = #old_text - j
-    local insert_text = new_text:sub(prefix_len + 1, #new_text - j)
+    b = old_text:byte(end_pos + 1)
+    while b and b >= 0x80 and b <= 0xBF do
+        end_pos = end_pos + 1
+        b = old_text:byte(end_pos + 1)
+    end
+
+    -- Recompute the suffix length so insert_text matches the adjusted end_pos
+    j = #old_text - end_pos
+    local insert_text = new_text:sub(start + 1, #new_text - j)
 
     return start, end_pos, insert_text
 end
diff --git a/tests/test_delta.lua b/tests/test_delta.lua
index 768dda1..87fa340 100644
--- a/tests/test_delta.lua
+++ b/tests/test_delta.lua
@@ -187,4 +187,67 @@ T["add new line at end"] = function()
     MiniTest.expect.equality(r.text, "\nline2")
 end
 
+-- ============ UTF-8 boundary tests ============
+-- These tests verify that byte offsets don't split multi-byte UTF-8 characters.
+-- The bug occurs when two different multi-byte chars share the same leading byte(s)
+-- but differ in a continuation byte - byte-by-byte comparison splits the character.
+-- The mirror case (different leading byte, same trailing byte) splits the suffix.
+
+T["UTF-8: different 2-byte chars (é vs ë)"] = function()
+    -- é = c3 a9, ë = c3 ab (both start with c3, differ in continuation byte)
+    -- Byte comparison would find diff at byte 2 (the continuation byte), splitting the char
+    child.lua([[
+        local old_text = "héllo"
+        local new_text = "hëllo"
+        local start, end_pos, _ = delta.compute_delta(old_text, new_text)
+        -- Check that boundary doesn't fall on a continuation byte (0x80-0xBF)
+        local next_byte = old_text:byte(start + 1)
+        _G.result = {
+            boundary_valid = not (next_byte >= 0x80 and next_byte <= 0xBF),
+            start = start,
+        }
+    ]])
+    local r = child.lua_get("_G.result")
+    MiniTest.expect.equality(r.boundary_valid, true)
+end
+
+T["UTF-8: different 3-byte chars (中 vs 丰)"] = function()
+    -- 中 = e4 b8 ad, 丰 = e4 b8 b0 (both start with e4 b8, differ in last byte)
+    -- Byte comparison would find diff at byte 3, splitting the char
+    child.lua([[
+        local old_text = "中文"
+        local new_text = "丰文"
+        local start, end_pos, _ = delta.compute_delta(old_text, new_text)
+        local next_byte = old_text:byte(start + 1)
+        _G.result = {
+            boundary_valid = not (next_byte >= 0x80 and next_byte <= 0xBF),
+            start = start,
+        }
+    ]])
+    local r = child.lua_get("_G.result")
+    MiniTest.expect.equality(r.boundary_valid, true)
+end
+
+T["UTF-8: shared trailing byte (é vs ĩ)"] = function()
+    -- é = c3 a9, ĩ = c4 a9 (differ in the leading byte, share the continuation byte)
+    -- The common byte suffix starts inside the character, so end_pos must move
+    -- forward to the end of the character or the change is lost entirely.
+    child.lua([[
+        local old_text = "xé"
+        local new_text = "xĩ"
+        local start, end_pos, text = delta.compute_delta(old_text, new_text)
+        _G.result = {
+            start = start,
+            end_pos = end_pos,
+            text = text,
+            applied = old_text:sub(1, start) .. text .. old_text:sub(end_pos + 1),
+        }
+    ]])
+    local r = child.lua_get("_G.result")
+    MiniTest.expect.equality(r.start, 1)
+    MiniTest.expect.equality(r.end_pos, 3)
+    MiniTest.expect.equality(r.text, "ĩ")
+    MiniTest.expect.equality(r.applied, "xĩ")
+end
+
 return T