Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 21 additions & 6 deletions lua/ninetyfive/delta.lua
Original file line number Diff line number Diff line change
Expand Up @@ -8,23 +8,38 @@ local M = {}
---@return number end_pos byte offset in old_text where change ends
---@return string insert_text the text to insert at start
function M.compute_delta(old_text, new_text)
  -- Computes a minimal single-span edit: replacing bytes start+1 .. end_pos of
  -- old_text with insert_text yields new_text. Offsets are byte offsets, but
  -- both ends of the span are snapped to UTF-8 character boundaries so that a
  -- multi-byte character is never split when two different characters happen
  -- to share leading or trailing bytes.
  local old_len, new_len = #old_text, #new_text
  local shorter = math.min(old_len, new_len)

  -- True when the byte following offset `pos` in old_text is a UTF-8
  -- continuation byte (10xxxxxx, 0x80-0xBF), i.e. `pos` falls inside a
  -- character. Returns false past the end of the string.
  local function splits_char(pos)
    local b = old_text:byte(pos + 1)
    return b ~= nil and b >= 0x80 and b <= 0xBF
  end

  -- Longest common prefix, compared byte by byte.
  local prefix_len = 0
  while prefix_len < shorter
    and old_text:byte(prefix_len + 1) == new_text:byte(prefix_len + 1) do
    prefix_len = prefix_len + 1
  end

  -- Longest common suffix, compared byte by byte; capped so it never overlaps
  -- the prefix in either string.
  local suffix_len = 0
  local suffix_cap = shorter - prefix_len
  while suffix_len < suffix_cap
    and old_text:byte(old_len - suffix_len) == new_text:byte(new_len - suffix_len) do
    suffix_len = suffix_len + 1
  end

  -- Snap start BACKWARD to a character boundary. This only gives up bytes of
  -- the shared prefix, so the edit stays correct (just slightly larger).
  local start = prefix_len
  while start > 0 and splits_char(start) do
    start = start - 1
  end

  -- Snap end FORWARD to a character boundary. This only gives up bytes of the
  -- shared suffix. Moving backward instead would extend the claimed suffix
  -- over bytes that were never compared and could silently drop the change
  -- (e.g. "é" c3 a9 -> "ĩ" c4 a9, which share only the trailing byte).
  local end_pos = old_len - suffix_len
  while end_pos < old_len and splits_char(end_pos) do
    end_pos = end_pos + 1
  end

  -- The kept suffix has the same byte length in both strings.
  local kept_suffix = old_len - end_pos
  local insert_text = new_text:sub(start + 1, new_len - kept_suffix)

  return start, end_pos, insert_text
end
Expand Down
40 changes: 40 additions & 0 deletions tests/test_delta.lua
Original file line number Diff line number Diff line change
Expand Up @@ -187,4 +187,44 @@ T["add new line at end"] = function()
MiniTest.expect.equality(r.text, "\nline2")
end

-- ============ UTF-8 boundary tests ============
-- These tests verify that byte offsets don't split multi-byte UTF-8 characters.
-- The bug occurs when two different multi-byte chars share the same leading byte(s)
-- but differ in a continuation byte - byte-by-byte comparison splits the character.

T["UTF-8: different 2-byte chars (é vs ë)"] = function()
  -- é = c3 a9, ë = c3 ab: both start with c3 and differ only in the
  -- continuation byte, so a raw byte comparison finds the first difference at
  -- byte 3 of "héllo" (inside the character). The delta must instead cover
  -- the whole character: bytes 2..3 replaced by "ë".
  child.lua([[
    local old_text = "héllo"
    local new_text = "hëllo"
    local start, end_pos, text = delta.compute_delta(old_text, new_text)
    -- A boundary is valid when the next byte is not a continuation byte
    -- (0x80-0xBF); the end of the string (nil) is always a valid boundary.
    local next_byte = old_text:byte(start + 1)
    _G.result = {
      boundary_valid = next_byte == nil or next_byte < 0x80 or next_byte > 0xBF,
      start = start,
      end_pos = end_pos,
      text = text,
    }
  ]])
  local r = child.lua_get("_G.result")
  MiniTest.expect.equality(r.boundary_valid, true)
  -- Pin the exact span and replacement, not just the boundary property.
  MiniTest.expect.equality(r.start, 1)
  MiniTest.expect.equality(r.end_pos, 3)
  MiniTest.expect.equality(r.text, "ë")
end

T["UTF-8: different 3-byte chars (中 vs 丰)"] = function()
  -- 中 = e4 b8 ad, 丰 = e4 b8 b0: both start with e4 b8 and differ only in the
  -- last byte, so a raw byte comparison finds the first difference at byte 3
  -- (inside the character). The delta must instead cover the whole first
  -- character: bytes 1..3 replaced by "丰".
  child.lua([[
    local old_text = "中文"
    local new_text = "丰文"
    local start, end_pos, text = delta.compute_delta(old_text, new_text)
    -- A boundary is valid when the next byte is not a continuation byte
    -- (0x80-0xBF); the end of the string (nil) is always a valid boundary.
    local next_byte = old_text:byte(start + 1)
    _G.result = {
      boundary_valid = next_byte == nil or next_byte < 0x80 or next_byte > 0xBF,
      start = start,
      end_pos = end_pos,
      text = text,
    }
  ]])
  local r = child.lua_get("_G.result")
  MiniTest.expect.equality(r.boundary_valid, true)
  -- Pin the exact span and replacement, not just the boundary property.
  MiniTest.expect.equality(r.start, 0)
  MiniTest.expect.equality(r.end_pos, 3)
  MiniTest.expect.equality(r.text, "丰")
end

return T