numerataz · juaoose · Jan 9, 2026 · Jan 9, 2026
diff --git a/lua/ninetyfive/delta.lua b/lua/ninetyfive/delta.lua
@@ -8,23 +8,38 @@ local M = {}
 ---@return number end_pos byte offset in old_text where change ends
 ---@return string insert_text the text to insert at start
 function M.compute_delta(old_text, new_text)
-    -- Find common prefix
+    -- Find common prefix (by byte); when the loop ends, i - 1 bytes are shared
     local i = 1
     while i <= #old_text and i <= #new_text and old_text:sub(i, i) == new_text:sub(i, i) do
         i = i + 1
     end
-    local prefix_len = i - 1
 
-    -- Find common suffix (but don't overlap with prefix)
+    -- Find common suffix (by byte, never overlapping the prefix); j bytes are shared
     local j = 0
-    while j < #old_text - prefix_len and j < #new_text - prefix_len
+    while j < #old_text - (i - 1) and j < #new_text - (i - 1)
           and old_text:sub(#old_text - j, #old_text - j) == new_text:sub(#new_text - j, #new_text - j) do
         j = j + 1
     end
 
-    local start = prefix_len
+    -- Shrink the prefix: move start backward until the next byte begins a character.
+    -- Continuation bytes are 10xxxxxx (0x80-0xBF)
+    local start = i - 1
+    local b = old_text:byte(start + 1)
+    while start > 0 and b and b >= 0x80 and b <= 0xBF do
+        start = start - 1
+        b = old_text:byte(start + 1)
+    end
+
+    -- Shrink the suffix: move end FORWARD past continuation bytes (backward would swallow differing bytes)
     local end_pos = #old_text - j
-    local insert_text = new_text:sub(prefix_len + 1, #new_text - j)
+    b = old_text:byte(end_pos + 1) -- first byte of the shared suffix, nil when the suffix is empty
+    while b and b >= 0x80 and b <= 0xBF do
+        end_pos = end_pos + 1
+        b = old_text:byte(end_pos + 1)
+    end
+
+    j = #old_text - end_pos -- suffix length after the boundary adjustment
+    local insert_text = new_text:sub(start + 1, #new_text - j)
 
     return start, end_pos, insert_text
 end

diff --git a/tests/test_delta.lua b/tests/test_delta.lua
@@ -187,4 +187,44 @@ T["add new line at end"] = function()
     MiniTest.expect.equality(r.text, "\nline2")
 end
 
+-- ============ UTF-8 boundary tests ============
+-- These tests verify that byte offsets don't split multi-byte UTF-8 characters.
+-- The bug occurs when two different multi-byte chars share the same leading byte(s)
+-- but differ in a continuation byte - byte-by-byte comparison splits the character.
+
+T["UTF-8: different 2-byte chars (é vs ë)"] = function()
+    -- é = c3 a9, ë = c3 ab: same lead byte, so a byte-wise diff would start mid-character
+    child.lua([[
+        local old_text = "héllo"
+        local new_text = "hëllo"
+        local start, end_pos, text = delta.compute_delta(old_text, new_text)
+        -- Check that boundary doesn't fall on a continuation byte (0x80-0xBF)
+        local next_byte = old_text:byte(start + 1)
+        -- Applying the delta to old_text must reproduce new_text exactly
+        local rebuilt = old_text:sub(1, start) .. text .. old_text:sub(end_pos + 1)
+        _G.result = {
+            boundary_valid = not (next_byte >= 0x80 and next_byte <= 0xBF),
+            round_trip = rebuilt == new_text,
+        }
+    ]])
+    local r = child.lua_get("_G.result")
+    MiniTest.expect.equality(r, { boundary_valid = true, round_trip = true })
+end
+
+T["UTF-8: different 3-byte chars (中 vs 丰)"] = function()
+    -- 中 = e4 b8 ad, 丰 = e4 b8 b0: first two bytes shared, so a byte-wise diff would start mid-character
+    child.lua([[
+        local old_text = "中文"
+        local new_text = "丰文"
+        local start, end_pos, text = delta.compute_delta(old_text, new_text)
+        local next_byte = old_text:byte(start + 1)
+        local rebuilt = old_text:sub(1, start) .. text .. old_text:sub(end_pos + 1)
+        _G.result = {
+            boundary_valid = not (next_byte >= 0x80 and next_byte <= 0xBF),
+            round_trip = rebuilt == new_text,
+        }
+    ]])
+    local r = child.lua_get("_G.result")
+    MiniTest.expect.equality(r, { boundary_valid = true, round_trip = true })
+end
+
 return T