From 1a2440d681ab81353df1af45ee126bee2bf0fc89 Mon Sep 17 00:00:00 2001 From: Andriy Oblivantsev Date: Sun, 15 Feb 2026 16:40:31 +0000 Subject: [PATCH 1/4] fix: deduplicate search results when PST is re-imported When the same PST file is imported multiple times, each import creates a new account with duplicate emails. SearchMulti was unioning all account indices without deduplication, so the same messages appeared multiple times in the search list. Fix: deduplicate in SearchMulti by content checksum (extracted from path format folder/{checksum}-{id}.eml). Uses ROW_NUMBER() to keep one row per unique content across accounts. Co-authored-by: Cursor --- internal/search/index/index.go | 19 +++++++++- internal/search/index/index_test.go | 59 +++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+), 1 deletion(-) diff --git a/internal/search/index/index.go b/internal/search/index/index.go index c9c474a..e62ab3b 100644 --- a/internal/search/index/index.go +++ b/internal/search/index/index.go @@ -295,7 +295,24 @@ func SearchMulti(accounts []AccountIndex, query string, offset, limit int) Searc return SearchResult{Query: query, Total: 0, Offset: offset, Limit: limit, Hits: []Hit{}} } - createSQL := "CREATE TEMP TABLE emails AS " + strings.Join(unionParts, " UNION ALL ") + // Build raw union first. + rawUnion := strings.Join(unionParts, " UNION ALL ") + // Deduplicate by content checksum (extracted from path): same email in multiple accounts + // (e.g. re-imported PST) appears once. Rows without checksum in path are kept as-is. 
+ createSQL := `CREATE TEMP TABLE emails AS + SELECT account_id, path, subject, from_addr, to_addr, date, size, body_text + FROM ( + SELECT *, + ROW_NUMBER() OVER ( + PARTITION BY COALESCE( + regexp_extract(path, '([0-9a-f]{16})-', 1), + account_id || '::' || path + ) + ORDER BY date DESC NULLS LAST + ) AS rn + FROM (` + rawUnion + `) u + ) ranked + WHERE rn = 1` if _, err := db.Exec(createSQL); err != nil { log.Printf("ERROR: SearchMulti create: %v", err) return SearchResult{Query: query, Total: 0, Offset: offset, Limit: limit, Hits: []Hit{}} diff --git a/internal/search/index/index_test.go b/internal/search/index/index_test.go index 9986b5f..7530f37 100644 --- a/internal/search/index/index_test.go +++ b/internal/search/index/index_test.go @@ -314,3 +314,62 @@ func TestParquetPersistence(t *testing.T) { t.Errorf("search 'trampoline' after reload = %d, want 1", res.Total) } } + +func TestSearchMultiDeduplicatesByChecksum(t *testing.T) { + // Simulate re-imported PST: two accounts with same emails (same content = same checksum). + // SearchMulti should deduplicate so each email appears once. 
+ root := t.TempDir() + + // Account 1: inbox with checksum-named emails + dir1 := filepath.Join(root, "pst-import-1") + inbox1 := filepath.Join(dir1, "inbox") + if err := os.MkdirAll(inbox1, 0755); err != nil { + t.Fatal(err) + } + eml1 := "From: a@b.com\r\nTo: c@d.com\r\nSubject: Re-import Test\r\nDate: Mon, 10 Feb 2025 12:00:00 +0000\r\nContent-Type: text/plain\r\n\r\nSame content in both imports.\r\n" + os.WriteFile(filepath.Join(inbox1, "a1b2c3d4e5f60001-1.eml"), []byte(eml1), 0644) + eml2 := "From: x@y.com\r\nTo: z@w.com\r\nSubject: Unique One\r\nDate: Tue, 11 Feb 2025 08:00:00 +0000\r\nContent-Type: text/plain\r\n\r\nOnly in first.\r\n" + os.WriteFile(filepath.Join(inbox1, "bbbb111122223333-2.eml"), []byte(eml2), 0644) + + // Account 2: same emails (re-imported PST) — different paths, same checksums + dir2 := filepath.Join(root, "pst-import-2") + inbox2 := filepath.Join(dir2, "inbox") + if err := os.MkdirAll(inbox2, 0755); err != nil { + t.Fatal(err) + } + os.WriteFile(filepath.Join(inbox2, "a1b2c3d4e5f60001-1.eml"), []byte(eml1), 0644) // same checksum + os.WriteFile(filepath.Join(inbox2, "bbbb111122223333-99.eml"), []byte(eml2), 0644) // same checksum + + parquet1 := filepath.Join(root, "idx1.parquet") + parquet2 := filepath.Join(root, "idx2.parquet") + + idx1, err := index.New(dir1, parquet1) + if err != nil { + t.Fatalf("index.New 1: %v", err) + } + idx1.Build() + idx1.Close() + + idx2, err := index.New(dir2, parquet2) + if err != nil { + t.Fatalf("index.New 2: %v", err) + } + idx2.Build() + idx2.Close() + + // SearchMulti across both accounts — should deduplicate by checksum + accounts := []index.AccountIndex{ + {ID: "acct-1", IndexPath: parquet1}, + {ID: "acct-2", IndexPath: parquet2}, + } + result := index.SearchMulti(accounts, "", 0, 100) + + // 2 unique emails, not 4 (2 per account) + if result.Total != 2 { + t.Errorf("SearchMulti total = %d, want 2 (deduplicated)", result.Total) + } + if len(result.Hits) != 2 { + t.Errorf("SearchMulti hits = %d, 
want 2", len(result.Hits)) + } + t.Logf("SearchMulti: 4 rows across 2 accounts -> %d unique (deduplicated)", result.Total) +} From b30d54b2f5926410a373e9e86b5a7979590039eb Mon Sep 17 00:00:00 2001 From: Andriy Oblivantsev Date: Sun, 15 Feb 2026 16:43:08 +0000 Subject: [PATCH 2/4] style: fix gofmt in index_test.go Co-authored-by: Cursor --- internal/search/index/index_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/search/index/index_test.go b/internal/search/index/index_test.go index 7530f37..a5a4ef1 100644 --- a/internal/search/index/index_test.go +++ b/internal/search/index/index_test.go @@ -337,7 +337,7 @@ func TestSearchMultiDeduplicatesByChecksum(t *testing.T) { if err := os.MkdirAll(inbox2, 0755); err != nil { t.Fatal(err) } - os.WriteFile(filepath.Join(inbox2, "a1b2c3d4e5f60001-1.eml"), []byte(eml1), 0644) // same checksum + os.WriteFile(filepath.Join(inbox2, "a1b2c3d4e5f60001-1.eml"), []byte(eml1), 0644) // same checksum os.WriteFile(filepath.Join(inbox2, "bbbb111122223333-99.eml"), []byte(eml2), 0644) // same checksum parquet1 := filepath.Join(root, "idx1.parquet") From e3504dee0963d226a0fb395b72115414a6961f6e Mon Sep 17 00:00:00 2001 From: Andriy Oblivantsev Date: Sun, 15 Feb 2026 16:49:46 +0000 Subject: [PATCH 3/4] fix: avoid collapsing all emails when path has no checksum format regexp_extract returns empty string on no match; COALESCE('', x) yields '' so all rows were partitioned by '' and collapsed to one. Use NULLIF to treat empty as NULL so fallback to account||path keeps each row unique. 
Co-authored-by: Cursor --- internal/search/index/index.go | 5 +++-- internal/search/index/index_test.go | 30 +++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/internal/search/index/index.go b/internal/search/index/index.go index e62ab3b..21e3a5e 100644 --- a/internal/search/index/index.go +++ b/internal/search/index/index.go @@ -298,14 +298,15 @@ func SearchMulti(accounts []AccountIndex, query string, offset, limit int) Searc // Build raw union first. rawUnion := strings.Join(unionParts, " UNION ALL ") // Deduplicate by content checksum (extracted from path): same email in multiple accounts - // (e.g. re-imported PST) appears once. Rows without checksum in path are kept as-is. + // (e.g. re-imported PST) appears once. Rows without checksum in path use account||path (keep all). + // Use NULLIF so regexp_extract empty string is treated as NULL for COALESCE fallback. createSQL := `CREATE TEMP TABLE emails AS SELECT account_id, path, subject, from_addr, to_addr, date, size, body_text FROM ( SELECT *, ROW_NUMBER() OVER ( PARTITION BY COALESCE( - regexp_extract(path, '([0-9a-f]{16})-', 1), + NULLIF(regexp_extract(path, '([0-9a-f]{16})-', 1), ''), account_id || '::' || path ) ORDER BY date DESC NULLS LAST diff --git a/internal/search/index/index_test.go b/internal/search/index/index_test.go index a5a4ef1..e74c03e 100644 --- a/internal/search/index/index_test.go +++ b/internal/search/index/index_test.go @@ -373,3 +373,33 @@ func TestSearchMultiDeduplicatesByChecksum(t *testing.T) { } t.Logf("SearchMulti: 4 rows across 2 accounts -> %d unique (deduplicated)", result.Total) } + +func TestSearchMultiKeepsAllWhenNoChecksumInPath(t *testing.T) { + // Paths without checksum format (e.g. readpst output "message_1.eml") must not be collapsed. + // regexp_extract returns empty/NULL -> we use account||path, so each row stays unique. 
+ root := t.TempDir() + + dir := filepath.Join(root, "readpst-style") + inbox := filepath.Join(dir, "inbox") + if err := os.MkdirAll(inbox, 0755); err != nil { + t.Fatal(err) + } + os.WriteFile(filepath.Join(inbox, "message_1.eml"), []byte("From: a@b.com\r\nTo: c@d.com\r\nSubject: Msg 1\r\nDate: Mon, 10 Feb 2025 12:00:00 +0000\r\n\r\nBody 1"), 0644) + os.WriteFile(filepath.Join(inbox, "message_2.eml"), []byte("From: a@b.com\r\nTo: c@d.com\r\nSubject: Msg 2\r\nDate: Mon, 10 Feb 2025 12:00:00 +0000\r\n\r\nBody 2"), 0644) + os.WriteFile(filepath.Join(inbox, "message_3.eml"), []byte("From: a@b.com\r\nTo: c@d.com\r\nSubject: Msg 3\r\nDate: Mon, 10 Feb 2025 12:00:00 +0000\r\n\r\nBody 3"), 0644) + + parquetPath := filepath.Join(root, "idx.parquet") + idx, err := index.New(dir, parquetPath) + if err != nil { + t.Fatalf("index.New: %v", err) + } + idx.Build() + idx.Close() + + accounts := []index.AccountIndex{{ID: "acct-1", IndexPath: parquetPath}} + result := index.SearchMulti(accounts, "", 0, 100) + + if result.Total != 3 { + t.Errorf("SearchMulti total = %d, want 3 (all kept when no checksum in path)", result.Total) + } +} From c62b7476f2091aa281e85fb27668701bf51e1c41 Mon Sep 17 00:00:00 2001 From: Andriy Oblivantsev Date: Sun, 15 Feb 2026 16:53:55 +0000 Subject: [PATCH 4/4] fix: dedupe by content fingerprint when path has no checksum Paths from readpst (message_1.eml) lack checksum; account||path yielded unique keys per account so same email in 2 accounts appeared twice. Use subject|from|to|date|body as fallback fingerprint for dedup. 
Co-authored-by: Cursor --- internal/search/index/index.go | 9 ++++--- internal/search/index/index_test.go | 41 ++++++++++++++++++++++++++--- 2 files changed, 43 insertions(+), 7 deletions(-) diff --git a/internal/search/index/index.go b/internal/search/index/index.go index 21e3a5e..16cc0a0 100644 --- a/internal/search/index/index.go +++ b/internal/search/index/index.go @@ -297,9 +297,10 @@ func SearchMulti(accounts []AccountIndex, query string, offset, limit int) Searc // Build raw union first. rawUnion := strings.Join(unionParts, " UNION ALL ") - // Deduplicate by content checksum (extracted from path): same email in multiple accounts - // (e.g. re-imported PST) appears once. Rows without checksum in path use account||path (keep all). - // Use NULLIF so regexp_extract empty string is treated as NULL for COALESCE fallback. + // Deduplicate: same email in multiple accounts (e.g. re-imported PST) appears once. + // - Path with checksum (go-pst): use checksum for dedup. + // - Path without checksum (readpst): use content fingerprint (subject|from|to|date|body). + // NULLIF ensures regexp_extract '' is treated as NULL for fallback. 
createSQL := `CREATE TEMP TABLE emails AS SELECT account_id, path, subject, from_addr, to_addr, date, size, body_text FROM ( @@ -307,7 +308,7 @@ func SearchMulti(accounts []AccountIndex, query string, offset, limit int) Searc ROW_NUMBER() OVER ( PARTITION BY COALESCE( NULLIF(regexp_extract(path, '([0-9a-f]{16})-', 1), ''), - account_id || '::' || path + COALESCE(subject, '') || '|' || COALESCE(from_addr, '') || '|' || COALESCE(to_addr, '') || '|' || COALESCE(CAST(date AS VARCHAR), '') || '|' || COALESCE(body_text, '') ) ORDER BY date DESC NULLS LAST ) AS rn diff --git a/internal/search/index/index_test.go b/internal/search/index/index_test.go index e74c03e..dd8e426 100644 --- a/internal/search/index/index_test.go +++ b/internal/search/index/index_test.go @@ -375,8 +375,8 @@ func TestSearchMultiDeduplicatesByChecksum(t *testing.T) { } func TestSearchMultiKeepsAllWhenNoChecksumInPath(t *testing.T) { - // Paths without checksum format (e.g. readpst output "message_1.eml") must not be collapsed. - // regexp_extract returns empty/NULL -> we use account||path, so each row stays unique. + // Paths without checksum format: content fingerprint used for dedup. + // Single account with 3 different emails -> 3 results. root := t.TempDir() dir := filepath.Join(root, "readpst-style") @@ -400,6 +400,41 @@ func TestSearchMultiKeepsAllWhenNoChecksumInPath(t *testing.T) { result := index.SearchMulti(accounts, "", 0, 100) if result.Total != 3 { - t.Errorf("SearchMulti total = %d, want 3 (all kept when no checksum in path)", result.Total) + t.Errorf("SearchMulti total = %d, want 3", result.Total) + } +} + +func TestSearchMultiDeduplicatesByContentWhenNoChecksumInPath(t *testing.T) { + // Two accounts (re-imported PST via readpst) with same emails -> content fingerprint dedupes. 
+ root := t.TempDir() + + eml1 := "From: a@b.com\r\nTo: c@d.com\r\nSubject: Same\r\nDate: Mon, 10 Feb 2025 12:00:00 +0000\r\n\r\nBody" + eml2 := "From: x@y.com\r\nTo: z@w.com\r\nSubject: Other\r\nDate: Tue, 11 Feb 2025 08:00:00 +0000\r\n\r\nDifferent" + + for _, acc := range []string{"import-1", "import-2"} { + dir := filepath.Join(root, acc) + inbox := filepath.Join(dir, "inbox") + if err := os.MkdirAll(inbox, 0755); err != nil { + t.Fatal(err) + } + os.WriteFile(filepath.Join(inbox, "message_1.eml"), []byte(eml1), 0644) + os.WriteFile(filepath.Join(inbox, "message_2.eml"), []byte(eml2), 0644) + } + + idx1, _ := index.New(filepath.Join(root, "import-1"), filepath.Join(root, "idx1.parquet")) + idx1.Build() + idx1.Close() + idx2, _ := index.New(filepath.Join(root, "import-2"), filepath.Join(root, "idx2.parquet")) + idx2.Build() + idx2.Close() + + accounts := []index.AccountIndex{ + {ID: "acct-1", IndexPath: filepath.Join(root, "idx1.parquet")}, + {ID: "acct-2", IndexPath: filepath.Join(root, "idx2.parquet")}, + } + result := index.SearchMulti(accounts, "", 0, 100) + + if result.Total != 2 { + t.Errorf("SearchMulti total = %d, want 2 (deduplicated by content when no checksum in path)", result.Total) } }