From 7cab66b3accdda68cd90045d0aae4da1473d31cc Mon Sep 17 00:00:00 2001 From: Nat Torkington Date: Sun, 21 Jun 2026 15:58:48 +1200 Subject: [PATCH 01/11] fix(fbmessenger): stop collapsing multiple hashless attachments to one row MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit UpsertAttachment dedupes attachments with an empty content_hash to a single row per message (SELECT-then-insert on message_id). The fbmessenger importer stored an empty content_hash whenever an attachment file was missing or no attachments dir was configured, so a Messenger message with several photos whose files were absent recorded only ONE attachment row — the rest were silently dropped. Give hashless attachments a stable synthetic content_hash (sha256 of the export-relative URI, not file bytes) so siblings coexist and re-imports stay idempotent. storage_path stays empty, so no stored content is implied and the file-cleanup paths (which filter on non-empty storage_path) are unaffected. Audit of the other UpsertAttachment callers found no further instances: gmail and the generic ingest path always carry a real MIME content hash, synctechsms always hashes the part bytes, whatsapp stores at most one attachment per message, and teams already uses a synthetic link hash. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_012Ri7QdvXXUMQPke9wrLjSS --- internal/fbmessenger/importer.go | 40 +++++++++++++++----- internal/fbmessenger/importer_test.go | 53 +++++++++++++++++++++++++-- 2 files changed, 80 insertions(+), 13 deletions(-) diff --git a/internal/fbmessenger/importer.go b/internal/fbmessenger/importer.go index f35f62e21..903901f79 100644 --- a/internal/fbmessenger/importer.go +++ b/internal/fbmessenger/importer.go @@ -690,21 +690,19 @@ func writeThreadToStore( } // Attachments. - for _, att := range m.Attachments { + for ai := range m.Attachments { + att := m.Attachments[ai] summary.AttachmentsFound++ storagePath, contentHash, size := handleAttachment(att, opts.AttachmentsDir) if storagePath != "" { summary.AttachmentsStored++ } - if storagePath != "" || contentHash != "" || att.AbsPath != "" { - if err := st.UpsertAttachment(messageID, att.Filename, att.MimeType, storagePath, contentHash, size); err != nil { - logger.Warn("fbmessenger: upsert attachment", "err", err) - } - } else { - // Empty row so the user sees a trace that something was referenced. - if err := st.UpsertAttachment(messageID, att.Filename, att.MimeType, "", "", 0); err != nil { - logger.Warn("fbmessenger: upsert attachment (empty)", "err", err) - } + hash := contentHash + if hash == "" { + hash = fbAttachmentHash(att, ai) + } + if err := st.UpsertAttachment(messageID, att.Filename, att.MimeType, storagePath, hash, size); err != nil { + logger.Warn("fbmessenger: upsert attachment", "err", err) } } @@ -859,3 +857,25 @@ func handleAttachment(att Attachment, attachmentsDir string) (string, string, in } return rel, contentHash, int(info.Size()) } + +// fbAttachmentHash derives a stable synthetic content hash for an attachment +// that has no real (content-derived) hash — e.g. the file is missing or no +// attachments dir was configured. Without it, UpsertAttachment collapses every +// hashless attachment on a message to a single row. Keyed on the export-relative +// URI (present even when the file is missing), then AbsPath/Filename, with the +// loop index as a last-resort tiebreaker, so re-importing the same export is +// idempotent. It is NOT a hash of file bytes, so storage_path stays empty. +func fbAttachmentHash(att Attachment, idx int) string { + key := att.URI + if key == "" { + key = att.AbsPath + } + if key == "" { + key = att.Filename + } + if key == "" { + key = fmt.Sprintf("idx-%d", idx) + } + sum := sha256.Sum256([]byte("fbmessenger\x00" + key)) + return hex.EncodeToString(sum[:]) +} diff --git a/internal/fbmessenger/importer_test.go b/internal/fbmessenger/importer_test.go index 978635149..76dd11fb5 100644 --- a/internal/fbmessenger/importer_test.go +++ b/internal/fbmessenger/importer_test.go @@ -283,7 +283,7 @@ func TestImportDYI_AttachmentPathEscapeRejected(t *testing.T) { err = st.DB().QueryRow("SELECT storage_path, content_hash FROM attachments LIMIT 1").Scan(&sp, &ch) require.NoError(err) assert.Empty(sp, "storage_path: path escape not rejected") - assert.Empty(ch, "content_hash: path escape not rejected") + assert.NotEmpty(ch, "content_hash: rejected attachment now gets a synthetic (non-content) hash") } // TestImportDYI_AttachmentSymlinkRejected verifies that an attachment URI @@ -327,7 +327,10 @@ func TestImportDYI_AttachmentSymlinkRejected(t *testing.T) { err = st.DB().QueryRow("SELECT storage_path, content_hash FROM attachments LIMIT 1").Scan(&sp, &ch) require.NoError(err) assert.Empty(sp, "storage_path: symlinked attachment not rejected") - assert.Empty(ch, "content_hash: symlinked attachment not rejected") + assert.NotEmpty(ch, "content_hash: rejected attachment now gets a synthetic (non-content) hash") + // The synthetic hash must never be the hash of the secret's contents. + leak := fmt.Sprintf("%x", sha256.Sum256([]byte("password=hunter2"))) + assert.NotEqual(leak, ch, "content_hash must never be the secret's content hash") // Defense in depth: assert nothing under attachmentsDir contains the // secret bytes, so even a future copy regression would be caught. _ = filepath.Walk(attachmentsDir, func(p string, info os.FileInfo, err error) error { @@ -362,7 +365,51 @@ func TestImportDYI_MissingAttachment(t *testing.T) { err = st.DB().QueryRow("SELECT storage_path, content_hash FROM attachments LIMIT 1").Scan(&sp, &ch) require.NoError(err) assert.Empty(sp, "storage_path: missing attachment should have empty storage_path") - assert.Empty(ch, "content_hash: missing attachment should have empty content_hash") + assert.NotEmpty(ch, "missing attachment now gets a synthetic (non-content) hash so siblings aren't dropped") +} + +// TestImportDYI_MultipleMissingAttachments verifies that a single message +// with multiple missing photos records one attachment row per photo, rather +// than collapsing them to a single hashless row. Each row gets a stable, +// distinct synthetic content_hash while storage_path stays empty (no bytes +// copied). +func TestImportDYI_MultipleMissingAttachments(t *testing.T) { + require := requirepkg.New(t) + assert := assertpkg.New(t) + st := testutil.NewTestStore(t) + tmp := t.TempDir() + threadPath := filepath.Join(tmp, "your_activity_across_facebook", "messages", "inbox", "missing_MIS") + require.NoError(os.MkdirAll(threadPath, 0755)) + body := `{"participants":[{"name":"A"},{"name":"B"}],"messages":[ +{"sender_name":"A","timestamp_ms":1600000000000,"type":"Generic","photos":[{"uri":"messages/inbox/missing_MIS/photos/a.png"},{"uri":"messages/inbox/missing_MIS/photos/b.png"}]} +],"title":"x"}` + require.NoError(os.WriteFile(filepath.Join(threadPath, "message_1.json"), []byte(body), 0644)) + summary, err := ImportDYI(context.Background(), st, ImportOptions{ + Me: "test.user@facebook.messenger", + RootDir: tmp, + AttachmentsDir: t.TempDir(), + }) + require.NoError(err) + assert.False(summary.HardErrors, "HardErrors") + + var count int + require.NoError(st.DB().QueryRow("SELECT COUNT(*) FROM attachments").Scan(&count)) + assert.Equal(2, count, "both missing attachments should be recorded as distinct rows") + + rows, err := st.DB().Query("SELECT storage_path, content_hash FROM attachments ORDER BY id") + require.NoError(err) + defer rows.Close() + var hashes []string + for rows.Next() { + var sp, ch string + require.NoError(rows.Scan(&sp, &ch)) + assert.Empty(sp, "storage_path: missing attachment should have empty storage_path") + assert.NotEmpty(ch, "content_hash: missing attachment should have synthetic non-empty hash") + hashes = append(hashes, ch) + } + require.NoError(rows.Err()) + require.Len(hashes, 2) + assert.NotEqual(hashes[0], hashes[1], "the two synthetic hashes must be distinct") } // TestImportDYI_ReactionsFirstClass verifies reaction rows and the From 0da95caa9c4b248500d8035a574ee86ba4075a76 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 24 Jun 2026 20:22:29 -0500 Subject: [PATCH 02/11] fix(fbmessenger): mark synthetic attachment keys Messenger imports still need a stable per-attachment identity when files are missing, rejected, or skipped because no attachments directory is configured. Without that identity, the store's empty-hash fallback collapses multiple hashless attachments on the same message into one row. The previous fallback identity looked exactly like a SHA-256 content hash even though no bytes were stored. Prefix the synthetic key so JSON output and export flows cannot confuse it with content-addressed attachment data, while preserving idempotent re-import behavior. Validation: focused Messenger attachment tests were run red/green; the new assertions failed against bare 64-hex fallback keys before the importer change and passed after prefixing them. Generated with Codex (GPT-5) Co-authored-by: Codex --- internal/fbmessenger/importer.go | 9 ++++++--- internal/fbmessenger/importer_test.go | 23 +++++++++++++++-------- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/internal/fbmessenger/importer.go b/internal/fbmessenger/importer.go index 903901f79..8575b9b69 100644 --- a/internal/fbmessenger/importer.go +++ b/internal/fbmessenger/importer.go @@ -858,13 +858,16 @@ func handleAttachment(att Attachment, attachmentsDir string) (string, string, in return rel, contentHash, int(info.Size()) } -// fbAttachmentHash derives a stable synthetic content hash for an attachment +const fbSyntheticAttachmentKeyPrefix = "fbmessenger:attachment:" + +// fbAttachmentHash derives a stable synthetic attachment key for an attachment // that has no real (content-derived) hash — e.g. the file is missing or no // attachments dir was configured. Without it, UpsertAttachment collapses every // hashless attachment on a message to a single row. Keyed on the export-relative // URI (present even when the file is missing), then AbsPath/Filename, with the // loop index as a last-resort tiebreaker, so re-importing the same export is -// idempotent. It is NOT a hash of file bytes, so storage_path stays empty. +// idempotent. The prefix prevents downstream export paths from mistaking it for +// a SHA-256 hash of stored bytes, so storage_path stays empty. func fbAttachmentHash(att Attachment, idx int) string { key := att.URI if key == "" { @@ -877,5 +880,5 @@ func fbAttachmentHash(att Attachment, idx int) string { key = fmt.Sprintf("idx-%d", idx) } sum := sha256.Sum256([]byte("fbmessenger\x00" + key)) - return hex.EncodeToString(sum[:]) + return fbSyntheticAttachmentKeyPrefix + hex.EncodeToString(sum[:]) } diff --git a/internal/fbmessenger/importer_test.go b/internal/fbmessenger/importer_test.go index 76dd11fb5..a305b4836 100644 --- a/internal/fbmessenger/importer_test.go +++ b/internal/fbmessenger/importer_test.go @@ -42,6 +42,12 @@ func countMessages(t *testing.T, st *store.Store, where string) int { return n } +func assertSyntheticAttachmentKey(t *testing.T, got string) { + t.Helper() + assertpkg.Regexp(t, `^fbmessenger:attachment:[0-9a-f]{64}$`, got, "synthetic attachment key") + assertpkg.NotRegexp(t, `^[0-9a-f]{64}$`, got, "synthetic attachment key must not look like a real SHA-256 content hash") +} + func TestImportDYI_JSONSimple(t *testing.T) { assert := assertpkg.New(t) st := testutil.NewTestStore(t) @@ -278,12 +284,13 @@ func TestImportDYI_AttachmentPathEscapeRejected(t *testing.T) { }) require.NoError(err) assert.False(summary.HardErrors, "HardErrors") - // Exactly one attachment row, with empty storage_path and content_hash. + // Exactly one attachment row with empty storage_path and a synthetic + // key that cannot be mistaken for a real content hash. var sp, ch string err = st.DB().QueryRow("SELECT storage_path, content_hash FROM attachments LIMIT 1").Scan(&sp, &ch) require.NoError(err) assert.Empty(sp, "storage_path: path escape not rejected") - assert.NotEmpty(ch, "content_hash: rejected attachment now gets a synthetic (non-content) hash") + assertSyntheticAttachmentKey(t, ch) } // TestImportDYI_AttachmentSymlinkRejected verifies that an attachment URI @@ -327,7 +334,7 @@ func TestImportDYI_AttachmentSymlinkRejected(t *testing.T) { err = st.DB().QueryRow("SELECT storage_path, content_hash FROM attachments LIMIT 1").Scan(&sp, &ch) require.NoError(err) assert.Empty(sp, "storage_path: symlinked attachment not rejected") - assert.NotEmpty(ch, "content_hash: rejected attachment now gets a synthetic (non-content) hash") + assertSyntheticAttachmentKey(t, ch) // The synthetic hash must never be the hash of the secret's contents. leak := fmt.Sprintf("%x", sha256.Sum256([]byte("password=hunter2"))) assert.NotEqual(leak, ch, "content_hash must never be the secret's content hash") @@ -365,14 +372,14 @@ func TestImportDYI_MissingAttachment(t *testing.T) { err = st.DB().QueryRow("SELECT storage_path, content_hash FROM attachments LIMIT 1").Scan(&sp, &ch) require.NoError(err) assert.Empty(sp, "storage_path: missing attachment should have empty storage_path") - assert.NotEmpty(ch, "missing attachment now gets a synthetic (non-content) hash so siblings aren't dropped") + assertSyntheticAttachmentKey(t, ch) } // TestImportDYI_MultipleMissingAttachments verifies that a single message // with multiple missing photos records one attachment row per photo, rather // than collapsing them to a single hashless row. Each row gets a stable, -// distinct synthetic content_hash while storage_path stays empty (no bytes -// copied). +// distinct synthetic key that is not a real content hash while storage_path +// stays empty (no bytes copied). func TestImportDYI_MultipleMissingAttachments(t *testing.T) { require := requirepkg.New(t) assert := assertpkg.New(t) @@ -398,13 +405,13 @@ func TestImportDYI_MultipleMissingAttachments(t *testing.T) { rows, err := st.DB().Query("SELECT storage_path, content_hash FROM attachments ORDER BY id") require.NoError(err) - defer rows.Close() + defer func() { require.NoError(rows.Close(), "close attachment rows") }() var hashes []string for rows.Next() { var sp, ch string require.NoError(rows.Scan(&sp, &ch)) assert.Empty(sp, "storage_path: missing attachment should have empty storage_path") - assert.NotEmpty(ch, "content_hash: missing attachment should have synthetic non-empty hash") + assertSyntheticAttachmentKey(t, ch) hashes = append(hashes, ch) } require.NoError(rows.Err()) From 665d0b02bcf2e1046a2ae5d37cfb6bbe59940996 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 24 Jun 2026 20:41:54 -0500 Subject: [PATCH 03/11] fix(fbmessenger): migrate legacy hashless attachments Older Messenger imports could leave an empty content_hash attachment row for a missing or skipped attachment. After synthetic keys became non-empty, re-importing the same export no longer hit the store's empty-hash dedupe path, so the legacy row could survive beside the new synthetic-key row. Remove those legacy empty-hash, empty-storage rows when writing a synthetic Messenger attachment key so upgrade re-imports keep one attachment row per referenced attachment without restoring SHA-looking fake content hashes. Validation: focused regression test was run red/green; it failed with two rows before the importer cleanup and passed after deleting the legacy empty-hash row. Generated with Codex (GPT-5) Co-authored-by: Codex --- internal/fbmessenger/importer.go | 13 ++++++++ internal/fbmessenger/importer_test.go | 44 +++++++++++++++++++++++++++ 2 files changed, 57 insertions(+) diff --git a/internal/fbmessenger/importer.go b/internal/fbmessenger/importer.go index 8575b9b69..e94dcbade 100644 --- a/internal/fbmessenger/importer.go +++ b/internal/fbmessenger/importer.go @@ -700,6 +700,9 @@ func writeThreadToStore( hash := contentHash if hash == "" { hash = fbAttachmentHash(att, ai) + if err := deleteLegacyHashlessAttachment(st, messageID); err != nil { + logger.Warn("fbmessenger: delete legacy hashless attachment", "err", err) + } } if err := st.UpsertAttachment(messageID, att.Filename, att.MimeType, storagePath, hash, size); err != nil { logger.Warn("fbmessenger: upsert attachment", "err", err) @@ -860,6 +863,16 @@ func handleAttachment(att Attachment, attachmentsDir string) (string, string, in const fbSyntheticAttachmentKeyPrefix = "fbmessenger:attachment:" +func deleteLegacyHashlessAttachment(st *store.Store, messageID int64) error { + _, err := st.DB().Exec(st.Rebind(` + DELETE FROM attachments + WHERE message_id = ? + AND (content_hash IS NULL OR content_hash = '') + AND storage_path = '' + `), messageID) + return err +} + // fbAttachmentHash derives a stable synthetic attachment key for an attachment // that has no real (content-derived) hash — e.g. the file is missing or no // attachments dir was configured. Without it, UpsertAttachment collapses every diff --git a/internal/fbmessenger/importer_test.go b/internal/fbmessenger/importer_test.go index a305b4836..e5a4b1879 100644 --- a/internal/fbmessenger/importer_test.go +++ b/internal/fbmessenger/importer_test.go @@ -375,6 +375,50 @@ func TestImportDYI_MissingAttachment(t *testing.T) { assertSyntheticAttachmentKey(t, ch) } +func TestImportDYI_MissingAttachmentReimportMigratesLegacyEmptyHashRow(t *testing.T) { + require := requirepkg.New(t) + assert := assertpkg.New(t) + st := testutil.NewTestStore(t) + tmp := t.TempDir() + threadPath := filepath.Join(tmp, "your_activity_across_facebook", "messages", "inbox", "missing_MIS") + require.NoError(os.MkdirAll(threadPath, 0755)) + body := `{"participants":[{"name":"A"},{"name":"B"}],"messages":[ +{"sender_name":"A","timestamp_ms":1600000000000,"type":"Generic","photos":[{"uri":"messages/inbox/missing_MIS/photos/gone.png"}]} +],"title":"x"}` + require.NoError(os.WriteFile(filepath.Join(threadPath, "message_1.json"), []byte(body), 0644)) + opts := ImportOptions{ + Me: "test.user@facebook.messenger", + RootDir: tmp, + AttachmentsDir: t.TempDir(), + NoResume: true, + } + _, err := ImportDYI(context.Background(), st, opts) + require.NoError(err) + + var messageID int64 + err = st.DB().QueryRow("SELECT id FROM messages WHERE source_message_id = 'inbox/missing_MIS__0'").Scan(&messageID) + require.NoError(err, "select imported message id") + _, err = st.DB().Exec(st.Rebind("DELETE FROM attachments WHERE message_id = ?"), messageID) + require.NoError(err, "delete current synthetic attachment") + require.NoError(st.UpsertAttachment(messageID, "gone.png", "", "", "", 0), "seed legacy empty-hash attachment") + + _, err = ImportDYI(context.Background(), st, opts) + require.NoError(err) + + var count int + err = st.DB().QueryRow(st.Rebind("SELECT COUNT(*) FROM attachments WHERE message_id = ?"), messageID).Scan(&count) + require.NoError(err, "count attachments") + require.Equal(1, count, "legacy empty-hash row should not survive beside synthetic key") + + var sp, ch string + err = st.DB().QueryRow(st.Rebind( + "SELECT storage_path, content_hash FROM attachments WHERE message_id = ?", + ), messageID).Scan(&sp, &ch) + require.NoError(err, "select migrated attachment") + assert.Empty(sp, "storage_path") + assertSyntheticAttachmentKey(t, ch) +} + // TestImportDYI_MultipleMissingAttachments verifies that a single message // with multiple missing photos records one attachment row per photo, rather // than collapsing them to a single hashless row. Each row gets a stable, From bf38cdf2d1e30b85262daeb7f169c3a01d05b5a9 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 24 Jun 2026 22:01:11 -0500 Subject: [PATCH 04/11] fix(fbmessenger): clean legacy rows before attachment upsert Legacy Messenger imports could leave an empty content_hash row when attachment storage was skipped. After the synthetic-key fix, cleanup ran only for hashless reimports, so upgrading the same export with an attachments directory and present file still left the legacy row beside the newly stored content-hash row. Run the narrow legacy empty-hash, empty-storage cleanup before every Messenger attachment upsert. This preserves the synthetic-key upgrade path and also covers upgrades that now produce real stored attachment content. Validation: focused real-storage upgrade regression failed with two rows before moving the cleanup and passed after it ran before all attachment upserts. Generated with Codex (GPT-5) Co-authored-by: Codex --- internal/fbmessenger/importer.go | 6 ++-- internal/fbmessenger/importer_test.go | 48 +++++++++++++++++++++++++++ 2 files changed, 51 insertions(+), 3 deletions(-) diff --git a/internal/fbmessenger/importer.go b/internal/fbmessenger/importer.go index e94dcbade..e4b932850 100644 --- a/internal/fbmessenger/importer.go +++ b/internal/fbmessenger/importer.go @@ -697,12 +697,12 @@ func writeThreadToStore( if storagePath != "" { summary.AttachmentsStored++ } + if err := deleteLegacyHashlessAttachment(st, messageID); err != nil { + logger.Warn("fbmessenger: delete legacy hashless attachment", "err", err) + } hash := contentHash if hash == "" { hash = fbAttachmentHash(att, ai) - if err := deleteLegacyHashlessAttachment(st, messageID); err != nil { - logger.Warn("fbmessenger: delete legacy hashless attachment", "err", err) - } } if err := st.UpsertAttachment(messageID, att.Filename, att.MimeType, storagePath, hash, size); err != nil { logger.Warn("fbmessenger: upsert attachment", "err", err) diff --git a/internal/fbmessenger/importer_test.go b/internal/fbmessenger/importer_test.go index e5a4b1879..727d9b33c 100644 --- a/internal/fbmessenger/importer_test.go +++ b/internal/fbmessenger/importer_test.go @@ -264,6 +264,54 @@ func TestImportDYI_AttachmentStorage(t *testing.T) { assert.Equal(int64(len(png)), size, "size") } +func TestImportDYI_AttachmentStorageReimportMigratesLegacyEmptyHashRow(t *testing.T) { + require := requirepkg.New(t) + assert := assertpkg.New(t) + st := testutil.NewTestStore(t) + attachDir := t.TempDir() + opts := ImportOptions{ + Me: "test.user@facebook.messenger", + RootDir: "testdata/json_with_media", + Format: "auto", + NoResume: true, + } + _, err := ImportDYI(context.Background(), st, opts) + require.NoError(err) + + var messageID int64 + err = st.DB().QueryRow("SELECT id FROM messages WHERE source_message_id = 'inbox/bob_XYZ789__0'").Scan(&messageID) + require.NoError(err, "select imported message id") + _, err = st.DB().Exec(st.Rebind("DELETE FROM attachments WHERE message_id = ?"), messageID) + require.NoError(err, "delete current synthetic attachment") + require.NoError(st.UpsertAttachment(messageID, "tiny.png", "image/png", "", "", 0), "seed legacy empty-hash attachment") + + opts.AttachmentsDir = attachDir + _, err = ImportDYI(context.Background(), st, opts) + require.NoError(err) + + var count int + err = st.DB().QueryRow(st.Rebind("SELECT COUNT(*) FROM attachments WHERE message_id = ?"), messageID).Scan(&count) + require.NoError(err, "count attachments") + require.Equal(1, count, "legacy empty-hash row should not survive beside stored attachment") + + png, err := os.ReadFile("testdata/json_with_media/your_activity_across_facebook/messages/inbox/bob_XYZ789/photos/tiny.png") + require.NoError(err) + wantHash := fmt.Sprintf("%x", sha256.Sum256(png)) + + var contentHash, storagePath string + var size int64 + err = st.DB().QueryRow(st.Rebind( + "SELECT content_hash, storage_path, size FROM attachments WHERE message_id = ?", + ), messageID).Scan(&contentHash, &storagePath, &size) + require.NoError(err, "select stored attachment") + assert.Equal(wantHash, contentHash, "content_hash") + assert.NotEmpty(storagePath, "storage_path") + assert.Equal(int64(len(png)), size, "size") + got, err := os.ReadFile(filepath.Join(attachDir, storagePath)) + require.NoError(err, "stored file") + assert.Equal(string(png), string(got), "stored bytes") +} + func TestImportDYI_AttachmentPathEscapeRejected(t *testing.T) { require := requirepkg.New(t) assert := assertpkg.New(t) From e20d8ed08ee5fb2b467f64b564ed18d5f945c739 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 24 Jun 2026 22:38:21 -0500 Subject: [PATCH 05/11] fix(fbmessenger): remove synthetic placeholders on storage Messenger imports can first record a synthetic attachment placeholder when storage is disabled or a referenced file is unavailable. If a later reimport stores the real attachment bytes, that deterministic synthetic row must be removed or the message advertises both the placeholder and the real content. Delete the matching empty-storage synthetic placeholder after a real content-hash upsert succeeds, while keeping the legacy empty-hash cleanup on the same successful-upsert path. Validation: focused synthetic-placeholder upgrade regression failed with two rows before the cleanup and passed after deleting the deterministic placeholder. Generated with Codex (GPT-5) Co-authored-by: Codex --- internal/fbmessenger/importer.go | 25 +++++++++-- internal/fbmessenger/importer_test.go | 60 +++++++++++++++++++++++++++ 2 files changed, 81 insertions(+), 4 deletions(-) diff --git a/internal/fbmessenger/importer.go b/internal/fbmessenger/importer.go index e4b932850..3ace6daf6 100644 --- a/internal/fbmessenger/importer.go +++ b/internal/fbmessenger/importer.go @@ -697,15 +697,22 @@ func writeThreadToStore( if storagePath != "" { summary.AttachmentsStored++ } - if err := deleteLegacyHashlessAttachment(st, messageID); err != nil { - logger.Warn("fbmessenger: delete legacy hashless attachment", "err", err) - } hash := contentHash + syntheticHash := fbAttachmentHash(att, ai) if hash == "" { - hash = fbAttachmentHash(att, ai) + hash = syntheticHash } if err := st.UpsertAttachment(messageID, att.Filename, att.MimeType, storagePath, hash, size); err != nil { logger.Warn("fbmessenger: upsert attachment", "err", err) + continue + } + if err := deleteLegacyHashlessAttachment(st, messageID); err != nil { + logger.Warn("fbmessenger: delete legacy hashless attachment", "err", err) + } + if contentHash != "" { + if err := deleteSyntheticPlaceholderAttachment(st, messageID, syntheticHash); err != nil { + logger.Warn("fbmessenger: delete synthetic attachment placeholder", "err", err) + } } } @@ -873,6 +880,16 @@ func deleteLegacyHashlessAttachment(st *store.Store, messageID int64) error { return err } +func deleteSyntheticPlaceholderAttachment(st *store.Store, messageID int64, contentHash string) error { + _, err := st.DB().Exec(st.Rebind(` + DELETE FROM attachments + WHERE message_id = ? + AND content_hash = ? + AND storage_path = '' + `), messageID, contentHash) + return err +} + // fbAttachmentHash derives a stable synthetic attachment key for an attachment // that has no real (content-derived) hash — e.g. the file is missing or no // attachments dir was configured. Without it, UpsertAttachment collapses every diff --git a/internal/fbmessenger/importer_test.go b/internal/fbmessenger/importer_test.go index 727d9b33c..d700c5700 100644 --- a/internal/fbmessenger/importer_test.go +++ b/internal/fbmessenger/importer_test.go @@ -312,6 +312,66 @@ func TestImportDYI_AttachmentStorageReimportMigratesLegacyEmptyHashRow(t *testin assert.Equal(string(png), string(got), "stored bytes") } +func TestImportDYI_AttachmentStorageReimportRemovesSyntheticPlaceholder(t *testing.T) { + require := requirepkg.New(t) + assert := assertpkg.New(t) + st := testutil.NewTestStore(t) + attachDir := t.TempDir() + opts := ImportOptions{ + Me: "test.user@facebook.messenger", + RootDir: "testdata/json_with_media", + Format: "auto", + NoResume: true, + } + _, err := ImportDYI(context.Background(), st, opts) + require.NoError(err) + + var messageID int64 + err = st.DB().QueryRow("SELECT id FROM messages WHERE source_message_id = 'inbox/bob_XYZ789__0'").Scan(&messageID) + require.NoError(err, "select imported message id") + + var placeholderHash, placeholderPath string + err = st.DB().QueryRow(st.Rebind( + "SELECT content_hash, storage_path FROM attachments WHERE message_id = ?", + ), messageID).Scan(&placeholderHash, &placeholderPath) + require.NoError(err, "select synthetic placeholder") + assertSyntheticAttachmentKey(t, placeholderHash) + assert.Empty(placeholderPath, "synthetic placeholder storage_path") + + opts.AttachmentsDir = attachDir + _, err = ImportDYI(context.Background(), st, opts) + require.NoError(err) + + var count int + err = st.DB().QueryRow(st.Rebind("SELECT COUNT(*) FROM attachments WHERE message_id = ?"), messageID).Scan(&count) + require.NoError(err, "count attachments") + require.Equal(1, count, "synthetic placeholder should not survive beside stored attachment") + + var placeholderCount int + err = st.DB().QueryRow(st.Rebind( + "SELECT COUNT(*) FROM attachments WHERE message_id = ? AND content_hash = ?", + ), messageID, placeholderHash).Scan(&placeholderCount) + require.NoError(err, "count synthetic placeholder") + assert.Equal(0, placeholderCount, "synthetic placeholder should be removed") + + png, err := os.ReadFile("testdata/json_with_media/your_activity_across_facebook/messages/inbox/bob_XYZ789/photos/tiny.png") + require.NoError(err) + wantHash := fmt.Sprintf("%x", sha256.Sum256(png)) + + var contentHash, storagePath string + var size int64 + err = st.DB().QueryRow(st.Rebind( + "SELECT content_hash, storage_path, size FROM attachments WHERE message_id = ?", + ), messageID).Scan(&contentHash, &storagePath, &size) + require.NoError(err, "select stored attachment") + assert.Equal(wantHash, contentHash, "content_hash") + assert.NotEmpty(storagePath, "storage_path") + assert.Equal(int64(len(png)), size, "size") + got, err := os.ReadFile(filepath.Join(attachDir, storagePath)) + require.NoError(err, "stored file") + assert.Equal(string(png), string(got), "stored bytes") +} + func TestImportDYI_AttachmentPathEscapeRejected(t *testing.T) { require := requirepkg.New(t) assert := assertpkg.New(t) From f653b9139e77068e9059eaa2797b4b63f5e3f2f2 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 24 Jun 2026 22:47:15 -0500 Subject: [PATCH 06/11] fix(fbmessenger): keep placeholders on storage failure Messenger attachment storage can compute a real content hash before failing to create or write the content-addressed file. Recording that real hash with an empty storage_path blocks a later successful reimport because the store treats the same message/hash pair as already present. Keep the synthetic placeholder identity unless a real storage path was produced. The deterministic placeholder can then remain visible after a failed storage attempt and still be replaced by the real content row once storage succeeds. Validation: focused storage-failure regression failed by recording a real hash with empty storage_path before the fix and passed after preserving the synthetic placeholder through the failed import. Generated with Codex (GPT-5) Co-authored-by: Codex --- internal/fbmessenger/importer.go | 7 +-- internal/fbmessenger/importer_test.go | 76 +++++++++++++++++++++++++++ 2 files changed, 80 insertions(+), 3 deletions(-) diff --git a/internal/fbmessenger/importer.go b/internal/fbmessenger/importer.go index 3ace6daf6..7d634a255 100644 --- a/internal/fbmessenger/importer.go +++ b/internal/fbmessenger/importer.go @@ -697,9 +697,10 @@ func writeThreadToStore( if storagePath != "" { summary.AttachmentsStored++ } - hash := contentHash syntheticHash := fbAttachmentHash(att, ai) - if hash == "" { + hash := contentHash + storedContent := storagePath != "" && contentHash != "" + if !storedContent { hash = syntheticHash } if err := st.UpsertAttachment(messageID, att.Filename, att.MimeType, storagePath, hash, size); err != nil { @@ -709,7 +710,7 @@ func writeThreadToStore( if err := deleteLegacyHashlessAttachment(st, messageID); err != nil { logger.Warn("fbmessenger: delete legacy hashless attachment", "err", err) } - if contentHash != "" { + if storedContent { if err := deleteSyntheticPlaceholderAttachment(st, messageID, syntheticHash); err != nil { logger.Warn("fbmessenger: delete synthetic attachment placeholder", "err", err) } diff --git a/internal/fbmessenger/importer_test.go b/internal/fbmessenger/importer_test.go index d700c5700..5767c66da 100644 --- a/internal/fbmessenger/importer_test.go +++ b/internal/fbmessenger/importer_test.go @@ -372,6 +372,82 @@ func TestImportDYI_AttachmentStorageReimportRemovesSyntheticPlaceholder(t *testi assert.Equal(string(png), string(got), "stored bytes") } +func TestImportDYI_AttachmentStorageFailureKeepsSyntheticPlaceholder(t *testing.T) { + require := requirepkg.New(t) + assert := assertpkg.New(t) + st := testutil.NewTestStore(t) + opts := ImportOptions{ + Me: "test.user@facebook.messenger", + RootDir: "testdata/json_with_media", + Format: "auto", + NoResume: true, + } + _, err := ImportDYI(context.Background(), st, opts) + require.NoError(err) + + var messageID int64 + err = st.DB().QueryRow("SELECT id FROM messages WHERE source_message_id = 'inbox/bob_XYZ789__0'").Scan(&messageID) + require.NoError(err, "select imported message id") + + var placeholderHash string + err = st.DB().QueryRow(st.Rebind( + "SELECT content_hash FROM attachments WHERE message_id = ?", + ), messageID).Scan(&placeholderHash) + require.NoError(err, "select synthetic placeholder") + assertSyntheticAttachmentKey(t, placeholderHash) + + png, err := os.ReadFile("testdata/json_with_media/your_activity_across_facebook/messages/inbox/bob_XYZ789/photos/tiny.png") + require.NoError(err) + wantHash := fmt.Sprintf("%x", sha256.Sum256(png)) + + badAttachRoot := filepath.Join(t.TempDir(), "attachments-file") + require.NoError(os.WriteFile(badAttachRoot, []byte("not a directory"), 0600), "write bad attachment root") + opts.AttachmentsDir = badAttachRoot + _, err = ImportDYI(context.Background(), st, opts) + require.NoError(err) + + var count int + err = st.DB().QueryRow(st.Rebind("SELECT COUNT(*) FROM attachments WHERE message_id = ?"), messageID).Scan(&count) + require.NoError(err, "count attachments after failed storage") + require.Equal(1, count, "failed storage should keep only the synthetic placeholder") + + var contentHash, storagePath string + err = st.DB().QueryRow(st.Rebind( + "SELECT content_hash, storage_path FROM attachments WHERE message_id = ?", + ), messageID).Scan(&contentHash, &storagePath) + require.NoError(err, "select placeholder after failed storage") + assert.Equal(placeholderHash, contentHash, "content_hash after failed storage") + assert.Empty(storagePath, "storage_path after failed storage") + + var realHashRows int + err = st.DB().QueryRow(st.Rebind( + "SELECT COUNT(*) FROM attachments WHERE message_id = ? AND content_hash = ?", + ), messageID, wantHash).Scan(&realHashRows) + require.NoError(err, "count real-hash rows after failed storage") + assert.Equal(0, realHashRows, "failed storage must not record a real hash with empty storage_path") + + goodAttachRoot := t.TempDir() + opts.AttachmentsDir = goodAttachRoot + _, err = ImportDYI(context.Background(), st, opts) + require.NoError(err) + + err = st.DB().QueryRow(st.Rebind("SELECT COUNT(*) FROM attachments WHERE message_id = ?"), messageID).Scan(&count) + require.NoError(err, "count attachments after successful storage") + require.Equal(1, count, "successful storage should replace the synthetic placeholder") + + var size int64 + err = st.DB().QueryRow(st.Rebind( + "SELECT content_hash, storage_path, size FROM attachments WHERE message_id = ?", + ), messageID).Scan(&contentHash, &storagePath, &size) + require.NoError(err, "select stored attachment after recovery") + assert.Equal(wantHash, contentHash, "content_hash after recovery") + assert.NotEmpty(storagePath, "storage_path after recovery") + assert.Equal(int64(len(png)), size, "size after recovery") + got, err := os.ReadFile(filepath.Join(goodAttachRoot, storagePath)) + require.NoError(err, "stored file after recovery") + assert.Equal(string(png), string(got), "stored bytes after recovery") +} + func TestImportDYI_AttachmentPathEscapeRejected(t *testing.T) { require := requirepkg.New(t) assert := assertpkg.New(t) From 285c5bd05ecd39e77e090170c85962c804460e40 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 24 Jun 2026 23:02:38 -0500 Subject: [PATCH 07/11] fix(fbmessenger): repair failed stored attachments A prior failed storage attempt can leave a Messenger attachment row with a real content_hash but an empty storage_path. Because attachments conflict on message_id and content_hash, a later successful import would hit the existing row and leave it unrepaired. When a Messenger attachment is actually stored, remove any same-message same-hash empty-storage row before the upsert so the stored path and size can be written. Keep synthetic placeholder behavior for imports that still do not have stored content. Add a regression test that seeds the stale real-hash empty-path state, reimports with attachment storage enabled, and verifies the stored row and bytes are repaired. Validation: the focused real-hash empty-path regression failed before this cleanup with an empty storage_path and size 0, then passed after deleting the stale row before the stored upsert. Generated with Codex (GPT-5) Co-authored-by: Codex --- internal/fbmessenger/importer.go | 18 ++++++-- internal/fbmessenger/importer_test.go | 60 ++++++++++++++++++++++++--- internal/fbmessenger/types.go | 8 +++- 3 files changed, 75 insertions(+), 11 deletions(-) diff --git a/internal/fbmessenger/importer.go b/internal/fbmessenger/importer.go index 7d634a255..65e9fd46e 100644 --- a/internal/fbmessenger/importer.go +++ b/internal/fbmessenger/importer.go @@ -102,10 +102,10 @@ func ImportDYI(ctx context.Context, st *store.Store, opts ImportOptions) (*Impor } format := strings.ToLower(opts.Format) if format == "" { - format = "auto" + format = formatAuto } switch format { - case "auto", formatJSON, "html", "both": + case formatAuto, formatJSON, "html", "both": // valid default: return nil, fmt.Errorf("fbmessenger: unknown --format %q (valid: auto, json, html, both)", format) @@ -275,7 +275,7 @@ func ImportDYI(ctx context.Context, st *store.Store, opts ImportOptions) (*Impor effective = "html" case "both": // Keep as-is; "both" threads get both parsed. - case "auto": + case formatAuto: if effective == "both" { effective = formatJSON } @@ -702,6 +702,8 @@ func writeThreadToStore( storedContent := storagePath != "" && contentHash != "" if !storedContent { hash = syntheticHash + } else if err := deleteFailedStoredAttachment(st, messageID, contentHash); err != nil { + logger.Warn("fbmessenger: delete failed stored attachment", "err", err) } if err := st.UpsertAttachment(messageID, att.Filename, att.MimeType, storagePath, hash, size); err != nil { logger.Warn("fbmessenger: upsert attachment", "err", err) @@ -891,6 +893,16 @@ func deleteSyntheticPlaceholderAttachment(st *store.Store, messageID int64, cont return err } +func deleteFailedStoredAttachment(st *store.Store, messageID int64, contentHash string) error { + _, err := st.DB().Exec(st.Rebind(` + DELETE FROM attachments + WHERE message_id = ? + AND content_hash = ? + AND storage_path = '' + `), messageID, contentHash) + return err +} + // fbAttachmentHash derives a stable synthetic attachment key for an attachment // that has no real (content-derived) hash — e.g. the file is missing or no // attachments dir was configured. Without it, UpsertAttachment collapses every diff --git a/internal/fbmessenger/importer_test.go b/internal/fbmessenger/importer_test.go index 5767c66da..a7431df6f 100644 --- a/internal/fbmessenger/importer_test.go +++ b/internal/fbmessenger/importer_test.go @@ -22,7 +22,7 @@ func importFixture(t *testing.T, st *store.Store, rootDir string) *ImportSummary opts := ImportOptions{ Me: "test.user@facebook.messenger", RootDir: rootDir, - Format: "auto", + Format: formatAuto, AttachmentsDir: t.TempDir(), } summary, err := ImportDYI(context.Background(), st, opts) @@ -239,7 +239,7 @@ func TestImportDYI_AttachmentStorage(t *testing.T) { opts := ImportOptions{ Me: "test.user@facebook.messenger", RootDir: "testdata/json_with_media", - Format: "auto", + Format: formatAuto, AttachmentsDir: attachDir, } _, err := ImportDYI(context.Background(), st, opts) @@ -272,7 +272,7 @@ func TestImportDYI_AttachmentStorageReimportMigratesLegacyEmptyHashRow(t *testin opts := ImportOptions{ Me: "test.user@facebook.messenger", RootDir: "testdata/json_with_media", - Format: "auto", + Format: formatAuto, NoResume: true, } _, err := ImportDYI(context.Background(), st, opts) @@ -320,7 +320,7 @@ func TestImportDYI_AttachmentStorageReimportRemovesSyntheticPlaceholder(t *testi opts := ImportOptions{ Me: "test.user@facebook.messenger", RootDir: "testdata/json_with_media", - Format: "auto", + Format: formatAuto, NoResume: true, } _, err := ImportDYI(context.Background(), st, opts) @@ -379,7 +379,7 @@ func TestImportDYI_AttachmentStorageFailureKeepsSyntheticPlaceholder(t *testing. opts := ImportOptions{ Me: "test.user@facebook.messenger", RootDir: "testdata/json_with_media", - Format: "auto", + Format: formatAuto, NoResume: true, } _, err := ImportDYI(context.Background(), st, opts) @@ -448,6 +448,54 @@ func TestImportDYI_AttachmentStorageFailureKeepsSyntheticPlaceholder(t *testing. assert.Equal(string(png), string(got), "stored bytes after recovery") } +func TestImportDYI_AttachmentStorageReimportRepairsRealHashEmptyPathRow(t *testing.T) { + require := requirepkg.New(t) + assert := assertpkg.New(t) + st := testutil.NewTestStore(t) + attachDir := t.TempDir() + opts := ImportOptions{ + Me: "test.user@facebook.messenger", + RootDir: "testdata/json_with_media", + Format: formatAuto, + NoResume: true, + } + _, err := ImportDYI(context.Background(), st, opts) + require.NoError(err) + + var messageID int64 + err = st.DB().QueryRow("SELECT id FROM messages WHERE source_message_id = 'inbox/bob_XYZ789__0'").Scan(&messageID) + require.NoError(err, "select imported message id") + png, err := os.ReadFile("testdata/json_with_media/your_activity_across_facebook/messages/inbox/bob_XYZ789/photos/tiny.png") + require.NoError(err) + wantHash := fmt.Sprintf("%x", sha256.Sum256(png)) + + _, err = st.DB().Exec(st.Rebind("DELETE FROM attachments WHERE message_id = ?"), messageID) + require.NoError(err, "delete current synthetic attachment") + require.NoError(st.UpsertAttachment(messageID, "tiny.png", "image/png", "", wantHash, 0), "seed real-hash empty-path attachment") + + opts.AttachmentsDir = attachDir + _, err = ImportDYI(context.Background(), st, opts) + require.NoError(err) + + var count int + err = st.DB().QueryRow(st.Rebind("SELECT COUNT(*) FROM attachments WHERE message_id = ?"), messageID).Scan(&count) + require.NoError(err, "count attachments") + require.Equal(1, count, "real-hash empty-path row should be repaired, not duplicated") + + var contentHash, storagePath string + var size int64 + err = st.DB().QueryRow(st.Rebind( + "SELECT content_hash, storage_path, size FROM attachments WHERE message_id = ?", + ), messageID).Scan(&contentHash, &storagePath, &size) + require.NoError(err, "select repaired attachment") + assert.Equal(wantHash, contentHash, "content_hash") + assert.NotEmpty(storagePath, "storage_path") + assert.Equal(int64(len(png)), size, "size") + got, err := os.ReadFile(filepath.Join(attachDir, storagePath)) + require.NoError(err, "stored file") + assert.Equal(string(png), string(got), "stored bytes") +} + func TestImportDYI_AttachmentPathEscapeRejected(t *testing.T) { require := requirepkg.New(t) assert := assertpkg.New(t) @@ -744,7 +792,7 @@ func TestImportDYI_IsFromMe(t *testing.T) { _, err := ImportDYI(context.Background(), st, ImportOptions{ Me: "test.user@facebook.messenger", RootDir: "testdata/json_simple", - Format: "auto", + Format: formatAuto, AttachmentsDir: t.TempDir(), }) require.NoError(err) diff --git a/internal/fbmessenger/types.go b/internal/fbmessenger/types.go index 9b88c2a80..5fe8223e6 100644 --- a/internal/fbmessenger/types.go +++ b/internal/fbmessenger/types.go @@ -5,8 +5,12 @@ import ( "time" ) -// formatJSON is the Thread.Format value for JSON-sourced threads. -const formatJSON = "json" +const ( + // formatAuto asks the importer to detect Messenger export formats. + formatAuto = "auto" + // formatJSON is the Thread.Format value for JSON-sourced threads. + formatJSON = "json" +) // ErrCorruptJSON is returned by the JSON parser when a message_*.json file // cannot be parsed as valid JSON. Callers should log and skip the thread. From c5a0df3982dfe14c846e7c5745305543faf40437 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 25 Jun 2026 06:57:52 -0500 Subject: [PATCH 08/11] fix(fbmessenger): clean failed hash rows on placeholder reimport A Messenger reimport can compute a real content hash even when storage still fails, then fall back to the synthetic placeholder row. If an older failed import already left the same real hash with an empty storage_path, keeping that row leaves duplicate attachment identities and exposes a SHA-looking hash with no stored file. After the synthetic placeholder upsert succeeds, remove the same-message same-hash empty-storage row so the placeholder remains the only hashless-storage representation until a later successful import can replace it with real content. Validation: seeded the stale real-hash empty-path row into the storage-failure regression; the focused test failed with two rows before the cleanup and passed after removing the stale row on placeholder reimport. Generated with Codex (GPT-5) Co-authored-by: Codex --- internal/fbmessenger/importer.go | 4 ++++ internal/fbmessenger/importer_test.go | 1 + 2 files changed, 5 insertions(+) diff --git a/internal/fbmessenger/importer.go b/internal/fbmessenger/importer.go index 65e9fd46e..a793ed50f 100644 --- a/internal/fbmessenger/importer.go +++ b/internal/fbmessenger/importer.go @@ -716,6 +716,10 @@ func writeThreadToStore( if err := deleteSyntheticPlaceholderAttachment(st, messageID, syntheticHash); err != nil { logger.Warn("fbmessenger: delete synthetic attachment placeholder", "err", err) } + } else if contentHash != "" { + if err := deleteFailedStoredAttachment(st, messageID, contentHash); err != nil { + logger.Warn("fbmessenger: delete failed stored attachment", "err", err) + } } } diff --git a/internal/fbmessenger/importer_test.go b/internal/fbmessenger/importer_test.go index a7431df6f..dd02078ec 100644 --- a/internal/fbmessenger/importer_test.go +++ b/internal/fbmessenger/importer_test.go @@ -399,6 +399,7 @@ func TestImportDYI_AttachmentStorageFailureKeepsSyntheticPlaceholder(t *testing. png, err := os.ReadFile("testdata/json_with_media/your_activity_across_facebook/messages/inbox/bob_XYZ789/photos/tiny.png") require.NoError(err) wantHash := fmt.Sprintf("%x", sha256.Sum256(png)) + require.NoError(st.UpsertAttachment(messageID, "tiny.png", "image/png", "", wantHash, 0), "seed stale real-hash empty-path attachment") badAttachRoot := filepath.Join(t.TempDir(), "attachments-file") require.NoError(os.WriteFile(badAttachRoot, []byte("not a directory"), 0600), "write bad attachment root") From 3db6f3a38193e18fa6fb6986d955b0e29943cb23 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 25 Jun 2026 07:33:55 -0500 Subject: [PATCH 09/11] fix(fbmessenger): clean hashless stale attachment rows A Messenger reimport without an attachments directory, or with a now-missing media file, cannot recompute the real content hash that an older failed storage attempt may have recorded. In that state the importer can still write the deterministic synthetic placeholder, but the stale SHA-looking empty-storage row needs to be removed by the remaining attachment metadata. When a synthetic placeholder import has no content hash available, remove empty-storage 64-character hash rows for the same message, filename, and MIME type. Stored attachments and prefixed synthetic placeholders are left alone. Validation: added a no-attachments-dir reimport regression seeded with a stale real-hash empty-path row; it failed with two rows before the metadata cleanup and passed after the fallback cleanup. Generated with Codex (GPT-5) Co-authored-by: Codex --- internal/fbmessenger/importer.go | 15 +++++++++ internal/fbmessenger/importer_test.go | 45 +++++++++++++++++++++++++++ 2 files changed, 60 insertions(+) diff --git a/internal/fbmessenger/importer.go b/internal/fbmessenger/importer.go index a793ed50f..5dafbc482 100644 --- a/internal/fbmessenger/importer.go +++ b/internal/fbmessenger/importer.go @@ -720,6 +720,8 @@ func writeThreadToStore( if err := deleteFailedStoredAttachment(st, messageID, contentHash); err != nil { logger.Warn("fbmessenger: delete failed stored attachment", "err", err) } + } else if err := deleteFailedStoredAttachmentByMetadata(st, messageID, att.Filename, att.MimeType); err != nil { + logger.Warn("fbmessenger: delete failed stored attachment", "err", err) } } @@ -907,6 +909,19 @@ func deleteFailedStoredAttachment(st *store.Store, messageID int64, contentHash return err } +func deleteFailedStoredAttachmentByMetadata(st *store.Store, messageID int64, filename, mimeType string) error { + _, err := st.DB().Exec(st.Rebind(` + DELETE FROM attachments + WHERE message_id = ? + AND filename = ? + AND mime_type = ? + AND storage_path = '' + AND content_hash <> '' + AND LENGTH(content_hash) = 64 + `), messageID, filename, mimeType) + return err +} + // fbAttachmentHash derives a stable synthetic attachment key for an attachment // that has no real (content-derived) hash — e.g. the file is missing or no // attachments dir was configured. Without it, UpsertAttachment collapses every diff --git a/internal/fbmessenger/importer_test.go b/internal/fbmessenger/importer_test.go index dd02078ec..828bf456a 100644 --- a/internal/fbmessenger/importer_test.go +++ b/internal/fbmessenger/importer_test.go @@ -449,6 +449,51 @@ func TestImportDYI_AttachmentStorageFailureKeepsSyntheticPlaceholder(t *testing. assert.Equal(string(png), string(got), "stored bytes after recovery") } +func TestImportDYI_AttachmentStorageHashlessReimportRemovesStaleRealHashRow(t *testing.T) { + require := requirepkg.New(t) + assert := assertpkg.New(t) + st := testutil.NewTestStore(t) + opts := ImportOptions{ + Me: "test.user@facebook.messenger", + RootDir: "testdata/json_with_media", + Format: formatAuto, + NoResume: true, + } + _, err := ImportDYI(context.Background(), st, opts) + require.NoError(err) + + var messageID int64 + err = st.DB().QueryRow("SELECT id FROM messages WHERE source_message_id = 'inbox/bob_XYZ789__0'").Scan(&messageID) + require.NoError(err, "select imported message id") + var placeholderHash string + err = st.DB().QueryRow(st.Rebind( + "SELECT content_hash FROM attachments WHERE message_id = ?", + ), messageID).Scan(&placeholderHash) + require.NoError(err, "select synthetic placeholder") + assertSyntheticAttachmentKey(t, placeholderHash) + + png, err := os.ReadFile("testdata/json_with_media/your_activity_across_facebook/messages/inbox/bob_XYZ789/photos/tiny.png") + require.NoError(err) + wantHash := fmt.Sprintf("%x", sha256.Sum256(png)) + require.NoError(st.UpsertAttachment(messageID, "tiny.png", "image/png", "", wantHash, 0), "seed stale real-hash empty-path attachment") + + _, err = ImportDYI(context.Background(), st, opts) + require.NoError(err) + + var count int + err = st.DB().QueryRow(st.Rebind("SELECT COUNT(*) FROM attachments WHERE message_id = ?"), messageID).Scan(&count) + require.NoError(err, "count attachments") + require.Equal(1, count, "hashless reimport should keep only the synthetic placeholder") + + var contentHash, storagePath string + err = st.DB().QueryRow(st.Rebind( + "SELECT content_hash, storage_path FROM attachments WHERE message_id = ?", + ), messageID).Scan(&contentHash, &storagePath) + require.NoError(err, "select placeholder") + assert.Equal(placeholderHash, contentHash, "content_hash") + assert.Empty(storagePath, "storage_path") +} + func TestImportDYI_AttachmentStorageReimportRepairsRealHashEmptyPathRow(t *testing.T) { require := requirepkg.New(t) assert := assertpkg.New(t) From 3e5fc2b53051342aa04261a2b31ad5d9730be313 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 25 Jun 2026 08:08:15 -0500 Subject: [PATCH 10/11] fix(fbmessenger): skip placeholders for stored attachments A Messenger reimport can compute the real content hash before failing to write to the current attachment store. If that same content was already stored by an earlier successful import, falling back to the synthetic placeholder creates a duplicate row beside valid stored content. Check for an existing non-empty storage_path row for the computed hash before inserting a placeholder. When stored content already exists, keep it as the attachment identity and remove any stale hashless or synthetic placeholder rows for that Messenger attachment. Validation: added a regression where a successful stored import is followed by a storage-failing reimport; it failed with two rows before the stored-row guard and passed after skipping the placeholder. Generated with Codex (GPT-5) Co-authored-by: Codex --- internal/fbmessenger/importer.go | 28 +++++++++++++++++ internal/fbmessenger/importer_test.go | 44 +++++++++++++++++++++++++++ 2 files changed, 72 insertions(+) diff --git a/internal/fbmessenger/importer.go b/internal/fbmessenger/importer.go index 5dafbc482..05ae2cf1f 100644 --- a/internal/fbmessenger/importer.go +++ b/internal/fbmessenger/importer.go @@ -700,6 +700,20 @@ func writeThreadToStore( syntheticHash := fbAttachmentHash(att, ai) hash := contentHash storedContent := storagePath != "" && contentHash != "" + if !storedContent && contentHash != "" { + stored, err := hasStoredAttachment(st, messageID, contentHash) + if err != nil { + logger.Warn("fbmessenger: check stored attachment", "err", err) + } else if stored { + if err := deleteLegacyHashlessAttachment(st, messageID); err != nil { + logger.Warn("fbmessenger: delete legacy hashless attachment", "err", err) + } + if err := deleteSyntheticPlaceholderAttachment(st, messageID, syntheticHash); err != nil { + logger.Warn("fbmessenger: delete synthetic attachment placeholder", "err", err) + } + continue + } + } if !storedContent { hash = syntheticHash } else if err := deleteFailedStoredAttachment(st, messageID, contentHash); err != nil { @@ -922,6 +936,20 @@ func deleteFailedStoredAttachmentByMetadata(st *store.Store, messageID int64, fi return err } +func hasStoredAttachment(st *store.Store, messageID int64, contentHash string) (bool, error) { + var stored bool + err := st.DB().QueryRow(st.Rebind(` + SELECT EXISTS ( + SELECT 1 + FROM attachments + WHERE message_id = ? + AND content_hash = ? + AND storage_path <> '' + ) + `), messageID, contentHash).Scan(&stored) + return stored, err +} + // fbAttachmentHash derives a stable synthetic attachment key for an attachment // that has no real (content-derived) hash — e.g. the file is missing or no // attachments dir was configured. Without it, UpsertAttachment collapses every diff --git a/internal/fbmessenger/importer_test.go b/internal/fbmessenger/importer_test.go index 828bf456a..80ec35b8e 100644 --- a/internal/fbmessenger/importer_test.go +++ b/internal/fbmessenger/importer_test.go @@ -494,6 +494,50 @@ func TestImportDYI_AttachmentStorageHashlessReimportRemovesStaleRealHashRow(t *t assert.Empty(storagePath, "storage_path") } +func TestImportDYI_AttachmentStorageFailureDoesNotAddPlaceholderBesideStoredAttachment(t *testing.T) { + require := requirepkg.New(t) + assert := assertpkg.New(t) + st := testutil.NewTestStore(t) + attachDir := t.TempDir() + opts := ImportOptions{ + Me: "test.user@facebook.messenger", + RootDir: "testdata/json_with_media", + Format: formatAuto, + AttachmentsDir: attachDir, + NoResume: true, + } + _, err := ImportDYI(context.Background(), st, opts) + require.NoError(err) + + var messageID int64 + err = st.DB().QueryRow("SELECT id FROM messages WHERE source_message_id = 'inbox/bob_XYZ789__0'").Scan(&messageID) + require.NoError(err, "select imported message id") + png, err := os.ReadFile("testdata/json_with_media/your_activity_across_facebook/messages/inbox/bob_XYZ789/photos/tiny.png") + require.NoError(err) + wantHash := fmt.Sprintf("%x", sha256.Sum256(png)) + + badAttachRoot := filepath.Join(t.TempDir(), "attachments-file") + require.NoError(os.WriteFile(badAttachRoot, []byte("not a directory"), 0600), "write bad attachment root") + opts.AttachmentsDir = badAttachRoot + _, err = ImportDYI(context.Background(), st, opts) + require.NoError(err) + + var count int + err = st.DB().QueryRow(st.Rebind("SELECT COUNT(*) FROM attachments WHERE message_id = ?"), messageID).Scan(&count) + require.NoError(err, "count attachments") + require.Equal(1, count, "failed storage should not add a placeholder beside stored content") + + var contentHash, storagePath string + var size int64 + err = st.DB().QueryRow(st.Rebind( + "SELECT content_hash, storage_path, size FROM attachments WHERE message_id = ?", + ), messageID).Scan(&contentHash, &storagePath, &size) + require.NoError(err, "select stored attachment") + assert.Equal(wantHash, contentHash, "content_hash") + assert.NotEmpty(storagePath, "storage_path") + assert.Equal(int64(len(png)), size, "size") +} + func TestImportDYI_AttachmentStorageReimportRepairsRealHashEmptyPathRow(t *testing.T) { require := requirepkg.New(t) assert := assertpkg.New(t) From 8da80c1500d3299afab773763991683c360a3bf2 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 25 Jun 2026 08:43:03 -0500 Subject: [PATCH 11/11] fix(fbmessenger): detect stored attachments without storage dir A Messenger reimport with AttachmentsDir unset can still resolve the source media path, but handleAttachment intentionally returns no content_hash because no bytes were stored in this run. That made the existing stored-row check blind to content already imported by an earlier successful run and allowed a synthetic placeholder beside valid stored content. Hash the source attachment only for existing-stored-row detection when the current import did not produce stored content. The importer still writes synthetic keys for unstored attachments unless a stored row for the same source bytes already exists. Validation: added a stored-import then no-attachments-dir reimport regression; it failed with two rows before source-hash detection and passed after using the source hash to skip the placeholder. Generated with Codex (GPT-5) Co-authored-by: Codex --- internal/fbmessenger/importer.go | 32 ++++++++++++++++++-- internal/fbmessenger/importer_test.go | 42 +++++++++++++++++++++++++++ 2 files changed, 72 insertions(+), 2 deletions(-) diff --git a/internal/fbmessenger/importer.go b/internal/fbmessenger/importer.go index 05ae2cf1f..f48382920 100644 --- a/internal/fbmessenger/importer.go +++ b/internal/fbmessenger/importer.go @@ -700,8 +700,12 @@ func writeThreadToStore( syntheticHash := fbAttachmentHash(att, ai) hash := contentHash storedContent := storagePath != "" && contentHash != "" - if !storedContent && contentHash != "" { - stored, err := hasStoredAttachment(st, messageID, contentHash) + existingContentHash := contentHash + if !storedContent && existingContentHash == "" { + existingContentHash = hashAttachmentSource(att) + } + if !storedContent && existingContentHash != "" { + stored, err := hasStoredAttachment(st, messageID, existingContentHash) if err != nil { logger.Warn("fbmessenger: check stored attachment", "err", err) } else if stored { @@ -891,6 +895,30 @@ func handleAttachment(att Attachment, attachmentsDir string) (string, string, in return rel, contentHash, int(info.Size()) } +func hashAttachmentSource(att Attachment) string { + if att.AbsPath == "" { + return "" + } + linfo, err := os.Lstat(att.AbsPath) + if err != nil || !linfo.Mode().IsRegular() { + return "" + } + f, err := os.Open(att.AbsPath) + if err != nil { + return "" + } + defer func() { _ = f.Close() }() + info, err := f.Stat() + if err != nil || !info.Mode().IsRegular() { + return "" + } + h := sha256.New() + if _, err := io.Copy(h, f); err != nil { + return "" + } + return hex.EncodeToString(h.Sum(nil)) +} + const fbSyntheticAttachmentKeyPrefix = "fbmessenger:attachment:" func deleteLegacyHashlessAttachment(st *store.Store, messageID int64) error { diff --git a/internal/fbmessenger/importer_test.go b/internal/fbmessenger/importer_test.go index 80ec35b8e..1ba44c065 100644 --- a/internal/fbmessenger/importer_test.go +++ b/internal/fbmessenger/importer_test.go @@ -538,6 +538,48 @@ func TestImportDYI_AttachmentStorageFailureDoesNotAddPlaceholderBesideStoredAtta assert.Equal(int64(len(png)), size, "size") } +func TestImportDYI_AttachmentStorageHashlessReimportDoesNotAddPlaceholderBesideStoredAttachment(t *testing.T) { + require := requirepkg.New(t) + assert := assertpkg.New(t) + st := testutil.NewTestStore(t) + attachDir := t.TempDir() + opts := ImportOptions{ + Me: "test.user@facebook.messenger", + RootDir: "testdata/json_with_media", + Format: formatAuto, + AttachmentsDir: attachDir, + NoResume: true, + } + _, err := ImportDYI(context.Background(), st, opts) + require.NoError(err) + + var messageID int64 + err = st.DB().QueryRow("SELECT id FROM messages WHERE source_message_id = 'inbox/bob_XYZ789__0'").Scan(&messageID) + require.NoError(err, "select imported message id") + png, err := os.ReadFile("testdata/json_with_media/your_activity_across_facebook/messages/inbox/bob_XYZ789/photos/tiny.png") + require.NoError(err) + wantHash := fmt.Sprintf("%x", sha256.Sum256(png)) + + opts.AttachmentsDir = "" + _, err = ImportDYI(context.Background(), st, opts) + require.NoError(err) + + var count int + err = st.DB().QueryRow(st.Rebind("SELECT COUNT(*) FROM attachments WHERE message_id = ?"), messageID).Scan(&count) + require.NoError(err, "count attachments") + require.Equal(1, count, "hashless reimport should not add a placeholder beside stored content") + + var contentHash, storagePath string + var size int64 + err = st.DB().QueryRow(st.Rebind( + "SELECT content_hash, storage_path, size FROM attachments WHERE message_id = ?", + ), messageID).Scan(&contentHash, &storagePath, &size) + require.NoError(err, "select stored attachment") + assert.Equal(wantHash, contentHash, "content_hash") + assert.NotEmpty(storagePath, "storage_path") + assert.Equal(int64(len(png)), size, "size") +} + func TestImportDYI_AttachmentStorageReimportRepairsRealHashEmptyPathRow(t *testing.T) { require := requirepkg.New(t) assert := assertpkg.New(t)