diff --git a/Cargo.lock b/Cargo.lock index 0cd519b..1b70ab2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2150,6 +2150,7 @@ dependencies = [ "rusqlite", "serde", "serde_json", + "sha2", "ureq", ] diff --git a/Cargo.toml b/Cargo.toml index bfb9176..dcd05af 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,6 +19,7 @@ serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" ftm-types = "0.4" rusqlite = { version = "0.31", features = ["bundled"] } +sha2 = "0.10" duckdb = { version = "1.10502", features = ["bundled"] } ureq = { version = "3", features = ["json"] } diff --git a/src/main.rs b/src/main.rs index 765c562..a951b11 100644 --- a/src/main.rs +++ b/src/main.rs @@ -22,6 +22,7 @@ use ratatui::{ }; use rusqlite::{Connection, params}; use serde::{Deserialize, Serialize}; +use sha2::{Digest, Sha256}; use std::{io, path::PathBuf, rc::Rc, time::Instant}; use ftm_types::generated::entities::{Email as FtmEmail, Folder as FtmFolder}; @@ -1295,6 +1296,7 @@ fn create_export_schema(conn: &Connection) -> Result<(), rusqlite::Error> { id INTEGER PRIMARY KEY, folder_id INTEGER NOT NULL REFERENCES folders(id), message_class TEXT NOT NULL, + message_hash TEXT NOT NULL UNIQUE, subject TEXT, sender TEXT, to_recipients TEXT, @@ -1335,6 +1337,36 @@ fn filetime_to_iso(ticks: i64) -> Option { } } +fn message_hash( + sender: Option<&str>, + subject: Option<&str>, + submit_time: Option<&str>, + to_recipients: Option<&str>, + body_text: Option<&str>, + body_html: Option<&str>, + body_rtf: Option<&[u8]>, +) -> String { + let mut h = Sha256::new(); + for field in [ + sender.unwrap_or("").as_bytes(), + b"\x00", + subject.unwrap_or("").as_bytes(), + b"\x00", + submit_time.unwrap_or("").as_bytes(), + b"\x00", + to_recipients.unwrap_or("").as_bytes(), + b"\x00", + body_text.unwrap_or("").as_bytes(), + b"\x00", + body_html.unwrap_or("").as_bytes(), + b"\x00", + body_rtf.unwrap_or(b""), + ] { + h.update(field); + } + format!("{:x}", h.finalize()) +} + fn export_folder( store: Rc, folder: 
&UnicodeFolder, @@ -1440,14 +1472,25 @@ fn export_folder( }) .unwrap_or(0); - conn.execute( - "INSERT INTO messages (folder_id, message_class, subject, sender, + let hash = message_hash( + sender.as_deref(), + subject.as_deref(), + submit_time.as_deref(), + to_recipients.as_deref(), + body_text.as_deref(), + body_html.as_deref(), + body_rtf.as_deref(), + ); + + let inserted = conn.execute( + "INSERT OR IGNORE INTO messages (folder_id, message_class, message_hash, subject, sender, to_recipients, cc_recipients, submit_time, delivery_time, body_text, body_html, body_rtf, attachment_count) - VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10, ?11, ?12)", + VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10, ?11, ?12, ?13)", params![ folder_id, &message_class, + &hash, &subject, &sender, &to_recipients, @@ -1460,7 +1503,9 @@ fn export_folder( attachment_count, ], )?; - counts.1 += 1; + if inserted > 0 { + counts.1 += 1; + } } } @@ -1557,6 +1602,7 @@ fn create_duckdb_schema(conn: &DuckConnection) -> Result<(), duckdb::Error> { id BIGINT PRIMARY KEY DEFAULT nextval('messages_id_seq'), folder_id BIGINT NOT NULL, message_class VARCHAR NOT NULL, + message_hash VARCHAR NOT NULL UNIQUE, subject VARCHAR, sender VARCHAR, to_recipients VARCHAR, @@ -1693,14 +1739,25 @@ fn export_folder_duckdb( }) .unwrap_or(0); - conn.execute( - "INSERT INTO messages (folder_id, message_class, subject, sender, + let hash = message_hash( + sender.as_deref(), + subject.as_deref(), + submit_time.as_deref(), + to_recipients.as_deref(), + body_text.as_deref(), + body_html.as_deref(), + body_rtf.as_deref(), + ); + + let inserted = conn.execute( + "INSERT OR IGNORE INTO messages (folder_id, message_class, message_hash, subject, sender, to_recipients, cc_recipients, submit_time, delivery_time, body_text, body_html, body_rtf, attachment_count) - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", duckdb::params![ folder_id, &message_class, + &hash, &subject, &sender, 
// ── duplicate message_hash tests ─────────────────────────────────────────

/// SQLite: inserting a second row with an already-present `message_hash` via
/// `INSERT OR IGNORE` must affect zero rows and leave the first row untouched.
#[test]
fn test_export_sqlite_duplicate_hash_skipped() {
    const HASH: &str = "deadbeef";

    let db_path = std::env::temp_dir().join("pstexplorer_test_export_dedup.db");
    // Start from a clean slate in case an earlier run left the file behind.
    let _ = std::fs::remove_file(&db_path);

    let conn = Connection::open(&db_path).unwrap();
    create_export_schema(&conn).unwrap();

    conn.execute(
        "INSERT INTO folders (parent_id, name, path) VALUES (NULL, 'Inbox', 'Inbox')",
        [],
    )
    .unwrap();
    let inbox_id = conn.last_insert_rowid();

    // Returns the number of rows the statement actually changed.
    let try_insert = |hash: &str, subject: &str| {
        conn.execute(
            "INSERT OR IGNORE INTO messages
                (folder_id, message_class, message_hash, subject, attachment_count)
             VALUES (?1, 'IPM.NOTE', ?2, ?3, 0)",
            params![inbox_id, hash, subject],
        )
        .unwrap()
    };

    let _ = try_insert(HASH, "Original");
    let second_rows = try_insert(HASH, "Duplicate with same hash");
    assert_eq!(second_rows, 0, "duplicate insert should be a no-op");

    let row_total: i64 = conn
        .query_row("SELECT COUNT(*) FROM messages", [], |row| row.get(0))
        .unwrap();
    assert_eq!(row_total, 1, "only one row should exist after duplicate insert");

    let kept_subject: String = conn
        .query_row("SELECT subject FROM messages", [], |row| row.get(0))
        .unwrap();
    assert_eq!(kept_subject, "Original", "first message should be kept");

    let _ = std::fs::remove_file(&db_path);
}

/// DuckDB: same contract as the SQLite test — a repeated `message_hash` is
/// ignored by `INSERT OR IGNORE` and the originally stored row survives.
#[test]
fn test_export_duckdb_duplicate_hash_skipped() {
    const HASH: &str = "deadbeef";

    let db_path = std::env::temp_dir().join("pstexplorer_test_export_dedup.duckdb");
    // Start from a clean slate in case an earlier run left the file behind.
    let _ = std::fs::remove_file(&db_path);

    let conn = DuckConnection::open(&db_path).unwrap();
    create_duckdb_schema(&conn).unwrap();

    // The folder id is read back through RETURNING.
    let inbox_id: i64 = conn
        .query_row(
            "INSERT INTO folders (parent_id, name, path) VALUES (NULL, 'Inbox', 'Inbox') RETURNING id",
            [],
            |row| row.get(0),
        )
        .unwrap();

    // Returns the number of rows the statement actually changed.
    let try_insert = |hash: &str, subject: &str| -> usize {
        conn.execute(
            "INSERT OR IGNORE INTO messages
                (folder_id, message_class, message_hash, subject, attachment_count)
             VALUES (?, 'IPM.NOTE', ?, ?, 0)",
            duckdb::params![inbox_id, hash, subject],
        )
        .unwrap()
    };

    let _ = try_insert(HASH, "Original");
    let second_rows = try_insert(HASH, "Duplicate with same hash");
    assert_eq!(second_rows, 0, "duplicate insert should be a no-op");

    let row_total: i64 = conn
        .query_row("SELECT COUNT(*) FROM messages", [], |row| row.get(0))
        .unwrap();
    assert_eq!(row_total, 1, "only one row should exist after duplicate insert");

    let kept_subject: String = conn
        .query_row("SELECT subject FROM messages", [], |row| row.get(0))
        .unwrap();
    assert_eq!(kept_subject, "Original", "first message should be kept");

    let _ = std::fs::remove_file(&db_path);
}