diff --git a/.gitignore b/.gitignore index 2cecb9ff9..cd4d8ce50 100644 --- a/.gitignore +++ b/.gitignore @@ -60,3 +60,4 @@ docs/.zensical-build.* docs/assets/static/ docs/assets/generated/ docs/screenshots/demo-data/ +.kata.local.toml diff --git a/.kata.toml b/.kata.toml new file mode 100644 index 000000000..7ddafd6bf --- /dev/null +++ b/.kata.toml @@ -0,0 +1,4 @@ +version = 1 + +[project] +name = "msgvault" diff --git a/CLAUDE.md b/CLAUDE.md index a6e4ae13a..779ba4ff4 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -54,8 +54,7 @@ make lint # Run linter ./msgvault sync-incremental you@gmail.com # Incremental sync # TUI and analytics -./msgvault tui # Launch TUI -./msgvault tui --account you@gmail.com # Filter by account +./msgvault tui # Launch TUI (press 'a' inside to filter by account) ./msgvault tui --local # Force local (override remote config) ./msgvault build-cache # Build Parquet cache ./msgvault build-cache --full-rebuild # Full rebuild @@ -68,6 +67,11 @@ make lint # Run linter ./msgvault import-emlx --account me@gmail.com # Specific account(s) ./msgvault import-emlx /path/to/dir --identifier me@gmail.com # Manual fallback +# Microsoft Teams (delegated Graph) +./msgvault add-teams you@tenant.com # Authorize Teams (browser OAuth) +./msgvault sync-teams you@tenant.com # Sync Teams chats + channels +./msgvault sync-teams you@tenant.com --no-channels --limit 50 + # Daemon mode (NAS/server deployment) ./msgvault serve # Start HTTP API + scheduled syncs diff --git a/cmd/msgvault/cmd/add_teams.go b/cmd/msgvault/cmd/add_teams.go new file mode 100644 index 000000000..f03ca1bf1 --- /dev/null +++ b/cmd/msgvault/cmd/add_teams.go @@ -0,0 +1,104 @@ +package cmd + +import ( + "errors" + "fmt" + + "github.com/spf13/cobra" + "go.kenn.io/msgvault/internal/microsoft" + "go.kenn.io/msgvault/internal/store" +) + +var ( + teamsTenantID string + noDefaultIdentityAddTeams bool +) + +var addTeamsCmd = &cobra.Command{ + Use: "add-teams ", + Short: "Authorize Microsoft Teams (delegated 
Graph) for an account", + Long: `Authorize a Microsoft Teams account using OAuth2 (delegated Graph API). + +This opens a browser for Microsoft authorization, then stores the token for +Teams message ingestion. + +Requires a [microsoft] section in config.toml with your Azure AD app's client_id. +See the docs for Azure AD app registration setup. + +Examples: + msgvault add-teams user@company.com + msgvault add-teams user@company.com --tenant my-tenant-id`, + Args: cobra.ExactArgs(1), + RunE: func(cmd *cobra.Command, args []string) error { + email := args[0] + + if cfg.Microsoft.ClientID == "" { + return errors.New("microsoft OAuth not configured\n\n" + + "Add to your config.toml:\n\n" + + " [microsoft]\n" + + " client_id = \"your-azure-app-client-id\"\n\n" + + "See docs for Azure AD app registration setup") + } + + tenantID := cfg.Microsoft.EffectiveTenantID() + if teamsTenantID != "" { + tenantID = teamsTenantID + } + + mgr := microsoft.NewGraphManager( + cfg.Microsoft.ClientID, + tenantID, + cfg.TokensDir(), + logger, + ) + + fmt.Printf("Authorizing %s with Microsoft Teams...\n", email) + if err := mgr.Authorize(cmd.Context(), email); err != nil { + return fmt.Errorf("authorize Teams: %w", err) + } + + dbPath := cfg.DatabaseDSN() + s, err := store.Open(dbPath) + if err != nil { + return fmt.Errorf("open database: %w", err) + } + defer func() { _ = s.Close() }() + + if err := s.InitSchema(); err != nil { + return fmt.Errorf("init schema: %w", err) + } + if err := runStartupMigrationsForIngest(s); err != nil { + return fmt.Errorf("startup migrations: %w", err) + } + + source, err := s.GetOrCreateSource(sourceTypeTeams, email) + if err != nil { + return fmt.Errorf("create source: %w", err) + } + if err := s.UpdateSourceDisplayName(source.ID, email); err != nil { + return fmt.Errorf("set display name: %w", err) + } + + if !noDefaultIdentityAddTeams { + confirmDefaultIdentity(cmd.OutOrStdout(), s, source.ID, email, email, "account-identifier") + } + if err := 
runPostSourceCreateMigrations(s); err != nil { + return fmt.Errorf("post-source-create migrations: %w", err) + } + + fmt.Printf("\nMicrosoft Teams account authorized successfully!\n") + fmt.Printf(" Email: %s\n", email) + fmt.Println() + fmt.Println("You can now run:") + fmt.Printf(" msgvault sync-teams %s\n", email) + + return nil + }, +} + +func init() { + addTeamsCmd.Flags().StringVar(&teamsTenantID, "tenant", "", + "Azure AD tenant ID (default: \"common\" for multi-tenant)") + addTeamsCmd.Flags().BoolVar(&noDefaultIdentityAddTeams, "no-default-identity", false, noDefaultIdentityHelp) + rootCmd.AddCommand(addTeamsCmd) +} diff --git a/cmd/msgvault/cmd/backfill_teams_media.go b/cmd/msgvault/cmd/backfill_teams_media.go new file mode 100644 index 000000000..c685fdcf3 --- /dev/null +++ b/cmd/msgvault/cmd/backfill_teams_media.go @@ -0,0 +1,128 @@ +package cmd + +import ( + "context" + "errors" + "fmt" + "os" + "os/signal" + "syscall" + "time" + + "github.com/spf13/cobra" + "go.kenn.io/msgvault/internal/microsoft" + "go.kenn.io/msgvault/internal/store" + "go.kenn.io/msgvault/internal/teams" +) + +var backfillTeamsMediaOnlyIncomplete bool + +var backfillTeamsMediaCmd = &cobra.Command{ + Use: "backfill-teams-media ", + Short: "Re-fetch Teams inline media (hostedContents) for already-imported messages", + Long: `Re-fetch Microsoft Teams inline media (hostedContents) for messages that +were already imported but whose inline images were never downloaded. + +This targets ONLY messages whose stored HTML body contains a hostedContents +URL, instead of re-walking every message. It is idempotent: content-addressed +storage dedupes, so it is safe to re-run. + +Use --only-incomplete to retry just the messages whose inline media is still +missing (e.g. after transient fetch failures), instead of re-fetching all. 
+ +Examples: + msgvault backfill-teams-media user@company.com + msgvault backfill-teams-media user@company.com --only-incomplete`, + Args: cobra.ExactArgs(1), + RunE: func(cmd *cobra.Command, args []string) error { + email := args[0] + + dbPath := cfg.DatabaseDSN() + s, err := store.Open(dbPath) + if err != nil { + return fmt.Errorf("open database: %w", err) + } + defer func() { _ = s.Close() }() + + if err := s.InitSchema(); err != nil { + return fmt.Errorf("init schema: %w", err) + } + if err := runStartupMigrationsForIngest(s); err != nil { + return fmt.Errorf("startup migrations: %w", err) + } + + if cfg.Microsoft.ClientID == "" { + return errors.New("microsoft OAuth not configured\n\n" + + "Add to your config.toml:\n\n" + + " [microsoft]\n" + + " client_id = \"your-azure-app-client-id\"\n\n" + + "See docs for Azure AD app registration setup") + } + + mgr := microsoft.NewGraphManager( + cfg.Microsoft.ClientID, + cfg.Microsoft.EffectiveTenantID(), + cfg.TokensDir(), + logger, + ) + tokenFn, err := mgr.TokenSource(cmd.Context(), email) + if err != nil { + return fmt.Errorf("load Teams token: %w (run 'add-teams' first)", err) + } + + ctx, cancel := context.WithCancel(cmd.Context()) + defer cancel() + + sigChan := make(chan os.Signal, 1) + signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM) + defer signal.Stop(sigChan) + go func() { + select { + case <-sigChan: + _, _ = fmt.Fprintln(cmd.ErrOrStderr(), "\nInterrupted. 
Stopping...") + cancel() + case <-ctx.Done(): + } + }() + + qps := float64(cfg.Sync.RateLimitQPS) + if qps <= 0 { + qps = 5 + } + client := teams.NewClient("https://graph.microsoft.com/v1.0", teams.TokenFunc(tokenFn), qps) + imp := teams.NewImporter(s, client) + + _, _ = fmt.Fprintf(cmd.OutOrStdout(), "Backfilling Teams inline media for %s\n\n", email) + + sum, err := imp.BackfillInlineMedia(ctx, teams.ImportOptions{ + Email: email, + AttachmentsDir: cfg.AttachmentsDir(), + OnlyIncomplete: backfillTeamsMediaOnlyIncomplete, + Progress: func(s string) { fmt.Println(s) }, + }) + if ctx.Err() != nil { + _, _ = fmt.Fprintln(cmd.OutOrStdout(), "\nInterrupted — re-run backfill-teams-media to resume (idempotent).") + rebuildCacheAfterWrite(dbPath) + return nil + } + if err != nil { + return fmt.Errorf("teams inline-media backfill failed: %w", err) + } + + _, _ = fmt.Fprintln(cmd.OutOrStdout()) + _, _ = fmt.Fprintln(cmd.OutOrStdout(), "Teams inline-media backfill complete!") + _, _ = fmt.Fprintf(cmd.OutOrStdout(), " Duration: %s\n", sum.Duration.Round(time.Second)) + _, _ = fmt.Fprintf(cmd.OutOrStdout(), " Messages processed: %d\n", sum.MessagesProcessed) + _, _ = fmt.Fprintf(cmd.OutOrStdout(), " Inline images copied:%d\n", sum.InlineImagesCopied) + _, _ = fmt.Fprintf(cmd.OutOrStdout(), " Errors: %d\n", sum.Errors) + + rebuildCacheAfterWrite(dbPath) + return nil + }, +} + +func init() { + backfillTeamsMediaCmd.Flags().BoolVar(&backfillTeamsMediaOnlyIncomplete, "only-incomplete", false, + "retry only messages whose inline media is still missing (e.g. 
after transient failures)") + rootCmd.AddCommand(backfillTeamsMediaCmd) +} diff --git a/cmd/msgvault/cmd/constants.go b/cmd/msgvault/cmd/constants.go index ebcca23bb..1de16cf5d 100644 --- a/cmd/msgvault/cmd/constants.go +++ b/cmd/msgvault/cmd/constants.go @@ -6,6 +6,7 @@ const ( sourceTypeGmail = "gmail" sourceTypeIMAP = "imap" sourceTypeMbox = "mbox" + sourceTypeTeams = "teams" ) // Analytics dataset / SQLite table names: the Parquet subdirectory under diff --git a/cmd/msgvault/cmd/quickstart.md b/cmd/msgvault/cmd/quickstart.md index a81990985..1f6bd77e4 100644 --- a/cmd/msgvault/cmd/quickstart.md +++ b/cmd/msgvault/cmd/quickstart.md @@ -146,7 +146,7 @@ msgvault search "quarterly report" --limit 100 --offset 50 | `newer_than:` | Relative date | `newer_than:7d` | | `larger:` | Minimum size | `larger:10M` | | `smaller:` | Maximum size | `smaller:100K` | -| `message_type:` | Message type | `message_type:sms` | +| `message_type:` | Message type | `message_type:teams` | Bare words and `"quoted phrases"` perform full-text search across subject and body. 
@@ -267,11 +267,9 @@ msgvault version ```bash # Launch the TUI (auto-builds analytics cache if needed) +# Press 'a' inside the TUI to filter by account msgvault tui -# Filter by account -msgvault tui --account user@gmail.com - # Force local database (override remote config) msgvault tui --local ``` diff --git a/cmd/msgvault/cmd/remove_account.go b/cmd/msgvault/cmd/remove_account.go index 885d2a482..8a2620694 100644 --- a/cmd/msgvault/cmd/remove_account.go +++ b/cmd/msgvault/cmd/remove_account.go @@ -163,6 +163,18 @@ func runRemoveAccount(cmd *cobra.Command, args []string) error { tokenPath, err, ) } + case sourceTypeTeams: + graphMgr := microsoft.NewGraphManager( + cfg.Microsoft.ClientID, + cfg.Microsoft.EffectiveTenantID(), + cfg.TokensDir(), + logger, + ) + if err := graphMgr.DeleteToken(source.Identifier); err != nil { + fmt.Fprintf(os.Stderr, + "Warning: could not remove Microsoft Graph token: %v\n", err, + ) + } case sourceTypeIMAP: if source.SyncConfig.Valid && source.SyncConfig.String != "" { imapCfg, parseErr := imaplib.ConfigFromJSON(source.SyncConfig.String) diff --git a/cmd/msgvault/cmd/remove_account_test.go b/cmd/msgvault/cmd/remove_account_test.go index bdf6754b9..a02c8cd9f 100644 --- a/cmd/msgvault/cmd/remove_account_test.go +++ b/cmd/msgvault/cmd/remove_account_test.go @@ -8,6 +8,7 @@ import ( assertpkg "github.com/stretchr/testify/assert" requirepkg "github.com/stretchr/testify/require" "go.kenn.io/msgvault/internal/config" + "go.kenn.io/msgvault/internal/microsoft" "go.kenn.io/msgvault/internal/oauth" "go.kenn.io/msgvault/internal/store" ) @@ -412,6 +413,47 @@ func TestRemoveAccountCmd_GmailRemovesToken(t *testing.T) { assertpkg.True(t, os.IsNotExist(err), "token file should be removed for gmail source") } +func TestRemoveAccountCmd_TeamsRemovesGraphToken(t *testing.T) { + require := requirepkg.New(t) + tmpDir := t.TempDir() + dbPath := tmpDir + "/msgvault.db" + tokensDir := filepath.Join(tmpDir, "tokens") + require.NoError(os.MkdirAll(tokensDir, 
0700), "mkdir tokens") + + s, err := store.Open(dbPath) + require.NoError(err, "open store") + require.NoError(s.InitSchema(), "init schema") + _, err = s.GetOrCreateSource("teams", "tok@example.com") + require.NoError(err, "create source") + _ = s.Close() + + mgr := microsoft.NewGraphManager("client-id", "", tokensDir, nil) + tokenPath := mgr.TokenPath("tok@example.com") + require.NoError(os.WriteFile(tokenPath, []byte(`{}`), 0600), "write teams token") + + savedCfg := cfg + defer func() { cfg = savedCfg }() + + cfg = &config.Config{ + HomeDir: tmpDir, + Data: config.DataConfig{DataDir: tmpDir}, + Microsoft: config.MicrosoftConfig{ + ClientID: "client-id", + }, + } + + root := newTestRootCmd() + root.AddCommand(newRemoveAccountCmd()) + root.SetArgs([]string{ + "remove-account", "tok@example.com", "--yes", "--type", "teams", + }) + + require.NoError(root.Execute(), "remove-account") + + _, err = os.Stat(tokenPath) + assertpkg.True(t, os.IsNotExist(err), "Graph token file should be removed for teams source") +} + func TestRemoveAccountCmd_NonGmailSkipsToken(t *testing.T) { require := requirepkg.New(t) tmpDir := t.TempDir() diff --git a/cmd/msgvault/cmd/search.go b/cmd/msgvault/cmd/search.go index c2819cd95..48efac138 100644 --- a/cmd/msgvault/cmd/search.go +++ b/cmd/msgvault/cmd/search.go @@ -46,7 +46,7 @@ Supported operators (local mode only - remote uses simple text search): newer_than: Relative date larger: Size filter (5M, 100K) smaller: Size filter - message_type: Message type filter (sms, mms, whatsapp, email) + message_type: Message type filter (sms, mms, whatsapp, teams, email) Bare words and "quoted phrases" perform full-text search. 
diff --git a/cmd/msgvault/cmd/serve.go b/cmd/msgvault/cmd/serve.go index 29fe1f21e..5f43d1670 100644 --- a/cmd/msgvault/cmd/serve.go +++ b/cmd/msgvault/cmd/serve.go @@ -15,7 +15,9 @@ import ( "github.com/spf13/cobra" "go.kenn.io/msgvault/internal/api" + "go.kenn.io/msgvault/internal/config" "go.kenn.io/msgvault/internal/gmail" + "go.kenn.io/msgvault/internal/microsoft" "go.kenn.io/msgvault/internal/oauth" "go.kenn.io/msgvault/internal/query" "go.kenn.io/msgvault/internal/scheduler" @@ -23,6 +25,7 @@ import ( "go.kenn.io/msgvault/internal/store" "go.kenn.io/msgvault/internal/sync" "go.kenn.io/msgvault/internal/syncerr" + "go.kenn.io/msgvault/internal/teams" "golang.org/x/oauth2" ) @@ -67,7 +70,7 @@ func runServe(cmd *cobra.Command, args []string) error { } // Validate config - if !cfg.OAuth.HasAnyConfig() { + if !hasServeOAuthConfig(cfg) { return errOAuthNotConfigured() } @@ -306,6 +309,13 @@ func runServe(cmd *cobra.Command, args []string) error { return nil } +func hasServeOAuthConfig(c *config.Config) bool { + if c == nil { + return false + } + return c.OAuth.HasAnyConfig() || c.Microsoft.ClientID != "" +} + // storeAPIAdapter adapts store.Store to the API store interfaces. // Since api.APIMessage, api.StoreStats, etc. are type aliases for store types, // the adapter methods are simple pass-throughs with no conversion needed. @@ -391,65 +401,98 @@ func (a *schedulerAdapter) Status() []api.AccountStatus { return a.scheduler.Status() } -// runScheduledSync performs a sync for a scheduled account. The -// dispatch is by source_type: Gmail accounts run an incremental sync -// using the Gmail History API; IMAP accounts run a full sync (already -// deduplicated by message-id at the store layer, since IMAP has no -// equivalent history API). 
Under scan-and-fill there is no enqueue step -// — newly-ingested messages get embed_gen = NULL by column default, so -// subsequent embed runs discover and pick them up by scanning; the sync -// path therefore needs no vector-feature wiring. +// runScheduledSync performs a sync for a scheduled account. It resolves +// ALL syncable source rows for the identifier (gmail, imap, teams) and +// dispatches each in turn. When no source row matches, it falls back to +// the Gmail token-first workflow (tokens uploaded via API before the +// source row exists) so that legacy deployments keep working. +// +// Under scan-and-fill there is no enqueue step — newly-ingested messages +// get embed_gen = NULL by column default, so subsequent embed runs +// discover and pick them up by scanning; the sync path therefore needs +// no vector-feature wiring. // -// The identifier passed in is whatever the scheduler holds — for -// Gmail this is the email address, for IMAP it's the full -// `imaps://user@host:port` URL recorded by `add-imap`. +// Per-source errors are collected with errors.Join; the cache rebuild +// runs once after all sources regardless of per-source errors. +// +// The identifier passed in is whatever the scheduler holds — for Gmail +// this is the email address, for IMAP it's the full +// `imaps://user@host:port` URL recorded by `add-imap`, for Teams it is +// the UPN/email recorded by `add-teams`. func runScheduledSync(ctx context.Context, identifier string, s *store.Store, getOAuthMgr func(string) (*oauth.Manager, error)) error { logger.Info("starting scheduled sync", "identifier", identifier) - startTime := time.Now() - src, srcErr := findScheduledSyncSource(s, identifier) + srcs, srcErr := findScheduledSyncSources(s, identifier) if srcErr != nil { - return fmt.Errorf("look up source for %s: %w", identifier, srcErr) + return fmt.Errorf("look up sources for %s: %w", identifier, srcErr) } - // Source type drives dispatch. 
A nil source falls back to Gmail to - // preserve the token-first workflow (tokens uploaded via API before - // the source row exists). - sourceType := sourceTypeGmail - if src != nil { - sourceType = src.SourceType + // No source row found: fall back to the Gmail token-first workflow + // (preserves behaviour for tokens uploaded via API before the source + // row exists). + if len(srcs) == 0 { + startTime := time.Now() + summary, err := runScheduledGmailSync(ctx, identifier, nil, s, getOAuthMgr) + if err != nil { + return err + } + logger.Info("sync completed", + "identifier", identifier, + "source_type", sourceTypeGmail, + "messages_added", summary.MessagesAdded, + "duration", time.Since(startTime), + ) + rebuildCacheAfterScheduledSync(ctx, identifier) + return nil + } + + var errs []error + for _, src := range srcs { + startTime := time.Now() + sourceType := src.SourceType if sourceType == "" { sourceType = sourceTypeGmail } - } - var ( - summary *gmail.SyncSummary - err error - ) - switch sourceType { - case sourceTypeGmail: - summary, err = runScheduledGmailSync(ctx, identifier, src, s, getOAuthMgr) - case sourceTypeIMAP: - summary, err = runScheduledIMAPSync(ctx, src, s) - default: - return fmt.Errorf("source %q has type %q which is not supported by the daemon scheduler (only gmail and imap)", identifier, sourceType) - } - if err != nil { - return err - } + var ( + summary *gmail.SyncSummary + err error + ) + switch sourceType { + case sourceTypeGmail: + summary, err = runScheduledGmailSync(ctx, identifier, src, s, getOAuthMgr) + case sourceTypeIMAP: + summary, err = runScheduledIMAPSync(ctx, src, s) + case sourceTypeTeams: + err = runScheduledTeamsSync(ctx, src, s) + default: + err = fmt.Errorf("source %q has type %q which is not supported by the daemon scheduler", identifier, sourceType) + } + if err != nil { + errs = append(errs, fmt.Errorf("%s (%s): %w", identifier, sourceType, err)) + continue + } - logger.Info("sync completed", - "identifier", 
identifier, - "source_type", sourceType, - "messages_added", summary.MessagesAdded, - "duration", time.Since(startTime), - ) + if summary != nil { + logger.Info("sync completed", + "identifier", identifier, + "source_type", sourceType, + "messages_added", summary.MessagesAdded, + "duration", time.Since(startTime), + ) + } else { + logger.Info("sync completed", + "identifier", identifier, + "source_type", sourceType, + "duration", time.Since(startTime), + ) + } + } - // Rebuild cache if stale (covers new messages and deletions). + // Rebuild cache once after all sources, regardless of per-source errors. rebuildCacheAfterScheduledSync(ctx, identifier) - return nil + return errors.Join(errs...) } // findScheduledSyncSource resolves the source row for a scheduler @@ -470,6 +513,7 @@ func findScheduledSyncSource(s *store.Store, identifier string) (*store.Source, return nil, err } var imapSrc *store.Source + var teamsSrc *store.Source for _, src := range sources { switch src.SourceType { case sourceTypeGmail: @@ -478,9 +522,53 @@ func findScheduledSyncSource(s *store.Store, identifier string) (*store.Source, if imapSrc == nil { imapSrc = src } + case sourceTypeTeams: + if teamsSrc == nil { + teamsSrc = src + } } } - return imapSrc, nil + if imapSrc != nil { + return imapSrc, nil + } + return teamsSrc, nil +} + +// findScheduledSyncSources resolves ALL syncable source rows for a +// scheduler identifier. Returns at most one row per syncable type +// (gmail, imap, teams), in that stable order. Non-syncable types +// (mbox, apple-mail, etc.) are skipped. +// +// Returns a nil (zero-length) slice when no syncable source matches — +// callers should test len() == 0 and fall back to the Gmail token-first workflow. +// +// Matches against both sources.identifier and sources.display_name +// (same semantics as findScheduledSyncSource). 
+func findScheduledSyncSources(s *store.Store, identifier string) ([]*store.Source, error) { + rows, err := s.GetSourcesByIdentifierOrDisplayName(identifier) + if err != nil { + return nil, err + } + + // Collect first occurrence of each syncable type. + seen := make(map[string]*store.Source, 3) + for _, src := range rows { + switch src.SourceType { + case sourceTypeGmail, sourceTypeIMAP, sourceTypeTeams: + if _, dup := seen[src.SourceType]; !dup { + seen[src.SourceType] = src + } + } + } + + // Return in stable order: gmail, imap, teams. + var result []*store.Source + for _, t := range []string{sourceTypeGmail, sourceTypeIMAP, sourceTypeTeams} { + if src, ok := seen[t]; ok { + result = append(result, src) + } + } + return result, nil } // runScheduledGmailSync runs an incremental Gmail sync for the daemon. @@ -602,3 +690,38 @@ func runScheduledIMAPSync(ctx context.Context, src *store.Source, s *store.Store } return summary, nil } + +// runScheduledTeamsSync runs a Teams sync for the daemon. +func runScheduledTeamsSync(ctx context.Context, src *store.Source, s *store.Store) error { + email := src.Identifier + + // Seed the default identity and converge legacy migrations before + // syncing, for parity with the Gmail/IMAP daemon paths. add-teams + // already does both, so this is a no-op in the normal flow, but it + // ensures a Teams source created by another path still gets its + // "me" identity. Auto-default-identity must run BEFORE the legacy + // migration retry (see account_identity.go); serve is a daemon, so + // the confirmation message has no terminal and is discarded. 
+ confirmDefaultIdentity(io.Discard, s, src.ID, email, email, "account-identifier") + if err := runPostSourceCreateMigrations(s); err != nil { + return fmt.Errorf("post-source-create migrations: %w", err) + } + + mgr := microsoft.NewGraphManager(cfg.Microsoft.ClientID, cfg.Microsoft.EffectiveTenantID(), cfg.TokensDir(), logger) + tokenFn, err := mgr.TokenSource(ctx, email) + if err != nil { + return err + } + qps := float64(cfg.Sync.RateLimitQPS) + if qps <= 0 { + qps = 5 + } + client := teams.NewClient("https://graph.microsoft.com/v1.0", teams.TokenFunc(tokenFn), qps) + opts := teams.ImportOptions{ + Email: email, + AttachmentsDir: cfg.AttachmentsDir(), + IncludeChannels: true, + } + _, err = teams.NewImporter(s, client).Import(ctx, opts) + return err +} diff --git a/cmd/msgvault/cmd/serve_test.go b/cmd/msgvault/cmd/serve_test.go index 98f94ab9f..d979d1e5e 100644 --- a/cmd/msgvault/cmd/serve_test.go +++ b/cmd/msgvault/cmd/serve_test.go @@ -118,6 +118,16 @@ client_secrets = "/path/to/secrets.json" assertpkg.Empty(t, scheduled, "expected no scheduled accounts") } +func TestServeOAuthValidationAllowsMicrosoftOnly(t *testing.T) { + assertpkg.True(t, hasServeOAuthConfig(&config.Config{ + Microsoft: config.MicrosoftConfig{ClientID: "azure-client-id"}, + })) +} + +func TestServeOAuthValidationRejectsNoProviders(t *testing.T) { + assertpkg.False(t, hasServeOAuthConfig(&config.Config{})) +} + func TestStoreAPIAdapterServesSourceStatus(t *testing.T) { require := requirepkg.New(t) assert := assertpkg.New(t) @@ -404,6 +414,67 @@ func TestRunScheduledIMAPSync_DefaultIdentityIsDisplayName(t *testing.T) { assert.True(foundEmail, "identities = %+v, want one with Address=%q", identities, imapEmail) } +// TestFindScheduledSyncSources verifies that the plural resolver returns +// ALL syncable source types for an identifier (imap + teams together), +// only the matching type for single-type identifiers, and an empty slice +// for unknown identifiers. 
+func TestFindScheduledSyncSources(t *testing.T) { + require := requirepkg.New(t) + assert := assertpkg.New(t) + tmpDir := t.TempDir() + s, err := store.Open(filepath.Join(tmpDir, "msgvault.db")) + require.NoError(err, "open store") + defer func() { _ = s.Close() }() + require.NoError(s.InitSchema(), "init schema") + + // Unknown identifier returns a zero-length slice, enabling the + // Gmail token-first fallback in runScheduledSync. + got, err := findScheduledSyncSources(s, "missing@example.com") + require.NoError(err, "findScheduledSyncSources(missing)") + assert.Empty(got, "findScheduledSyncSources(missing) should be empty") + + // An address that has BOTH an IMAP source (display_name lookup) and + // a Teams source must return both, in stable order imap then teams. + const ( + imapID = "imaps://nat@host@imap.example.com:993" + sharedEmail = "nat@x.com" + ) + imapSrc, err := s.GetOrCreateSource("imap", imapID) + require.NoError(err, "create imap source") + require.NoError(s.UpdateSourceDisplayName(imapSrc.ID, sharedEmail), "set imap display_name") + + teamsSrc, err := s.GetOrCreateSource("teams", sharedEmail) + require.NoError(err, "create teams source") + + got, err = findScheduledSyncSources(s, sharedEmail) + require.NoError(err, "findScheduledSyncSources(imap+teams)") + require.Len(got, 2, "findScheduledSyncSources(imap+teams) should return 2 sources") + assert.Equal("imap", got[0].SourceType, "first source should be imap") + assert.Equal(imapSrc.ID, got[0].ID, "first source ID") + assert.Equal("teams", got[1].SourceType, "second source should be teams") + assert.Equal(teamsSrc.ID, got[1].ID, "second source ID") + + // A gmail-only identifier returns exactly one gmail source. 
+ const gmailAddr = "g@x.com" + gmailSrc, err := s.GetOrCreateSource("gmail", gmailAddr) + require.NoError(err, "create gmail source") + + got, err = findScheduledSyncSources(s, gmailAddr) + require.NoError(err, "findScheduledSyncSources(gmail)") + require.Len(got, 1, "findScheduledSyncSources(gmail) should return 1 source") + assert.Equal("gmail", got[0].SourceType, "source should be gmail") + assert.Equal(gmailSrc.ID, got[0].ID, "gmail source ID") + + // Non-syncable types (mbox) are ignored; returns empty. + const mboxAddr = "mbox-only@example.com" + _, err = s.GetOrCreateSource("mbox", mboxAddr) + require.NoError(err, "create mbox source") + + got, err = findScheduledSyncSources(s, mboxAddr) + require.NoError(err, "findScheduledSyncSources(mbox-only)") + assert.Empty(got, "findScheduledSyncSources(mbox-only) should be empty") +} + func TestCronExpressionValidation(t *testing.T) { tests := []struct { name string diff --git a/cmd/msgvault/cmd/show_message.go b/cmd/msgvault/cmd/show_message.go index 5dd28e991..64bc1964e 100644 --- a/cmd/msgvault/cmd/show_message.go +++ b/cmd/msgvault/cmd/show_message.go @@ -169,7 +169,11 @@ func outputMessageText(msg *query.MessageDetail) error { if len(msg.Attachments) > 0 { fmt.Println("\nAttachments:") for _, att := range msg.Attachments { - fmt.Printf(" • %s (%s, %s)\n", att.Filename, att.MimeType, formatSize(att.Size)) + if att.URL != "" { + fmt.Printf(" • %s (%s, link) %s\n", att.Filename, att.MimeType, att.URL) + } else { + fmt.Printf(" • %s (%s, %s)\n", att.Filename, att.MimeType, formatSize(att.Size)) + } } } @@ -216,6 +220,9 @@ func outputMessageJSON(msg *query.MessageDetail) error { "size": att.Size, "content_hash": att.ContentHash, } + if att.URL != "" { + attachments[i]["url"] = att.URL + } } output := map[string]any{ @@ -299,7 +306,11 @@ func outputRemoteMessageText(msg *store.APIMessage) error { if len(msg.Attachments) > 0 { fmt.Println("\nAttachments:") for _, att := range msg.Attachments { - fmt.Printf(" • %s (%s, 
%s)\n", att.Filename, att.MimeType, formatSize(att.Size)) + if att.URL != "" { + fmt.Printf(" • %s (%s, link) %s\n", att.Filename, att.MimeType, att.URL) + } else { + fmt.Printf(" • %s (%s, %s)\n", att.Filename, att.MimeType, formatSize(att.Size)) + } } } @@ -327,6 +338,9 @@ func outputRemoteMessageJSON(msg *store.APIMessage) error { "mime_type": att.MimeType, "size": att.Size, } + if att.URL != "" { + attachments[i]["url"] = att.URL + } } output := map[string]any{ diff --git a/cmd/msgvault/cmd/sync_teams.go b/cmd/msgvault/cmd/sync_teams.go new file mode 100644 index 000000000..3951bd175 --- /dev/null +++ b/cmd/msgvault/cmd/sync_teams.go @@ -0,0 +1,143 @@ +package cmd + +import ( + "context" + "errors" + "fmt" + "os" + "os/signal" + "syscall" + "time" + + "github.com/spf13/cobra" + "go.kenn.io/msgvault/internal/microsoft" + "go.kenn.io/msgvault/internal/store" + "go.kenn.io/msgvault/internal/teams" +) + +var ( + syncTeamsNoChannels bool + syncTeamsLimit int + syncTeamsFull bool +) + +var syncTeamsCmd = &cobra.Command{ + Use: "sync-teams ", + Short: "Sync Microsoft Teams chats and channels (full or incremental)", + Long: `Sync Microsoft Teams chats and channels for a configured account. + +Full or incremental sync is auto-detected based on what has already been +imported. Re-run to resume after an interruption. + +Use --full to ignore the stored cursor and re-fetch every message (e.g. to +backfill fields added by an importer upgrade). Re-fetched messages are +upserted in place, so this repairs existing rows without creating duplicates. 
+ +Examples: + msgvault sync-teams user@company.com + msgvault sync-teams user@company.com --no-channels + msgvault sync-teams user@company.com --limit 100 + msgvault sync-teams user@company.com --full`, + Args: cobra.ExactArgs(1), + RunE: func(cmd *cobra.Command, args []string) error { + email := args[0] + + dbPath := cfg.DatabaseDSN() + s, err := store.Open(dbPath) + if err != nil { + return fmt.Errorf("open database: %w", err) + } + defer func() { _ = s.Close() }() + + if err := s.InitSchema(); err != nil { + return fmt.Errorf("init schema: %w", err) + } + if err := runStartupMigrationsForIngest(s); err != nil { + return fmt.Errorf("startup migrations: %w", err) + } + + if cfg.Microsoft.ClientID == "" { + return errors.New("microsoft OAuth not configured\n\n" + + "Add to your config.toml:\n\n" + + " [microsoft]\n" + + " client_id = \"your-azure-app-client-id\"\n\n" + + "See docs for Azure AD app registration setup") + } + + mgr := microsoft.NewGraphManager( + cfg.Microsoft.ClientID, + cfg.Microsoft.EffectiveTenantID(), + cfg.TokensDir(), + logger, + ) + tokenFn, err := mgr.TokenSource(cmd.Context(), email) + if err != nil { + return fmt.Errorf("load Teams token: %w (run 'add-teams' first)", err) + } + + ctx, cancel := context.WithCancel(cmd.Context()) + defer cancel() + + sigChan := make(chan os.Signal, 1) + signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM) + defer signal.Stop(sigChan) + go func() { + select { + case <-sigChan: + _, _ = fmt.Fprintln(cmd.ErrOrStderr(), "\nInterrupted. 
Saving checkpoint...") + cancel() + case <-ctx.Done(): + } + }() + + qps := float64(cfg.Sync.RateLimitQPS) + if qps <= 0 { + qps = 5 + } + client := teams.NewClient("https://graph.microsoft.com/v1.0", teams.TokenFunc(tokenFn), qps) + imp := teams.NewImporter(s, client) + + _, _ = fmt.Fprintf(cmd.OutOrStdout(), "Syncing Microsoft Teams for %s\n\n", email) + + opts := teams.ImportOptions{ + Email: email, + AttachmentsDir: cfg.AttachmentsDir(), + IncludeChannels: !syncTeamsNoChannels, + Limit: syncTeamsLimit, + Full: syncTeamsFull, + Progress: func(s string) { fmt.Println(s) }, + } + sum, err := imp.Import(ctx, opts) + if ctx.Err() != nil { + _, _ = fmt.Fprintln(cmd.OutOrStdout(), "\nInterrupted — re-run sync-teams to resume.") + rebuildCacheAfterWrite(dbPath) + return nil + } + if err != nil { + return fmt.Errorf("teams sync failed: %w", err) + } + + _, _ = fmt.Fprintln(cmd.OutOrStdout()) + _, _ = fmt.Fprintln(cmd.OutOrStdout(), "Teams sync complete!") + _, _ = fmt.Fprintf(cmd.OutOrStdout(), " Duration: %s\n", sum.Duration.Round(time.Second)) + _, _ = fmt.Fprintf(cmd.OutOrStdout(), " Chats: %d\n", sum.ChatsProcessed) + _, _ = fmt.Fprintf(cmd.OutOrStdout(), " Channels: %d\n", sum.ChannelsProcessed) + _, _ = fmt.Fprintf(cmd.OutOrStdout(), " Messages added: %d\n", sum.MessagesAdded) + _, _ = fmt.Fprintf(cmd.OutOrStdout(), " Reactions: %d\n", sum.ReactionsAdded) + _, _ = fmt.Fprintf(cmd.OutOrStdout(), " Attachments: %d\n", sum.AttachmentsFound) + _, _ = fmt.Fprintf(cmd.OutOrStdout(), " Inline images: %d\n", sum.InlineImagesCopied) + if sum.Errors > 0 { + _, _ = fmt.Fprintf(cmd.OutOrStdout(), " Errors: %d\n", sum.Errors) + } + + rebuildCacheAfterWrite(dbPath) + return nil + }, +} + +func init() { + syncTeamsCmd.Flags().BoolVar(&syncTeamsNoChannels, "no-channels", false, "sync chats only (skip team channels)") + syncTeamsCmd.Flags().IntVar(&syncTeamsLimit, "limit", 0, "max messages per conversation (0 = no limit)") + syncTeamsCmd.Flags().BoolVar(&syncTeamsFull, "full", 
false, "ignore stored cursor and re-fetch every message (repairs/backfills existing rows in place)") + rootCmd.AddCommand(syncTeamsCmd) +} diff --git a/docs/architecture/storage.md b/docs/architecture/storage.md index 1e4fa5d3e..1dd524758 100644 --- a/docs/architecture/storage.md +++ b/docs/architecture/storage.md @@ -49,7 +49,7 @@ All message data (metadata, labels, participants, and raw MIME) lives in the con | `conversation_id` | INTEGER FK | References `conversations` | | `source_id` | INTEGER FK | References `sources` | | `source_message_id` | TEXT | Source-specific message ID | -| `message_type` | TEXT | `email`, `whatsapp`, `imessage`, `google_voice_text` | +| `message_type` | TEXT | `email`, `whatsapp`, `imessage`, `google_voice_text`, `teams` | | `sent_at` | DATETIME | Send timestamp | | `sender_id` | INTEGER FK | References `participants` | | `subject` | TEXT | Message subject | diff --git a/docs/superpowers/plans/2026-06-19-teams-ingestion.md b/docs/superpowers/plans/2026-06-19-teams-ingestion.md new file mode 100644 index 000000000..3523bda63 --- /dev/null +++ b/docs/superpowers/plans/2026-06-19-teams-ingestion.md @@ -0,0 +1,2007 @@ +# Microsoft Teams Ingestion Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Sync the signed-in user's own Microsoft Teams 1:1/group/meeting chats and channel messages into msgvault via delegated Microsoft Graph, searchable alongside Gmail/Outlook. + +**Architecture:** A new `internal/teams` package (Graph REST client + sync orchestration + message mapping) talks to Graph using a delegated OAuth token obtained through a new Graph-scoped manager in `internal/microsoft` (the existing IMAP `Manager` rejects non-IMAP tokens). 
Messages are written through the existing granular store path (`UpsertMessage` → `UpsertMessageBody` → `UpsertMessageRawWithFormat("teams_json")` → `UpsertFTS`), reusing the generic chat schema with no new core tables. Chats incrementally sync via `lastModifiedDateTime` list filtering (no delegated chat delta exists); channels via `/messages/delta`. Per-conversation cursors persist as a JSON map in `sync_runs.cursor_after` (read back via `GetLastSuccessfulSync`), mirroring Gmail's historyID and fbmessenger's resume blob. + +**Tech Stack:** Go, Microsoft Graph v1.0 REST, `golang.org/x/oauth2`, existing `internal/store` (SQLite/Postgres), testify, `net/http/httptest` for a fake Graph server. + +**Spec:** `docs/superpowers/specs/2026-06-18-teams-ingestion-design.md` (load-bearing verified 2026-06-19; transcripts are a separate spec and OUT OF SCOPE here). + +--- + +## Conventions locked in by this plan + +- `source_type` = `"teams"` (new constant `sourceTypeTeams`). +- `message_type` = `"teams"`; `raw_format` = `"teams_json"`. +- `conversation_type`: `oneOnOne` → `"direct_chat"`; `group` and `meeting` → `"group_chat"`. +- Token file: `teams_<email>.json` (distinct from IMAP's `microsoft_<email>.json`). +- Participant resolution: resolve AAD object id → `mail` via `GET /users/{id}`; if `mail` present (or sender is `emailUser` whose id IS an email) use `EnsureParticipant(email, displayName, domain)` so Teams unifies with Gmail/Outlook by email; otherwise `EnsureParticipantByIdentifier("teams", aadObjectId, displayName)`. +- Cursor model: a JSON `SyncState` map persisted via `CompleteSync(syncID, jsonString)` into `sync_runs.cursor_after`, read next run via `GetLastSuccessfulSync(sourceID).CursorAfter`. In-run resume writes the same JSON to `cursor_before` via `UpdateSyncCheckpoint`. 
+- Write path per message (from WhatsApp/fbmessenger): `UpsertMessage` → `UpsertMessageBody` → `UpsertMessageRawWithFormat(id, raw, "teams_json")` → `UpsertFTS` → (recipients/reactions/attachments as needed). FTS is ALWAYS a separate call. +- Graph base URL is injectable so tests point the client at `httptest.NewServer`. + +## File structure + +``` +internal/microsoft/graph_oauth.go # NEW: delegated Graph token manager (no IMAP scope coupling) +internal/microsoft/graph_oauth_test.go # NEW +internal/teams/ + ├── client.go # NEW: Graph REST client (base URL, token, paging, 429/Retry-After) + ├── client_test.go # NEW: httptest-backed + ├── types.go # NEW: Graph DTOs (chat, chatMessage, channel, identitySet, hostedContent, delta envelope) + ImportOptions/ImportSummary + ├── mapping.go # NEW: chatMessage -> store.Message + body/text + participants + ├── mapping_test.go # NEW + ├── participants.go # NEW: identity -> participant resolution (with /users/{id} cache) + ├── participants_test.go # NEW + ├── syncstate.go # NEW: SyncState JSON (per-conversation cursors) marshal/load + ├── syncstate_test.go # NEW + ├── importer.go # NEW: orchestration (chats + channels), persist sequence, checkpointing + └── importer_test.go # NEW: end-to-end against fake Graph server +cmd/msgvault/cmd/constants.go # MODIFY: add sourceTypeTeams +cmd/msgvault/cmd/add_teams.go # NEW: `add-account --teams` style OAuth + source create +cmd/msgvault/cmd/sync_teams.go # NEW: `sync-teams ` full/incremental +cmd/msgvault/cmd/serve.go # MODIFY: scheduler case for "teams" +internal/config/config.go # (reuse MicrosoftConfig.ClientID/TenantID; no change expected) +``` + +--- + +## Task 1: Add the `teams` source-type constant + +**Files:** +- Modify: `cmd/msgvault/cmd/constants.go` + +- [ ] **Step 1: Add the constant** + +Open `cmd/msgvault/cmd/constants.go` and add `sourceTypeTeams` alongside the existing source-type constants (the file already defines `sourceTypeGmail`, `sourceTypeIMAP`, etc. 
around lines 6-8): + +```go + sourceTypeTeams = "teams" +``` + +- [ ] **Step 2: Build** + +Run: `go build ./...` +Expected: compiles (unused constant is fine in a `const` block). + +- [ ] **Step 3: Commit** + +```bash +git add cmd/msgvault/cmd/constants.go +git commit -m "feat(teams): add teams source-type constant" +``` + +--- + +## Task 2: Delegated Graph OAuth token manager + +The existing `internal/microsoft.Manager` validates `Scopes[0]` is an IMAP scope and namespaces tokens as `microsoft_<email>.json` (`oauth.go:237-254`, `:602`). A Graph token fails that. Add a sibling manager that reuses the browser/device flow but stores `teams_<email>.json` and requests Graph scopes. + +**Files:** +- Create: `internal/microsoft/graph_oauth.go` +- Test: `internal/microsoft/graph_oauth_test.go` + +- [ ] **Step 1: Write the failing test for token path + scopes** + +```go +package microsoft + +import ( + "path/filepath" + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestGraphTokenPath(t *testing.T) { + dir := filepath.Join("tmp", "tokens") + m := &GraphManager{tokensDir: dir} + assert.Equal(t, filepath.Join(dir, "teams_user@example.com.json"), m.TokenPath("user@example.com")) +} + +func TestGraphScopes(t *testing.T) { + got := GraphScopes() + assert.Contains(t, got, "https://graph.microsoft.com/Chat.Read") + assert.Contains(t, got, "https://graph.microsoft.com/ChannelMessage.Read.All") + assert.Contains(t, got, "https://graph.microsoft.com/Team.ReadBasic.All") + assert.Contains(t, got, "https://graph.microsoft.com/Channel.ReadBasic.All") + assert.Contains(t, got, "https://graph.microsoft.com/User.Read") + assert.Contains(t, got, scopeOfflineAccess) +} +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `go test ./internal/microsoft/ -run 'TestGraph' -v` +Expected: FAIL — `GraphManager`/`GraphScopes` undefined. 
+ +- [ ] **Step 3: Implement the manager** + +Reuse the existing constants (`DefaultTenant`, `redirectPort`, `callbackPath`, `scopeOfflineAccess`) and the existing browser-flow + token-persistence helpers in `oauth.go`. Mirror `NewManager`/`Authorize`/`TokenSource` but with Graph scopes and the `teams_` prefix, and WITHOUT the IMAP `Scopes[0]` validation. + +```go +package microsoft + +import ( + "context" + "fmt" + "log/slog" + "path/filepath" + + "golang.org/x/oauth2" +) + +// Graph delegated scopes for Teams ingestion (verified live 2026-06-19). +const ( + scopeGraphChatRead = "https://graph.microsoft.com/Chat.Read" + scopeGraphChannelMessage = "https://graph.microsoft.com/ChannelMessage.Read.All" + scopeGraphTeamReadBasic = "https://graph.microsoft.com/Team.ReadBasic.All" + scopeGraphChannelBasic = "https://graph.microsoft.com/Channel.ReadBasic.All" + scopeGraphUserRead = "https://graph.microsoft.com/User.Read" + scopeGraphUserReadBasic = "https://graph.microsoft.com/User.ReadBasic.All" +) + +// GraphScopes is the delegated scope set requested for Teams. +func GraphScopes() []string { + return []string{ + scopeGraphChatRead, + scopeGraphChannelMessage, + scopeGraphTeamReadBasic, + scopeGraphChannelBasic, + scopeGraphUserRead, + scopeGraphUserReadBasic, + scopeOfflineAccess, + "openid", + scopeEmail, + } +} + +// GraphManager performs the delegated browser OAuth flow for Microsoft Graph +// and persists tokens as teams_.json, independent of the IMAP Manager. 
+type GraphManager struct { + clientID string + tenantID string + tokensDir string + logger *slog.Logger +} + +func NewGraphManager(clientID, tenantID, tokensDir string, logger *slog.Logger) *GraphManager { + if tenantID == "" { + tenantID = DefaultTenant + } + return &GraphManager{clientID: clientID, tenantID: tenantID, tokensDir: tokensDir, logger: logger} +} + +func (m *GraphManager) TokenPath(email string) string { + return filepath.Join(m.tokensDir, "teams_"+sanitizeEmail(email)+".json") +} + +func (m *GraphManager) HasToken(email string) bool { + _, err := loadTokenFile(m.TokenPath(email)) + return err == nil +} + +// Authorize runs the interactive browser auth-code flow and writes the token file. +func (m *GraphManager) Authorize(ctx context.Context, email string) error { + cfg := m.oauthConfig() + tok, err := runBrowserFlow(ctx, cfg, m.tenantID, email, m.logger) // existing helper in oauth.go + if err != nil { + return fmt.Errorf("graph authorize: %w", err) + } + return saveTokenFile(m.TokenPath(email), tokenFileFromOAuth(tok, m.tenantID, GraphScopes())) +} + +// TokenSource returns a function yielding a fresh access token (auto-refresh), +// with NO IMAP scope validation. 
+func (m *GraphManager) TokenSource(ctx context.Context, email string) (func(context.Context) (string, error), error) { + tf, err := loadTokenFile(m.TokenPath(email)) + if err != nil { + return nil, fmt.Errorf("no Teams token for %s — run 'msgvault add-account %s --teams': %w", email, email, err) + } + cfg := m.oauthConfig() + src := cfg.TokenSource(ctx, tf.toOAuthToken()) + persisting := persistingTokenSource(src, m.TokenPath(email), tf) // existing helper pattern in oauth.go + return func(ctx context.Context) (string, error) { + t, err := persisting.Token() + if err != nil { + return "", err + } + return t.AccessToken, nil + }, nil +} + +func (m *GraphManager) oauthConfig() *oauth2.Config { + return newAzureOAuthConfig(m.clientID, m.tenantID, GraphScopes()) // existing helper in oauth.go +} +``` + +> **Implementation note:** `oauth.go` already contains the browser-flow, token-file load/save, and refresh-persistence helpers used by `Manager`. Reuse them. The exact private helper names (`runBrowserFlow`, `loadTokenFile`, `saveTokenFile`, `persistingTokenSource`, `newAzureOAuthConfig`, `sanitizeEmail`, `tokenFileFromOAuth`, `toOAuthToken`) must be confirmed against `oauth.go` while implementing; if a needed helper is currently unexported-but-IMAP-specific, extract the IMAP-agnostic core into a shared private function rather than duplicating. Do NOT modify the existing IMAP `Manager` behavior. + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `go test ./internal/microsoft/ -run 'TestGraph' -v` +Expected: PASS. + +- [ ] **Step 5: Verify existing Microsoft tests still pass** + +Run: `go test ./internal/microsoft/ -v` +Expected: PASS (existing IMAP tests unaffected). + +- [ ] **Step 6: Format, vet, commit** + +```bash +go fmt ./... && go vet ./... 
+git add internal/microsoft/graph_oauth.go internal/microsoft/graph_oauth_test.go +git commit -m "feat(teams): delegated Graph OAuth token manager" +``` + +--- + +## Task 3: Graph DTO types + +**Files:** +- Create: `internal/teams/types.go` + +- [ ] **Step 1: Define the Graph DTOs and option/summary structs** + +These mirror the Graph v1.0 JSON shapes confirmed in the load-bearing pass (identities carry id + displayName, no email; attachments use `contentType:"reference"`; hosted content via `$value`). + +```go +package teams + +import "time" + +// ---- Graph response envelopes ---- + +type listResponse[T any] struct { + Value []T `json:"value"` + NextLink string `json:"@odata.nextLink"` + DeltaLink string `json:"@odata.deltaLink"` +} + +// ---- Chats & channels ---- + +type Chat struct { + ID string `json:"id"` + ChatType string `json:"chatType"` // oneOnOne | group | meeting + Topic string `json:"topic"` + OnlineInfo *struct { + JoinWebURL string `json:"joinWebUrl"` + } `json:"onlineMeetingInfo"` +} + +type JoinedTeam struct { + ID string `json:"id"` + DisplayName string `json:"displayName"` +} + +type Channel struct { + ID string `json:"id"` + DisplayName string `json:"displayName"` + MembershipType string `json:"membershipType"` // standard | private | shared +} + +// ---- Messages ---- + +type ChatMessage struct { + ID string `json:"id"` + ReplyToID string `json:"replyToId"` + MessageType string `json:"messageType"` // message | systemEventMessage | ... 
+ CreatedDateTime time.Time `json:"createdDateTime"` + LastModifiedDateTime time.Time `json:"lastModifiedDateTime"` + DeletedDateTime *time.Time `json:"deletedDateTime"` + Subject string `json:"subject"` + Importance string `json:"importance"` + From *IdentitySet `json:"from"` + Body MessageBody `json:"body"` + Attachments []Attachment `json:"attachments"` + Mentions []Mention `json:"mentions"` + Reactions []Reaction `json:"reactions"` +} + +type MessageBody struct { + ContentType string `json:"contentType"` // html | text + Content string `json:"content"` +} + +type IdentitySet struct { + User *Identity `json:"user"` + Application *Identity `json:"application"` +} + +type Identity struct { + ID string `json:"id"` + DisplayName string `json:"displayName"` + UserIdentityType string `json:"userIdentityType"` // aadUser | emailUser | anonymousGuest | skypeUser | ... +} + +type Attachment struct { + ID string `json:"id"` + ContentType string `json:"contentType"` // "reference" => shared file link + ContentURL string `json:"contentUrl"` + Name string `json:"name"` +} + +type Mention struct { + ID int `json:"id"` + MentionText string `json:"mentionText"` + Mentioned *IdentitySet `json:"mentioned"` +} + +type Reaction struct { + ReactionType string `json:"reactionType"` // like | heart | laugh | ... + CreatedDateTime time.Time `json:"createdDateTime"` + User *IdentitySet `json:"user"` +} + +// GraphUser is the subset of /users/{id} we resolve for participant email. 
+type GraphUser struct { + ID string `json:"id"` + Mail string `json:"mail"` + UserPrincipalName string `json:"userPrincipalName"` + DisplayName string `json:"displayName"` +} + +// ---- Importer options/summary ---- + +type ImportOptions struct { + Email string + AttachmentsDir string + IncludeChannels bool // default true; allows chats-only runs + Limit int // 0 = no limit (per-conversation message cap, for scoped runs) + After time.Time // zero = no lower bound +} + +type ImportSummary struct { + Duration time.Duration + SourceID int64 + ChatsProcessed int64 + ChannelsProcessed int64 + MessagesProcessed int64 + MessagesAdded int64 + MessagesUpdated int64 + ReactionsAdded int64 + AttachmentsFound int64 + InlineImagesCopied int64 + Participants int64 + Errors int64 +} +``` + +- [ ] **Step 2: Build** + +Run: `go build ./internal/teams/` +Expected: compiles. + +- [ ] **Step 3: Commit** + +```bash +go fmt ./... && git add internal/teams/types.go +git commit -m "feat(teams): Graph DTO types and importer options" +``` + +--- + +## Task 4: Graph REST client with paging and Retry-After + +Gmail's client has no `Retry-After` parsing (`client.go:91-188` uses fixed throttles), so implement it here. The client takes an injectable base URL and token function so tests use `httptest`. 
+ +**Files:** +- Create: `internal/teams/client.go` +- Test: `internal/teams/client_test.go` + +- [ ] **Step 1: Write the failing test (paging + Retry-After)** + +```go +package teams + +import ( + "context" + "net/http" + "net/http/httptest" + "sync/atomic" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func tokenFn(string) (string, error) { return "test-token", nil } + +func TestClientGetJSONPaging(t *testing.T) { + var calls int32 + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + assert.Equal(t, "Bearer test-token", r.Header.Get("Authorization")) + n := atomic.AddInt32(&calls, 1) + w.Header().Set("Content-Type", "application/json") + if n == 1 { + w.Write([]byte(`{"value":[{"id":"a"}],"@odata.nextLink":"` + r.Host + `/page2"}`)) + return + } + w.Write([]byte(`{"value":[{"id":"b"}],"@odata.deltaLink":"DELTA"}`)) + })) + defer srv.Close() + + c := NewClient(srv.URL, func(context.Context) (string, error) { return "test-token", nil }, 5) + var got []Chat + delta, err := c.getAllPages(context.Background(), "/me/chats", func(page []Chat) { got = append(got, page...) 
}) + require.NoError(t, err) + assert.Equal(t, "DELTA", delta) + assert.Len(t, got, 2) +} + +func TestClientRetryAfter(t *testing.T) { + var calls int32 + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if atomic.AddInt32(&calls, 1) == 1 { + w.Header().Set("Retry-After", "0") + w.WriteHeader(http.StatusTooManyRequests) + return + } + w.Write([]byte(`{"value":[]}`)) + })) + defer srv.Close() + + c := NewClient(srv.URL, func(context.Context) (string, error) { return "t", nil }, 50) + _, err := c.getAllPages(context.Background(), "/x", func([]Chat) {}) + require.NoError(t, err) + assert.EqualValues(t, 2, atomic.LoadInt32(&calls)) +} +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `go test ./internal/teams/ -run 'TestClient' -v` +Expected: FAIL — `NewClient`/`getAllPages` undefined. + +- [ ] **Step 3: Implement the client** + +```go +package teams + +import ( + "context" + "encoding/json" + "fmt" + "io" + "net/http" + "strconv" + "strings" + "time" + + "golang.org/x/time/rate" +) + +const maxRetries = 8 + +type TokenFunc func(context.Context) (string, error) + +type Client struct { + baseURL string + token TokenFunc + http *http.Client + limiter *rate.Limiter +} + +func NewClient(baseURL string, token TokenFunc, qps float64) *Client { + if qps <= 0 { + qps = 5 + } + return &Client{ + baseURL: strings.TrimRight(baseURL, "/"), + token: token, + http: &http.Client{Timeout: 60 * time.Second}, + limiter: rate.NewLimiter(rate.Limit(qps), 1), + } +} + +// get performs a single authenticated GET with rate limiting + Retry-After. 
+func (c *Client) get(ctx context.Context, url string) ([]byte, error) { + if !strings.HasPrefix(url, "http") { + url = c.baseURL + url + } + for attempt := 0; attempt < maxRetries; attempt++ { + if err := c.limiter.Wait(ctx); err != nil { + return nil, err + } + tok, err := c.token(ctx) + if err != nil { + return nil, err + } + req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) + if err != nil { + return nil, err + } + req.Header.Set("Authorization", "Bearer "+tok) + req.Header.Set("Accept", "application/json") + resp, err := c.http.Do(req) + if err != nil { + return nil, err + } + body, _ := io.ReadAll(resp.Body) + resp.Body.Close() + + switch { + case resp.StatusCode == http.StatusOK: + return body, nil + case resp.StatusCode == http.StatusTooManyRequests || resp.StatusCode >= 500: + wait := retryAfter(resp.Header.Get("Retry-After"), attempt) + select { + case <-ctx.Done(): + return nil, ctx.Err() + case <-time.After(wait): + } + continue + default: + return nil, fmt.Errorf("graph GET %s: status %d: %s", url, resp.StatusCode, string(body)) + } + } + return nil, fmt.Errorf("graph GET %s: exhausted %d retries", url, maxRetries) +} + +func retryAfter(header string, attempt int) time.Duration { + if header != "" { + if secs, err := strconv.Atoi(strings.TrimSpace(header)); err == nil { + return time.Duration(secs) * time.Second + } + } + // exponential fallback, capped + d := time.Duration(1<<attempt) * time.Second + if d > 60*time.Second { + d = 60 * time.Second + } + return d +} + +func (c *Client) getJSON(ctx context.Context, url string, out any) error { + body, err := c.get(ctx, url) + if err != nil { + return err + } + return json.Unmarshal(body, out) +} + +// getAllPages follows @odata.nextLink, invoking fn per page, and returns the +// terminal @odata.deltaLink (empty for non-delta endpoints). 
+func (c *Client) getAllPages(ctx context.Context, startURL string, fn func([]Chat)) (string, error) { + url := startURL + for { + var page listResponse[Chat] + if err := c.getJSON(ctx, url, &page); err != nil { + return "", err + } + fn(page.Value) + if page.NextLink != "" { + url = page.NextLink + continue + } + return page.DeltaLink, nil + } +} +``` + +> The test JSON returns a `nextLink` of the form `/page2`; the client prefixes with `http` when absent. Real Graph returns absolute `https://graph.microsoft.com/...` links. In tests, the fake server's `nextLink` must be absolute — adjust the test handler to emit `"http://"+r.Host+"/page2"` if the assertion fails. (Keep the generic-over-`Chat` `getAllPages` for chats; add typed twins `getAllMessages`/`getAllChannels` in Task 6/7, or generalize with a small JSON-roundtrip helper — choose the minimal approach when implementing.) + +> Add `golang.org/x/time/rate` if not already in go.mod: `go get golang.org/x/time/rate` (it is a common transitive dep; verify with `go list -m golang.org/x/time` first). + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `go test ./internal/teams/ -run 'TestClient' -v` +Expected: PASS. + +- [ ] **Step 5: Format, vet, commit** + +```bash +go fmt ./... 
&& go vet ./internal/teams/ +git add internal/teams/client.go internal/teams/client_test.go go.mod go.sum +git commit -m "feat(teams): Graph REST client with paging and Retry-After" +``` + +--- + +## Task 5: Participant resolution (identity → store participant) + +**Files:** +- Create: `internal/teams/participants.go` +- Test: `internal/teams/participants_test.go` + +- [ ] **Step 1: Write the failing test** + +```go +package teams + +import ( + "context" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.kenn.io/msgvault/internal/testutil" +) + +func TestResolveParticipant_EmailUserUsesIDAsEmail(t *testing.T) { + st := testutil.NewTestStore(t) + r := newParticipantResolver(st, nil) // nil client => no /users lookup needed + id := &Identity{ID: "alice@outlook.com", DisplayName: "Alice", UserIdentityType: "emailUser"} + pid, err := r.resolve(context.Background(), id) + require.NoError(t, err) + assert.NotZero(t, pid) +} + +func TestResolveParticipant_AADUserResolvesMail(t *testing.T) { + st := testutil.NewTestStore(t) + fake := &fakeUserLookup{mail: map[string]string{"obj-1": "bob@example.com"}} + r := newParticipantResolver(st, fake) + id := &Identity{ID: "obj-1", DisplayName: "Bob", UserIdentityType: "aadUser"} + pid, err := r.resolve(context.Background(), id) + require.NoError(t, err) + assert.NotZero(t, pid) + + // second call hits the cache, not the lookup + _, err = r.resolve(context.Background(), id) + require.NoError(t, err) + assert.Equal(t, 1, fake.calls) +} + +type fakeUserLookup struct { + mail map[string]string + calls int +} + +func (f *fakeUserLookup) GetUser(_ context.Context, id string) (*GraphUser, error) { + f.calls++ + return &GraphUser{ID: id, Mail: f.mail[id]}, nil +} +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `go test ./internal/teams/ -run 'TestResolveParticipant' -v` +Expected: FAIL — undefined symbols. 
+ +- [ ] **Step 3: Implement the resolver** + +```go +package teams + +import ( + "context" + "strings" + + "go.kenn.io/msgvault/internal/store" +) + +type userLookup interface { + GetUser(ctx context.Context, id string) (*GraphUser, error) +} + +type participantResolver struct { + store *store.Store + lookup userLookup + cache map[string]int64 // identity id -> participant id +} + +func newParticipantResolver(s *store.Store, lookup userLookup) *participantResolver { + return &participantResolver{store: s, lookup: lookup, cache: map[string]int64{}} +} + +// resolve maps a Graph identity to a store participant id, unifying with +// email identities when possible. Returns 0 for nil/unresolvable identities. +func (r *participantResolver) resolve(ctx context.Context, id *Identity) (int64, error) { + if id == nil || id.ID == "" { + return 0, nil + } + if pid, ok := r.cache[id.ID]; ok { + return pid, nil + } + var pid int64 + var err error + switch id.UserIdentityType { + case "emailUser": + pid, err = r.byEmail(id.ID, id.DisplayName) + case "aadUser", "onPremiseAadUser": + email := r.lookupMail(ctx, id.ID) + if email != "" { + pid, err = r.byEmail(email, id.DisplayName) + } else { + pid, err = r.store.EnsureParticipantByIdentifier("teams", id.ID, id.DisplayName) + } + default: + // application/bot, anonymousGuest, skypeUser, ACS: no email + pid, err = r.store.EnsureParticipantByIdentifier("teams", id.ID, id.DisplayName) + } + if err != nil { + return 0, err + } + r.cache[id.ID] = pid + return pid, nil +} + +func (r *participantResolver) byEmail(email, displayName string) (int64, error) { + domain := "" + if at := strings.LastIndex(email, "@"); at >= 0 { + domain = strings.ToLower(email[at+1:]) + } + return r.store.EnsureParticipant(strings.ToLower(email), displayName, domain) +} + +func (r *participantResolver) lookupMail(ctx context.Context, objectID string) string { + if r.lookup == nil { + return "" + } + u, err := r.lookup.GetUser(ctx, objectID) + if err != nil || u 
== nil { + return "" + } + if u.Mail != "" { + return u.Mail + } + // UPN is often (not always) an SMTP address; accept it as best-effort. + if strings.Contains(u.UserPrincipalName, "@") && !strings.Contains(u.UserPrincipalName, "#EXT#") { + return u.UserPrincipalName + } + return "" +} +``` + +- [ ] **Step 4: Add the real `GetUser` to the client** + +In `internal/teams/client.go` add: + +```go +func (c *Client) GetUser(ctx context.Context, id string) (*GraphUser, error) { + var u GraphUser + if err := c.getJSON(ctx, "/users/"+id+"?$select=id,mail,userPrincipalName,displayName", &u); err != nil { + return nil, err + } + return &u, nil +} +``` + +- [ ] **Step 5: Run tests to verify they pass** + +Run: `go test ./internal/teams/ -run 'TestResolveParticipant' -v` +Expected: PASS. + +- [ ] **Step 6: Format, vet, commit** + +```bash +go fmt ./... && go vet ./internal/teams/ +git add internal/teams/participants.go internal/teams/participants_test.go internal/teams/client.go +git commit -m "feat(teams): identity-to-participant resolution with /users cache" +``` + +--- + +## Task 6: Message mapping (chatMessage → store.Message + body text) + +**Files:** +- Create: `internal/teams/mapping.go` +- Test: `internal/teams/mapping_test.go` + +- [ ] **Step 1: Write the failing test** + +```go +package teams + +import ( + "testing" + "time" + + "github.com/stretchr/testify/assert" +) + +func TestHTMLToText(t *testing.T) { + got := htmlToText(`<p>Hello <b>Bob</b> see <a href="x">link</a></p>`) + assert.Contains(t, got, "Hello") + assert.Contains(t, got, "Bob") + assert.NotContains(t, got, "<p>") +} + +func TestMapMessageBasics(t *testing.T) { + gm := &ChatMessage{ + ID: "m1", + CreatedDateTime: time.Date(2025, 1, 2, 3, 4, 5, 0, time.UTC), + LastModifiedDateTime: time.Date(2025, 1, 2, 3, 4, 5, 0, time.UTC), + Body: MessageBody{ContentType: "html", Content: "<p>hi there</p>"}, + Attachments: []Attachment{{ContentType: "reference", ContentURL: "http://sp/f", Name: "f.docx"}}, + } + msg, text := mapMessage(gm, 10, 20) + assert.Equal(t, "teams", msg.MessageType) + assert.Equal(t, "m1", msg.SourceMessageID) + assert.True(t, msg.SentAt.Valid) + assert.True(t, msg.HasAttachments) + assert.Equal(t, 1, msg.AttachmentCount) + assert.Equal(t, "hi there", text) + assert.Contains(t, msg.Snippet.String, "hi there") +} +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `go test ./internal/teams/ -run 'TestHTMLToText|TestMapMessage' -v` +Expected: FAIL — undefined. + +- [ ] **Step 3: Implement mapping** + +For HTML→text, check whether `internal/mime` or `internal/textutil` already exposes an HTML-to-text function (the research noted email HTML→text exists for MIME). Prefer reusing it; only add a local fallback if none is exported. + +```go +package teams + +import ( + "database/sql" + "regexp" + "strings" + + "go.kenn.io/msgvault/internal/store" +) + +var tagRe = regexp.MustCompile(`<[^>]+>`) + +// htmlToText is a minimal fallback. If internal/textutil exposes an +// HTML-to-text helper, call that instead and delete this. +func htmlToText(html string) string { + s := tagRe.ReplaceAllString(html, " ") + s = strings.ReplaceAll(s, "&nbsp;", " ") + s = strings.ReplaceAll(s, "&amp;", "&") + s = strings.ReplaceAll(s, "&lt;", "<") + s = strings.ReplaceAll(s, "&gt;", ">") + return strings.TrimSpace(regexp.MustCompile(`\s+`).ReplaceAllString(s, " ")) +} + +func snippet(text string) string { + r := []rune(text) + if len(r) > 100 { + return string(r[:100]) + } + return text +} + +// mapMessage builds the store.Message and returns the derived body text. 
+func mapMessage(gm *ChatMessage, conversationID, sourceID int64) (store.Message, string) { + text := gm.Body.Content + if strings.EqualFold(gm.Body.ContentType, "html") { + text = htmlToText(gm.Body.Content) + } + msg := store.Message{ + ConversationID: conversationID, + SourceID: sourceID, + SourceMessageID: gm.ID, + MessageType: "teams", + SentAt: sql.NullTime{Time: gm.CreatedDateTime, Valid: !gm.CreatedDateTime.IsZero()}, + ReceivedAt: sql.NullTime{Time: gm.CreatedDateTime, Valid: !gm.CreatedDateTime.IsZero()}, + Snippet: sql.NullString{String: snippet(text), Valid: text != ""}, + HasAttachments: len(gm.Attachments) > 0, + AttachmentCount: len(gm.Attachments), + } + if gm.Subject != "" { + msg.Subject = sql.NullString{String: gm.Subject, Valid: true} + } + return msg, text +} + +func conversationType(chatType string) string { + if chatType == "oneOnOne" { + return "direct_chat" + } + return "group_chat" // group, meeting +} +``` + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `go test ./internal/teams/ -run 'TestHTMLToText|TestMapMessage' -v` +Expected: PASS. + +- [ ] **Step 5: Format, vet, commit** + +```bash +go fmt ./... 
&& go vet ./internal/teams/ +git add internal/teams/mapping.go internal/teams/mapping_test.go +git commit -m "feat(teams): chatMessage to store.Message mapping" +``` + +--- + +## Task 7: SyncState (per-conversation cursors) + +**Files:** +- Create: `internal/teams/syncstate.go` +- Test: `internal/teams/syncstate_test.go` + +- [ ] **Step 1: Write the failing test** + +```go +package teams + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestSyncStateRoundTrip(t *testing.T) { + s := NewSyncState() + s.SetChatCursor("19:abc@thread.v2", "2026-01-01T00:00:00Z") + s.SetChannelDelta("team1/chanA", "https://graph/delta?token=xyz") + + blob, err := s.Marshal() + require.NoError(t, err) + + got, err := LoadSyncState(blob) + require.NoError(t, err) + assert.Equal(t, "2026-01-01T00:00:00Z", got.ChatCursor("19:abc@thread.v2")) + assert.Equal(t, "https://graph/delta?token=xyz", got.ChannelDelta("team1/chanA")) + assert.Equal(t, "", got.ChatCursor("unknown")) +} + +func TestLoadSyncStateEmpty(t *testing.T) { + got, err := LoadSyncState("") + require.NoError(t, err) + assert.Equal(t, "", got.ChatCursor("anything")) +} +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `go test ./internal/teams/ -run 'TestSyncState|TestLoadSyncState' -v` +Expected: FAIL — undefined. 
+ +- [ ] **Step 3: Implement** + +```go +package teams + +import "encoding/json" + +type SyncState struct { + Chats map[string]string `json:"chats"` // chatID -> max lastModifiedDateTime (RFC3339) + Channels map[string]string `json:"channels"` // "teamID/channelID" -> deltaLink +} + +func NewSyncState() *SyncState { + return &SyncState{Chats: map[string]string{}, Channels: map[string]string{}} +} + +func LoadSyncState(blob string) (*SyncState, error) { + s := NewSyncState() + if blob == "" { + return s, nil + } + if err := json.Unmarshal([]byte(blob), s); err != nil { + return nil, err + } + if s.Chats == nil { + s.Chats = map[string]string{} + } + if s.Channels == nil { + s.Channels = map[string]string{} + } + return s, nil +} + +func (s *SyncState) Marshal() (string, error) { + b, err := json.Marshal(s) + return string(b), err +} + +func (s *SyncState) ChatCursor(chatID string) string { return s.Chats[chatID] } +func (s *SyncState) SetChatCursor(chatID, cursor string) { s.Chats[chatID] = cursor } +func (s *SyncState) ChannelDelta(key string) string { return s.Channels[key] } +func (s *SyncState) SetChannelDelta(key, link string) { s.Channels[key] = link } +``` + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `go test ./internal/teams/ -run 'TestSyncState|TestLoadSyncState' -v` +Expected: PASS. + +- [ ] **Step 5: Commit** + +```bash +go fmt ./... && git add internal/teams/syncstate.go internal/teams/syncstate_test.go +git commit -m "feat(teams): per-conversation sync-state cursors" +``` + +--- + +## Task 8: Client enumeration + message-fetch methods + +Add the concrete Graph calls the importer needs. (These wrap `getJSON`/paging; keep them thin so the importer is testable against the fake server.) 
+ +**Files:** +- Modify: `internal/teams/client.go` +- Test: `internal/teams/client_test.go` + +- [ ] **Step 1: Write the failing test (chats + channel messages)** + +```go +func TestListChatsAndMessages(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + switch { + case strings.HasPrefix(r.URL.Path, "/me/chats/") && strings.Contains(r.URL.Path, "/messages"): + w.Write([]byte(`{"value":[{"id":"m1","createdDateTime":"2025-01-01T00:00:00Z","body":{"contentType":"text","content":"hi"}}]}`)) + case r.URL.Path == "/me/chats": + w.Write([]byte(`{"value":[{"id":"19:x@thread.v2","chatType":"oneOnOne"}]}`)) + default: + http.Error(w, "no", 404) + } + })) + defer srv.Close() + + c := NewClient(srv.URL, tokenFnCtx, 50) + chats, err := c.ListChats(context.Background()) + require.NoError(t, err) + require.Len(t, chats, 1) + + msgs, _, err := c.ListChatMessages(context.Background(), chats[0].ID, "", "") + require.NoError(t, err) + require.Len(t, msgs, 1) + assert.Equal(t, "m1", msgs[0].ID) +} + +func tokenFnCtx(context.Context) (string, error) { return "t", nil } +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `go test ./internal/teams/ -run 'TestListChatsAndMessages' -v` +Expected: FAIL — undefined methods. + +- [ ] **Step 3: Implement enumeration + message methods** + +Generalize paging to any element type with a small helper to avoid duplicating `getAllPages` per type: + +```go +// pageThrough follows nextLink, decoding each page into []T via json, calling fn. +// Returns the terminal deltaLink. 
+func pageThrough[T any](ctx context.Context, c *Client, startURL string, fn func([]T)) (string, error) { + url := startURL + for { + var page listResponse[T] + if err := c.getJSON(ctx, url, &page); err != nil { + return "", err + } + fn(page.Value) + if page.NextLink != "" { + url = page.NextLink + continue + } + return page.DeltaLink, nil + } +} + +func (c *Client) ListChats(ctx context.Context) ([]Chat, error) { + var out []Chat + _, err := pageThrough[Chat](ctx, c, "/me/chats?$top=50", func(p []Chat) { out = append(out, p...) }) + return out, err +} + +func (c *Client) ListJoinedTeams(ctx context.Context) ([]JoinedTeam, error) { + var out []JoinedTeam + _, err := pageThrough[JoinedTeam](ctx, c, "/me/joinedTeams", func(p []JoinedTeam) { out = append(out, p...) }) + return out, err +} + +func (c *Client) ListChannels(ctx context.Context, teamID string) ([]Channel, error) { + var out []Channel + _, err := pageThrough[Channel](ctx, c, "/teams/"+teamID+"/channels", func(p []Channel) { out = append(out, p...) }) + return out, err +} + +// ListChatMessages: pass sinceISO for incremental (empty = full backfill). +// Returns messages and the nextURL is handled internally (full drain). +func (c *Client) ListChatMessages(ctx context.Context, chatID, sinceISO, _ string) ([]ChatMessage, string, error) { + url := "/me/chats/" + chatID + "/messages?$top=50" + if sinceISO != "" { + url += "&$filter=lastModifiedDateTime%20gt%20" + sinceISO + "&$orderby=lastModifiedDateTime%20desc" + } + var out []ChatMessage + _, err := pageThrough[ChatMessage](ctx, c, url, func(p []ChatMessage) { out = append(out, p...) }) + return out, "", err +} + +// ChannelMessagesDelta drives the delta endpoint (or a stored deltaLink) to +// completion, returning all messages and the new deltaLink. 
+func (c *Client) ChannelMessagesDelta(ctx context.Context, teamID, channelID, deltaLink string) ([]ChatMessage, string, error) { + start := deltaLink + if start == "" { + start = "/teams/" + teamID + "/channels/" + channelID + "/messages/delta" + } + var out []ChatMessage + newDelta, err := pageThrough[ChatMessage](ctx, c, start, func(p []ChatMessage) { out = append(out, p...) }) + return out, newDelta, err +} + +func (c *Client) ListChannelMessages(ctx context.Context, teamID, channelID string) ([]ChatMessage, error) { + var out []ChatMessage + _, err := pageThrough[ChatMessage](ctx, c, "/teams/"+teamID+"/channels/"+channelID+"/messages?$top=50", func(p []ChatMessage) { out = append(out, p...) }) + return out, err +} + +func (c *Client) ListReplies(ctx context.Context, teamID, channelID, messageID string) ([]ChatMessage, error) { + var out []ChatMessage + _, err := pageThrough[ChatMessage](ctx, c, "/teams/"+teamID+"/channels/"+channelID+"/messages/"+messageID+"/replies", func(p []ChatMessage) { out = append(out, p...) }) + return out, err +} +``` + +> Now the original `getAllPages(...func([]Chat))` from Task 4 is redundant — replace it (and update `TestClientGetJSONPaging`/`TestClientRetryAfter`) to call `pageThrough[Chat](ctx, c, url, fn)`. Keep one paging implementation. + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `go test ./internal/teams/ -run 'TestClient|TestListChats' -v` +Expected: PASS. + +- [ ] **Step 5: Format, vet, commit** + +```bash +go fmt ./... && go vet ./internal/teams/ +git add internal/teams/client.go internal/teams/client_test.go +git commit -m "feat(teams): Graph enumeration and message-fetch methods" +``` + +--- + +## Task 9: Importer — chats end-to-end + +This is the orchestration core. Test it against a fake Graph server through the real store. 
+ +**Files:** +- Create: `internal/teams/importer.go` +- Test: `internal/teams/importer_test.go` + +- [ ] **Step 1: Write the failing end-to-end test (chats)** + +```go +package teams + +import ( + "context" + "net/http" + "net/http/httptest" + "strings" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.kenn.io/msgvault/internal/testutil" +) + +func fakeChatGraph(t *testing.T) *httptest.Server { + return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + switch { + case r.URL.Path == "/me/chats": + w.Write([]byte(`{"value":[{"id":"19:x@thread.v2","chatType":"oneOnOne","topic":"DM"}]}`)) + case strings.Contains(r.URL.Path, "/messages"): + w.Write([]byte(`{"value":[ + {"id":"m1","createdDateTime":"2025-01-01T00:00:00Z","lastModifiedDateTime":"2025-01-01T00:00:00Z", + "from":{"user":{"id":"alice@outlook.com","displayName":"Alice","userIdentityType":"emailUser"}}, + "body":{"contentType":"text","content":"hello world"}} + ]}`)) + default: + http.Error(w, "404", 404) + } + })) +} + +func TestImportChatsEndToEnd(t *testing.T) { + srv := fakeChatGraph(t) + defer srv.Close() + st := testutil.NewTestStore(t) + + c := NewClient(srv.URL, tokenFnCtx, 50) + imp := NewImporter(st, c) + sum, err := imp.Import(context.Background(), ImportOptions{Email: "me@example.com", IncludeChannels: false}) + require.NoError(t, err) + assert.EqualValues(t, 1, sum.ChatsProcessed) + assert.EqualValues(t, 1, sum.MessagesAdded) + + // message persisted and FTS-searchable + var cnt int + require.NoError(t, st.DB().QueryRow(`SELECT COUNT(*) FROM messages WHERE message_type='teams'`).Scan(&cnt)) + assert.Equal(t, 1, cnt) +} +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `go test ./internal/teams/ -run 'TestImportChatsEndToEnd' -v` +Expected: FAIL — `NewImporter`/`Import` undefined. 
+ +- [ ] **Step 3: Implement the importer (chats path)** + +```go +package teams + +import ( + "context" + "database/sql" + "encoding/json" + "time" + + "go.kenn.io/msgvault/internal/store" +) + +type Importer struct { + store *store.Store + client *Client + res *participantResolver +} + +func NewImporter(s *store.Store, c *Client) *Importer { + return &Importer{store: s, client: c, res: newParticipantResolver(s, c)} +} + +func (imp *Importer) Import(ctx context.Context, opts ImportOptions) (*ImportSummary, error) { + start := time.Now() + src, err := imp.store.GetOrCreateSource(sourceTypeTeams, opts.Email) + if err != nil { + return nil, err + } + sum := &ImportSummary{SourceID: src.ID} + + // load prior cursors + state := NewSyncState() + if prev, err := imp.store.GetLastSuccessfulSync(src.ID); err == nil && prev != nil && prev.CursorAfter.Valid { + if s, err := LoadSyncState(prev.CursorAfter.String); err == nil { + state = s + } + } + + syncID, err := imp.store.StartSync(src.ID, "teams") + if err != nil { + return nil, err + } + defer func() { + if err != nil { + _ = imp.store.FailSync(syncID, err.Error()) + } + }() + + if err = imp.syncChats(ctx, src.ID, opts, state, sum); err != nil { + return sum, err + } + if opts.IncludeChannels { + if err = imp.syncChannels(ctx, src.ID, opts, state, sum); err != nil { + return sum, err + } + } + + blob, _ := state.Marshal() + if err = imp.store.CompleteSync(syncID, blob); err != nil { + return sum, err + } + sum.Duration = time.Since(start) + return sum, nil +} + +const sourceTypeTeams = "teams" + +func (imp *Importer) syncChats(ctx context.Context, sourceID int64, opts ImportOptions, state *SyncState, sum *ImportSummary) error { + chats, err := imp.client.ListChats(ctx) + if err != nil { + return err + } + for _, ch := range chats { + if ctx.Err() != nil { + return ctx.Err() + } + convID, err := imp.store.EnsureConversationWithType(sourceID, ch.ID, conversationType(ch.ChatType), ch.Topic) + if err != nil { + return err + 
} + since := state.ChatCursor(ch.ID) + msgs, _, err := imp.client.ListChatMessages(ctx, ch.ID, since, "") + if err != nil { + sum.Errors++ + continue + } + maxCursor := since + for i := range msgs { + gm := &msgs[i] + added, err := imp.persistMessage(ctx, convID, sourceID, gm, sum) + if err != nil { + return err + } + if added { + sum.MessagesAdded++ + } else { + sum.MessagesUpdated++ + } + sum.MessagesProcessed++ + if iso := gm.LastModifiedDateTime.UTC().Format(time.RFC3339); iso > maxCursor { + maxCursor = iso + } + } + if maxCursor != "" { + state.SetChatCursor(ch.ID, maxCursor) + } + sum.ChatsProcessed++ + } + return nil +} + +// persistMessage writes a single message via the granular store path. +// Returns true if newly inserted. +func (imp *Importer) persistMessage(ctx context.Context, convID, sourceID int64, gm *ChatMessage, sum *ImportSummary) (bool, error) { + // deleted tombstone + if gm.DeletedDateTime != nil { + _ = imp.store.MarkMessageDeleted(sourceID, gm.ID) + return false, nil + } + msg, text := mapMessage(gm, convID, sourceID) + if gm.From != nil { + pid, err := imp.res.resolve(ctx, identityOf(gm.From)) + if err != nil { + return false, err + } + if pid != 0 { + msg.SenderID = sql.NullInt64{Int64: pid, Valid: true} + } + } + messageID, err := imp.store.UpsertMessage(&msg) + if err != nil { + return false, err + } + bodyHTML := sql.NullString{} + if gm.Body.ContentType == "html" { + bodyHTML = sql.NullString{String: gm.Body.Content, Valid: true} + } + if err := imp.store.UpsertMessageBody(messageID, sql.NullString{String: text, Valid: text != ""}, bodyHTML); err != nil { + return false, err + } + if raw, err := json.Marshal(gm); err == nil { + _ = imp.store.UpsertMessageRawWithFormat(messageID, raw, "teams_json") + } + senderAddr := "" + if gm.From != nil && identityOf(gm.From) != nil { + senderAddr = identityOf(gm.From).DisplayName + } + _ = imp.store.UpsertFTS(messageID, msg.Subject.String, text, senderAddr, "", "") + + // reactions + for _, rc 
:= range gm.Reactions { + pid, _ := imp.res.resolve(ctx, identityOf(rc.User)) + if pid != 0 { + if err := imp.store.UpsertReaction(messageID, pid, rc.ReactionType, rc.ReactionType, rc.CreatedDateTime); err == nil { + sum.ReactionsAdded++ + } + } + } + // shared-file attachment links (contentType "reference"); inline images in Task 11 + for _, att := range gm.Attachments { + if att.ContentType == "reference" { + _ = imp.store.UpsertAttachment(messageID, att.Name, "", att.ContentURL, "", 0) + sum.AttachmentsFound++ + } + } + return true, nil +} + +func identityOf(set *IdentitySet) *Identity { + if set == nil { + return nil + } + if set.User != nil { + return set.User + } + return set.Application +} +``` + +> **Open detail to confirm while implementing:** `UpsertMessage` returns only an id, not an inserted/updated flag. If distinguishing added-vs-updated matters for the summary, check whether the store exposes `RowsAffected` or query existence first; otherwise treat all as "added" and drop `MessagesUpdated`. Don't block the task on this — pick the simplest correct behavior. + +> `MarkMessageDeleted(sourceID, sourceMessageID)` exists (`messages.go:766`). Confirm the exact signature; adjust if it differs. + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `go test ./internal/teams/ -run 'TestImportChatsEndToEnd' -v` +Expected: PASS. + +- [ ] **Step 5: Format, vet, full package test, commit** + +```bash +go fmt ./... 
&& go vet ./internal/teams/ +go test ./internal/teams/ -v +git add internal/teams/importer.go internal/teams/importer_test.go +git commit -m "feat(teams): importer chats path end-to-end" +``` + +--- + +## Task 10: Importer — channels (delta + backfill + replies) + +**Files:** +- Modify: `internal/teams/importer.go` +- Test: `internal/teams/importer_test.go` + +- [ ] **Step 1: Write the failing test (channels via delta)** + +```go +func fakeChannelGraph(t *testing.T) *httptest.Server { + return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + switch { + case r.URL.Path == "/me/joinedTeams": + w.Write([]byte(`{"value":[{"id":"team1","displayName":"Acme"}]}`)) + case strings.HasSuffix(r.URL.Path, "/channels"): + w.Write([]byte(`{"value":[{"id":"chanA","displayName":"General","membershipType":"standard"}]}`)) + case strings.HasSuffix(r.URL.Path, "/messages/delta"): + w.Write([]byte(`{"value":[ + {"id":"c1","createdDateTime":"2025-02-01T00:00:00Z","lastModifiedDateTime":"2025-02-01T00:00:00Z", + "body":{"contentType":"text","content":"channel post"}} + ],"@odata.deltaLink":"` + "http://" + r.Host + `/delta?token=next"}`)) + default: + http.Error(w, "404", 404) + } + })) +} + +func TestImportChannelsEndToEnd(t *testing.T) { + srv := fakeChannelGraph(t) + defer srv.Close() + st := testutil.NewTestStore(t) + + imp := NewImporter(st, NewClient(srv.URL, tokenFnCtx, 50)) + sum, err := imp.Import(context.Background(), ImportOptions{Email: "me@example.com", IncludeChannels: true}) + require.NoError(t, err) + assert.EqualValues(t, 1, sum.ChannelsProcessed) + assert.EqualValues(t, 1, sum.MessagesAdded) + + // delta link persisted for next run + src, _ := st.GetOrCreateSource("teams", "me@example.com") + prev, _ := st.GetLastSuccessfulSync(src.ID) + state, _ := LoadSyncState(prev.CursorAfter.String) + assert.Contains(t, state.ChannelDelta("team1/chanA"), "token=next") +} +``` + +- [ ] **Step 
2: Run test to verify it fails** + +Run: `go test ./internal/teams/ -run 'TestImportChannelsEndToEnd' -v` +Expected: FAIL — `syncChannels` undefined / no channels processed. + +- [ ] **Step 3: Implement `syncChannels`** + +```go +func (imp *Importer) syncChannels(ctx context.Context, sourceID int64, opts ImportOptions, state *SyncState, sum *ImportSummary) error { + teams, err := imp.client.ListJoinedTeams(ctx) + if err != nil { + return err + } + for _, tm := range teams { + channels, err := imp.client.ListChannels(ctx, tm.ID) + if err != nil { + sum.Errors++ + continue + } + for _, chn := range channels { + if ctx.Err() != nil { + return ctx.Err() + } + title := tm.DisplayName + " / " + chn.DisplayName + convID, err := imp.store.EnsureConversationWithType(sourceID, tm.ID+"/"+chn.ID, "channel", title) + if err != nil { + return err + } + key := tm.ID + "/" + chn.ID + prevDelta := state.ChannelDelta(key) + + var msgs []ChatMessage + var newDelta string + if prevDelta == "" { + // first run: backfill roots + replies, then prime delta + roots, err := imp.client.ListChannelMessages(ctx, tm.ID, chn.ID) + if err != nil { + sum.Errors++ + continue + } + msgs = append(msgs, roots...) + for i := range roots { + replies, err := imp.client.ListReplies(ctx, tm.ID, chn.ID, roots[i].ID) + if err == nil { + msgs = append(msgs, replies...) 
+ } + } + // prime the delta cursor for subsequent runs + _, newDelta, _ = imp.client.ChannelMessagesDelta(ctx, tm.ID, chn.ID, "") + } else { + // incremental: drive delta from stored link, with fallback + msgs, newDelta, err = imp.client.ChannelMessagesDelta(ctx, tm.ID, chn.ID, prevDelta) + if err != nil { + // delta-token rot (400/410): full re-page + dedupe via upsert + roots, rerr := imp.client.ListChannelMessages(ctx, tm.ID, chn.ID) + if rerr != nil { + sum.Errors++ + continue + } + msgs = roots + for i := range roots { + if replies, rerr := imp.client.ListReplies(ctx, tm.ID, chn.ID, roots[i].ID); rerr == nil { + msgs = append(msgs, replies...) + } + } + _, newDelta, _ = imp.client.ChannelMessagesDelta(ctx, tm.ID, chn.ID, "") + } + } + + if err := imp.persistChannelMessages(ctx, convID, sourceID, msgs, sum); err != nil { + return err + } + if newDelta != "" { + state.SetChannelDelta(key, newDelta) + } + sum.ChannelsProcessed++ + } + } + return nil +} + +func (imp *Importer) persistChannelMessages(ctx context.Context, convID, sourceID int64, msgs []ChatMessage, sum *ImportSummary) error { + // pass 1: roots and replies whose parent already exists + for i := range msgs { + gm := &msgs[i] + added, err := imp.persistMessage(ctx, convID, sourceID, gm, sum) + if err != nil { + return err + } + if added { + sum.MessagesAdded++ + } else { + sum.MessagesUpdated++ + } + sum.MessagesProcessed++ + // set reply linkage if parent is known + if gm.ReplyToID != "" { + _ = imp.store.SetReplyTo(sourceID, gm.ID, gm.ReplyToID) + } + } + return nil +} +``` + +> **Reply linkage:** the plan calls `store.SetReplyTo(sourceID, childSourceMsgID, parentSourceMsgID)` to populate `messages.reply_to_message_id` by resolving the parent's `source_message_id` to its internal id. 
The research did NOT find such a method — **verify** whether one exists; if not, add a small store method in this task: +> +> ```go +> // internal/store/messages.go +> func (s *Store) SetReplyTo(sourceID int64, childSourceMessageID, parentSourceMessageID string) error { +> _, err := s.db.Exec(s.dialect.Rebind(` +> UPDATE messages SET reply_to_message_id = +> (SELECT id FROM messages WHERE source_id = ? AND source_message_id = ?) +> WHERE source_id = ? AND source_message_id = ?`), +> sourceID, parentSourceMessageID, sourceID, childSourceMessageID) +> return err +> } +> ``` +> Confirm the dialect/Rebind helper name against existing store methods before writing. Because backfill inserts roots before replies (roots list first, replies appended), the parent is present when the reply is processed. For delta runs where a reply may arrive before its root, the UPDATE simply sets NULL and a later run reconciles — acceptable. + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `go test ./internal/teams/ -run 'TestImportChannelsEndToEnd|TestImportChatsEndToEnd' -v` +Expected: PASS. + +- [ ] **Step 5: Format, vet, full package test, commit** + +```bash +go fmt ./... && go vet ./... +go test ./internal/teams/ ./internal/store/ -v +git add internal/teams/importer.go internal/teams/importer_test.go internal/store/messages.go +git commit -m "feat(teams): importer channels path (delta + backfill + replies)" +``` + +--- + +## Task 11: Inline images (hostedContents) download + +Inline images need a separate `GET .../hostedContents/{id}/$value` (bytes are null on the message read). Detect them by the `hostedContents/{id}/$value` URL pattern in the body HTML and store into content-addressed storage. 
+ +**Files:** +- Modify: `internal/teams/client.go` (raw `$value` GET), `internal/teams/importer.go` +- Test: `internal/teams/importer_test.go` + +- [ ] **Step 1: Write the failing test** + +```go +func TestInlineImageDownloaded(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + switch { + case strings.Contains(r.URL.Path, "/hostedContents/") && strings.HasSuffix(r.URL.Path, "/$value"): + w.Header().Set("Content-Type", "image/png") + w.Write([]byte("PNGDATA")) + case r.URL.Path == "/me/chats": + w.Header().Set("Content-Type", "application/json") + w.Write([]byte(`{"value":[{"id":"19:x@thread.v2","chatType":"oneOnOne"}]}`)) + case strings.Contains(r.URL.Path, "/messages"): + w.Header().Set("Content-Type", "application/json") + body := `
` + w.Write([]byte(`{"value":[{"id":"m1","createdDateTime":"2025-01-01T00:00:00Z","body":{"contentType":"html","content":` + jsonString(body) + `}}]}`)) + default: + http.Error(w, "404", 404) + } + })) + defer srv.Close() + st := testutil.NewTestStore(t) + dir := t.TempDir() + + imp := NewImporter(st, NewClient(srv.URL, tokenFnCtx, 50)) + sum, err := imp.Import(context.Background(), ImportOptions{Email: "me@example.com", AttachmentsDir: dir}) + require.NoError(t, err) + assert.EqualValues(t, 1, sum.InlineImagesCopied) +} + +func jsonString(s string) string { b, _ := json.Marshal(s); return string(b) } +``` + +(Add `import "encoding/json"` to the test file if not present.) + +- [ ] **Step 2: Run test to verify it fails** + +Run: `go test ./internal/teams/ -run 'TestInlineImageDownloaded' -v` +Expected: FAIL. + +- [ ] **Step 3: Implement raw fetch + extraction + storage** + +Add to `client.go`: + +```go +func (c *Client) GetRaw(ctx context.Context, url string) ([]byte, error) { + return c.get(ctx, url) +} +``` + +Reuse the existing content-addressed attachment storage helper. The research found `export.StoreAttachmentFile` and `store.UpsertAttachment(messageID, filename, mimeType, storagePath, contentHash, size)`. In `importer.go` add inline-image handling inside `persistMessage` (after body persistence), guarded by `opts.AttachmentsDir != ""`. Thread `opts` into `persistMessage` (change its signature to accept `opts ImportOptions` or store `imp.attachmentsDir`). 
+ +```go +var hostedRe = regexp.MustCompile(`https://[^"'\s)]+/hostedContents/[^"'\s)]+/\$value`) + +func (imp *Importer) downloadInlineImages(ctx context.Context, messageID int64, bodyHTML, attachmentsDir string, sum *ImportSummary) { + if attachmentsDir == "" { + return + } + for _, url := range hostedRe.FindAllString(bodyHTML, -1) { + data, err := imp.client.GetRaw(ctx, url) + if err != nil || len(data) == 0 { + sum.Errors++ + continue + } + hash := sha256Hex(data) + storagePath, err := storeContentAddressed(attachmentsDir, hash, data) // mirror export.StoreAttachmentFile + if err != nil { + sum.Errors++ + continue + } + if err := imp.store.UpsertAttachment(messageID, hash, "", storagePath, hash, len(data)); err == nil { + sum.InlineImagesCopied++ + } + } +} +``` + +> Implement `sha256Hex` and `storeContentAddressed` by reusing existing helpers: the research cited `internal/export/store_attachment.go` (`export.StoreAttachmentFile`) and content-addressing at `ab/abcd...`. Prefer calling those exported helpers over re-implementing; only add thin local wrappers if the exported signature doesn't fit. Confirm the helper name/signature while implementing. + +Call `imp.downloadInlineImages(ctx, messageID, gm.Body.Content, attachmentsDir, sum)` in `persistMessage` when `gm.Body.ContentType == "html"`. + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `go test ./internal/teams/ -run 'TestInlineImageDownloaded' -v` +Expected: PASS. + +- [ ] **Step 5: Format, vet, full package, commit** + +```bash +go fmt ./... && go vet ./... +go test ./internal/teams/ -v +git add internal/teams/client.go internal/teams/importer.go internal/teams/importer_test.go +git commit -m "feat(teams): download inline hosted-content images" +``` + +--- + +## Task 12: `add-teams` CLI command + +Template: `cmd/msgvault/cmd/addo365.go` (gates on `cfg.Microsoft.ClientID`, builds the manager, calls `Authorize`, creates the source). 
Extend `add-account` with a `--teams` flag OR add a dedicated command — this plan uses a dedicated `add_teams.go` to avoid entangling the Gmail/IMAP add-account flow, then notes the `--teams` alias as optional. + +**Files:** +- Create: `cmd/msgvault/cmd/add_teams.go` + +- [ ] **Step 1: Implement the command** + +```go +package cmd + +import ( + "fmt" + + "github.com/spf13/cobra" + "go.kenn.io/msgvault/internal/microsoft" + "go.kenn.io/msgvault/internal/store" +) + +var addTeamsCmd = &cobra.Command{ + Use: "add-teams <email>", + Short: "Authorize Microsoft Teams (delegated Graph) for an account", + Args: cobra.ExactArgs(1), + RunE: func(cmd *cobra.Command, args []string) error { + email := args[0] + if cfg.Microsoft.ClientID == "" { + return fmt.Errorf("Microsoft client_id not configured — set [microsoft].client_id in config.toml") + } + s, err := store.Open(cfg.DatabaseDSN()) + if err != nil { + return err + } + defer s.Close() + if err := s.InitSchema(); err != nil { + return err + } + if err := runStartupMigrationsForIngest(s); err != nil { + return err + } + + mgr := microsoft.NewGraphManager(cfg.Microsoft.ClientID, cfg.Microsoft.EffectiveTenantID(), cfg.TokensDir(), logger()) + if err := mgr.Authorize(cmd.Context(), email); err != nil { + return fmt.Errorf("authorize Teams: %w", err) + } + + src, err := s.GetOrCreateSource(sourceTypeTeams, email) + if err != nil { + return err + } + _ = s.UpdateSourceDisplayName(src.ID, email) + if err := runPostSourceCreateMigrations(s); err != nil { + return err + } + fmt.Printf("Teams authorized for %s. Run: msgvault sync-teams %s\n", email, email) + return nil + }, +} + +func init() { + rootCmd.AddCommand(addTeamsCmd) +} +``` + +> Confirm helper names against the codebase while implementing: `logger()` (or how `addo365.go` obtains its `*slog.Logger`), `cfg.Microsoft.EffectiveTenantID()` (research cited it), `cfg.TokensDir()`, `UpdateSourceDisplayName`, `runStartupMigrationsForIngest`, `runPostSourceCreateMigrations`. 
Copy exact usage from `addo365.go`. + +- [ ] **Step 2: Build** + +Run: `go build ./...` +Expected: compiles. + +- [ ] **Step 3: Manual smoke (documented, not automated)** + +Run: `./msgvault add-teams you@yourtenant.com` and complete the browser consent. +Expected: writes `~/.msgvault/tokens/teams_you@yourtenant.com.json` and prints the next-step hint. (Requires the Entra app from the spec.) + +- [ ] **Step 4: Format, vet, commit** + +```bash +go fmt ./... && go vet ./... +git add cmd/msgvault/cmd/add_teams.go +git commit -m "feat(teams): add-teams OAuth command" +``` + +--- + +## Task 13: `sync-teams` CLI command + +Template: `import_messenger.go` (store open, signal handling, summary, `rebuildCacheAfterWrite`). Wire the Graph token source from the `GraphManager`. + +**Files:** +- Create: `cmd/msgvault/cmd/sync_teams.go` + +- [ ] **Step 1: Implement the command** + +```go +package cmd + +import ( + "context" + "fmt" + "os" + "os/signal" + "syscall" + "time" + + "github.com/spf13/cobra" + "go.kenn.io/msgvault/internal/microsoft" + "go.kenn.io/msgvault/internal/store" + "go.kenn.io/msgvault/internal/teams" +) + +var ( + syncTeamsNoChannels bool + syncTeamsLimit int +) + +var syncTeamsCmd = &cobra.Command{ + Use: "sync-teams <email>", + Short: "Sync Microsoft Teams chats and channels (full or incremental)", + Args: cobra.ExactArgs(1), + RunE: func(cmd *cobra.Command, args []string) error { + email := args[0] + s, err := store.Open(cfg.DatabaseDSN()) + if err != nil { + return err + } + defer s.Close() + if err := s.InitSchema(); err != nil { + return err + } + if err := runStartupMigrations(s); err != nil { + return err + } + + mgr := microsoft.NewGraphManager(cfg.Microsoft.ClientID, cfg.Microsoft.EffectiveTenantID(), cfg.TokensDir(), logger()) + tokenFn, err := mgr.TokenSource(cmd.Context(), email) + if err != nil { + return err + } + + ctx, cancel := context.WithCancel(cmd.Context()) + defer cancel() + sigChan := make(chan os.Signal, 1) + signal.Notify(sigChan, 
syscall.SIGINT, syscall.SIGTERM) + defer signal.Stop(sigChan) + go func() { <-sigChan; cancel() }() + + client := teams.NewClient("https://graph.microsoft.com/v1.0", teams.TokenFunc(tokenFn), float64(cfg.Sync.RateLimitQPS)) + imp := teams.NewImporter(s, client) + sum, err := imp.Import(ctx, teams.ImportOptions{ + Email: email, + AttachmentsDir: cfg.AttachmentsDir(), + IncludeChannels: !syncTeamsNoChannels, + Limit: syncTeamsLimit, + }) + if ctx.Err() != nil { + fmt.Println("Interrupted — re-run sync-teams to resume.") + rebuildCacheAfterWrite(cfg.DatabaseDSN()) + return nil + } + if err != nil { + return err + } + fmt.Printf("Teams sync complete for %s in %s: %d chats, %d channels, %d messages added (%d reactions, %d attachments, %d inline images, %d errors)\n", + email, sum.Duration.Round(time.Second), sum.ChatsProcessed, sum.ChannelsProcessed, sum.MessagesAdded, sum.ReactionsAdded, sum.AttachmentsFound, sum.InlineImagesCopied, sum.Errors) + rebuildCacheAfterWrite(cfg.DatabaseDSN()) + return nil + }, +} + +func init() { + syncTeamsCmd.Flags().BoolVar(&syncTeamsNoChannels, "no-channels", false, "sync chats only (skip team channels)") + syncTeamsCmd.Flags().IntVar(&syncTeamsLimit, "limit", 0, "max messages per conversation (0 = no limit)") + rootCmd.AddCommand(syncTeamsCmd) +} +``` + +> Confirm: `cfg.Sync.RateLimitQPS` field name/type, `rebuildCacheAfterWrite` signature (research showed it takes a db path), `runStartupMigrations`, and that `teams.TokenFunc(tokenFn)` type-converts cleanly (both are `func(context.Context)(string,error)`). Adjust to match exact codebase names. + +- [ ] **Step 2: Build** + +Run: `go build ./...` +Expected: compiles. + +- [ ] **Step 3: Manual smoke** + +Run: `./msgvault sync-teams you@yourtenant.com --limit 50` +Expected: prints a summary; `./msgvault tui` shows Teams messages searchable. + +- [ ] **Step 4: Format, vet, commit** + +```bash +go fmt ./... && go vet ./... 
+git add cmd/msgvault/cmd/sync_teams.go +git commit -m "feat(teams): sync-teams command" +``` + +--- + +## Task 14: Scheduler integration (`serve`) + +Add a `"teams"` case to the daemon's per-source-type dispatch (`serve.go:runScheduledSync` switch ~`:418`, and `findScheduledSyncSource` ~`:462-469`). + +**Files:** +- Modify: `cmd/msgvault/cmd/serve.go` + +- [ ] **Step 1: Add the teams case** + +In `runScheduledSync`'s `switch sourceType` block, add before `default`: + +```go + case sourceTypeTeams: + err = runScheduledTeamsSync(ctx, email, s) +``` + +Add the helper (mirrors `runScheduledGmailSync` but returns only error since Teams has its own summary type): + +```go +func runScheduledTeamsSync(ctx context.Context, email string, s *store.Store) error { + mgr := microsoft.NewGraphManager(cfg.Microsoft.ClientID, cfg.Microsoft.EffectiveTenantID(), cfg.TokensDir(), logger()) + tokenFn, err := mgr.TokenSource(ctx, email) + if err != nil { + return err + } + client := teams.NewClient("https://graph.microsoft.com/v1.0", teams.TokenFunc(tokenFn), float64(cfg.Sync.RateLimitQPS)) + _, err = teams.NewImporter(s, client).Import(ctx, teams.ImportOptions{ + Email: email, + AttachmentsDir: cfg.AttachmentsDir(), + IncludeChannels: true, + }) + return err +} +``` + +Also add `sourceTypeTeams` to the inner switch in `findScheduledSyncSource` so the daemon recognizes Teams sources as schedulable. + +> The existing switch's `default` returns "only gmail and imap" — update that message to include teams, and confirm whether `runScheduledSync` shares a `summary` variable typed `*gmail.SyncSummary` (research said the gmail/imap helpers return that). The Teams helper deliberately returns only `error`; assign via a separate branch that doesn't touch `summary`, or refactor the shared variable. Keep the change minimal. + +- [ ] **Step 2: Build** + +Run: `go build ./...` +Expected: compiles. 
+ +- [ ] **Step 3: Verify the daemon recognizes teams sources** + +Run: `go vet ./cmd/...` and a quick `./msgvault serve --help` +Expected: no errors. (Full daemon e2e requires a live token; the importer is already covered by Task 9/10 tests.) + +- [ ] **Step 4: Format, vet, commit** + +```bash +go fmt ./... && go vet ./... +git add cmd/msgvault/cmd/serve.go +git commit -m "feat(teams): schedule Teams syncs in serve daemon" +``` + +--- + +## Task 15: Full suite + docs + +**Files:** +- Modify: `CLAUDE.md` (Quick Commands), `docs/superpowers/specs/2026-06-18-teams-ingestion-design.md` (status → implemented) + +- [ ] **Step 1: Run the whole test suite** + +Run: `make test` +Expected: PASS (SQLite). If a Postgres instance is available: `make test-pg`. + +- [ ] **Step 2: Lint** + +Run: `make lint-ci` +Expected: clean. + +- [ ] **Step 3: Document the commands** + +Add to `CLAUDE.md` under Quick Commands: + +```bash +./msgvault add-teams you@tenant.com # Authorize Teams (delegated Graph) +./msgvault sync-teams you@tenant.com # Sync Teams chats + channels +./msgvault sync-teams you@tenant.com --no-channels --limit 50 +``` + +- [ ] **Step 4: Commit** + +```bash +go fmt ./... +git add CLAUDE.md docs/superpowers/specs/2026-06-18-teams-ingestion-design.md +git commit -m "docs(teams): document Teams commands; mark spec implemented" +``` + +--- + +## Self-review notes (already applied) + +- **Spec coverage:** chats (Task 9), channels std+private (Task 10, private channels are ordinary channel objects), meeting chats (Task 9 — `meeting` chatType → group_chat), rich text/body_html + FTS (Tasks 6/9), reactions (Task 9), shared-file links (Task 9), inline images (Task 11), AAD→email participant resolution (Task 5), incremental cursors chats-vs-channels (Tasks 7/9/10), OAuth independence/IMAP-blocker (Task 2), CLI (Tasks 12/13), scheduler (Task 14). Transcripts intentionally excluded (separate spec). 
+- **Unverified store helpers flagged for confirmation during implementation:** `SetReplyTo` (likely new, code provided), added-vs-updated flag from `UpsertMessage`, exact private helper names in `microsoft/oauth.go`, `export.StoreAttachmentFile`/content-address helper signature, `cfg.Microsoft.EffectiveTenantID`, `rebuildCacheAfterWrite`. Each task names the thing to confirm and gives a working fallback. +- **Type consistency:** `TokenFunc`, `SyncState` accessors, `ImportOptions`/`ImportSummary` field names used consistently across tasks (note the ASCII fix for `InlineImagesCopied` in Task 3). +- **No `is_edited`/`mention` assumptions:** edits handled via re-upsert on `lastModifiedDateTime`; deletes via `MarkMessageDeleted`. `recipient_type='mention'` is not written in this plan (mentions are captured in raw JSON; promoting them to recipient rows is a deferred enhancement) — avoids relying on schema-only behavior. diff --git a/docs/superpowers/specs/2026-06-18-teams-ingestion-design.md b/docs/superpowers/specs/2026-06-18-teams-ingestion-design.md new file mode 100644 index 000000000..45186a739 --- /dev/null +++ b/docs/superpowers/specs/2026-06-18-teams-ingestion-design.md @@ -0,0 +1,274 @@ +# Microsoft Teams Ingestion — Design + +Date: 2026-06-18 +Status: Implemented on branch feat/teams-ingestion (chats, group chats, +channel posts/replies, meeting chats; transcripts remain deferred to +2026-06-18-teams-transcripts-design.md). + +## Goal + +Sync and store the user's Microsoft Teams messages — 1:1 chats, group +chats, channel posts/replies, and meeting chats — into msgvault so they +are searchable alongside Gmail/Outlook through the existing TUI, FTS, and +Parquet analytics. Rich text, reactions, @mentions, links, and inline +images are preserved; meeting transcripts are deferred to a separate spec. +Recordings are referenced by link, not downloaded. 
+ +## Decisions (from brainstorm) + +- **Account type:** Work/school (Entra/Azure AD) tenant where the user is + admin and can register an app and grant admin consent. +- **Data scope:** The user's *own* data only, via **delegated** OAuth. + No tenant-wide export, so Microsoft's metered Teams export billing is + not involved. +- **Content scope:** 1:1 & group chats, channel messages (standard + + private), and meeting chats. **Meeting transcripts are out of scope for + this build** — moved to a separate spec + (`2026-06-18-teams-transcripts-design.md`) after the load-bearing pass + showed delegated transcript content is effectively organizer-only and + unavailable for expired meetings. +- **Attachments:** Download small inline/pasted images (hosted content) + into content-addressed storage; for shared SharePoint/OneDrive files + store filename + link + metadata only (no bytes). Downloading shared + file bytes is out of scope — tracked in kata `c0gf` (preserve a + departing user's files before account removal). +- **Acquisition:** Live Microsoft Graph delta sync (Approach A), mirroring + the existing Gmail live-sync model. Transcripts, the most permission-sensitive + surface, are deferred to the separate transcripts spec (not a phase of this build). + +## Architecture + +``` +cmd/msgvault/cmd/sync_teams.go ← new CLI command(s) + │ +internal/teams/ ← new package (parallel to internal/whatsapp, internal/gmail) + ├── client.go Graph REST client (paging, 429 + Retry-After, $batch) + ├── sync.go orchestration: enumerate chats/channels → delta → persist + ├── messages.go map Graph chatMessage → store.Message (+ body, recipients, reactions) + ├── transcripts.go (phase 4) onlineMeetings → transcript text + └── checkpoint.go delta-link + cursor persistence + │ +internal/microsoft/oauth.go ← extended: add Graph delegated scopes + token source + │ +internal/store/... 
← reused as-is (UpsertMessage, EnsureConversationWithType, …) +``` + +Per sync run: authenticate (delegated token) → list chats + joined +teams/channels → for each conversation run a **delta query** from the +saved delta link → for each `chatMessage` build a `store.Message`, +resolve participants by UPN/email, persist body (HTML + derived text), +reactions, mentions, attachment metadata + inline images → save the new +delta link as the checkpoint. + +## Data model mapping + +Reuses the existing generic chat schema — **no new core tables**. + +| Teams concept | msgvault storage | +|---|---| +| Source (your account) | `sources` row, `source_type = "teams"`, `identifier = your UPN` | +| 1:1 / group chat | `conversations`, `conversation_type = "direct_chat"` / `"group_chat"` | +| Team channel | `conversations`, `conversation_type = "channel"`, `title = "Team / Channel"` | +| Meeting chat | `conversations`, `conversation_type = "group_chat"` (flagged as meeting) | +| `chatMessage` | `messages`, `message_type = "teams"`, `source_message_id = Graph id` | +| Reply (channel thread) | `reply_to_message_id` → root post | +| Sender | `message_recipients` `"from"`; other participants `"to"` | +| @mention | `message_recipients` `"mention"` | +| Rich text (HTML body) | `message_bodies.body_html` + derived `body_text`; FTS indexes text | +| Reactions (👍 ❤️ …) | `reactions` table | +| Inline pasted image | `attachments` (downloaded bytes, content-addressed) | +| Shared SharePoint/OneDrive file | `attachments` row: filename + link + metadata, **no bytes** | +| Raw message JSON | `message_raw`, `raw_format = "teams_json"` (re-parseable) | + +Mapping caveats from the load-bearing pass: + +- **Identities carry no email inline** (LB-D3). `from`/`mentions` give an + **AAD object id** + (often null) displayName. 
Resolution: add an + `aad_object_id` identifier type to `participant_identifiers`, and resolve + id → `mail`/`userPrincipalName` via `GET /users/{id}` (best-effort: mail + can be null, UPN ≠ SMTP always). Branch on `userIdentityType`: + `aadUser` → resolve; `emailUser` → the id *is* an email (free key); + `application` (bots/connectors) / `anonymousGuest` / `skypeUser` / + `azureCommunicationServicesUser` → no email, store id-only. +- **Inline images** need a **separate** `GET .../hostedContents/{id}/$value` + per image (`contentBytes` is null on the message/list read). Detect them + by the `hostedContents/{id}/$value` URL pattern (covers inline `<img>`, custom + emoji, card images, custom-reaction images), not just `<img>` elements. +- **Shared-file attachments** are `chatMessageAttachment` with + `contentType:"reference"` and a SharePoint `contentUrl`. Filter on + `contentType == "reference"` so cards/tabs/meeting refs aren't recorded + as files. + +Side effect: once AAD ids are resolved to `mail`, a person's Teams +messages and their Gmail/Outlook mail unify under one `participant`, +enabling cross-platform search of a single human. (The unification depends +on successful id→mail resolution, so it is best-effort for guests/bots.) + +## OAuth & permissions (delegated) + +Extend `internal/microsoft/oauth.go` (today IMAP-scoped) to also issue a +Graph token. Delegated scopes: + +- `Chat.Read` — your 1:1/group/meeting chats and their messages +- `ChannelMessage.Read.All` — messages in channels of teams you belong to +- `Team.ReadBasic.All`, `Channel.ReadBasic.All` — enumerate joined teams/channels +- `User.ReadBasic.All` — resolve Teams AAD user ids to email/display names +- `User.Read` — your identity; `offline_access` — refresh tokens + +(Transcript scopes `OnlineMeetings.Read` + `OnlineMeetingTranscript.Read.All` +belong to the separate transcripts spec, not this build.) 
+ +One app registered in Entra, admin consent granted once for the `.All` +scopes; client ID/secret placed in config like existing OAuth apps. + +> Implementation note (LB, codebase-verified): the existing +> `microsoft.Manager.TokenSource` validates that a token's `Scopes[0]` is +> an IMAP scope and rejects it otherwise. The Graph token path must branch +> around this IMAP-specific validation rather than reuse it unchanged. + +### IMAP and Teams are independent + +Outlook/IMAP and Teams can each be used alone or together: + +- **Separate token files:** IMAP uses `microsoft_<email>.json` (existing); + Teams uses `teams_<email>.json` (new). Each command requests only its own + scope set, so the consent screen shows only what that feature needs. +- **IMAP-only:** existing Outlook/IMAP `add-account`; `--teams` never + invoked; no Graph scopes requested or consented. +- **Teams-only:** `add-account --teams` consents to Graph scopes only; + no IMAP token created. +- **Both:** two independent grants; either can be revoked/re-run without + affecting the other. + +The only shared artifact is the Entra app registration (client ID): + +- **(a) One app, incremental consent (default):** the app lists both IMAP + and Graph permissions as *available*, but each flow requests only its + subset; a Teams-only user is never prompted for IMAP. One registration + to administer. +- **(b) Two apps (alternative):** fully separate registrations via the + existing `[oauth.apps.*]` named-app config. More isolation, more admin + overhead. + +Neither feature requires the other at runtime. + +## Incremental sync & checkpointing + +> Revised after the load-bearing pass (see findings below). Chats and +> channels use **different** mechanisms — there is no single delta cursor. + +**Chats (1:1 / group / meeting).** No delegated per-chat `delta` endpoint +exists (load-bearing A1). Use the list endpoint for both phases: + +- **Backfill:** `GET /me/chats/{id}/messages` paginated via + `@odata.nextLink`. 
(Residual risk LB-1: docs don't *guarantee* unbounded + history — validate live.) +- **Incremental:** `GET /me/chats/{id}/messages?$filter=lastModifiedDateTime gt {cursor}&$orderby=lastModifiedDateTime desc`. + Both `$filter` and `$orderby` **must** target the same property or the + filter is silently ignored. Cursor = the max `lastModifiedDateTime` seen + for that chat (a timestamp, **not** an `@odata.deltaLink`). + +**Channels.** The list endpoint has **no** date `$filter` (only +`$top`/`$expand`), so the chat approach doesn't transfer: + +- **Backfill:** `GET /teams/{team}/channels/{channel}/messages` (roots) + + `.../messages/{id}/replies` per root (2-level structure). Resumable via + `@odata.nextLink`. +- **Incremental:** prefer `/messages/delta` (returns `@odata.deltaLink`), + with a **fallback** to a full re-page + client-side dedupe by + `(id, lastModifiedDateTime)` if delta proves unavailable under delegated + consent (residual risk LB-2). Delta tokens can rot (HTTP 400/410) — on + that error, restart delta from scratch. + +**Checkpoint model.** `sync_checkpoints` / `cursor_before` JSON stores a +per-conversation cursor that is **either** a timestamp (chats) **or** a +deltaLink/nextLink (channels) — a small tagged-union, not a uniform delta +link. Honor 429 + `Retry-After` like the Gmail rate limiter. Reuse +`sync_runs` for resumable backfill, matching the fbmessenger/whatsapp +importers. + +**Edits/deletes.** Edits bump `lastModifiedDateTime` (caught by the chat +filter / channel delta) → `UpsertMessage` ON CONFLICT updates the row. +Deletes carry `deletedDateTime`; map to the existing soft-delete column. +Delta delete semantics under delegated consent are unverified — treat +delete capture as best-effort. + +## CLI & daemon integration + +- `msgvault add-account --teams` — delegated browser OAuth for Graph scopes. 
+- `msgvault sync-teams <email>` — full/incremental (auto-detected via stored + per-conversation cursors); `--after` / `--limit` for scoped first runs, + mirroring `sync-full`. +- Hooks into `msgvault serve` scheduled syncs so it runs alongside Gmail. +- Parquet cache / TUI / search require **no changes**: Teams messages flow + through the same `messages`/FTS path and are immediately searchable and + account-filterable. + +## Transcripts — moved to a separate spec + +Transcripts are **not** part of this build. The load-bearing pass found the +delegated transcript surface materially more constrained (organizer-only +content, expired-meeting gaps, an indirect `joinWebUrl` resolve step), so +it gets its own design: `2026-06-18-teams-transcripts-design.md` +(kata for that work tracked separately). The validated resolution path and +coverage limits are recorded there. + +## Testing + +- Table-driven tests with testify (`assert`/`require`), per project + conventions — no new `t.Errorf`/`t.Fatalf`. +- Unit tests over **recorded Graph JSON fixtures** (synthetic, no real + PII) for the `chatMessage` → `store.Message` mapping. +- e2e-style test running a fake Graph HTTP server through the full + sync → store → search path. +- Live Graph validated manually against the tenant during the + `/load-bearing` pass. + +## Load-bearing findings (validated 2026-06-18, Microsoft Learn docs) + +**Confirmed:** `/me/chats` lists all chat types under `Chat.Read`; +delegated own-data reads are **not metered** (export APIs de-metered +2025-08-25); channel list + `/replies` give full history under +`ChannelMessage.Read.All`; private channels readable by members; teams +enumerable via `/me/joinedTeams` + `/channels`; transcript fetch path, +VTT format and scopes confirmed; inline images via `hostedContents`; +reactions/mentions/attachments inline with message read; shared files via +`contentType:"reference"`. + +**Falsified vs. 
the original spec (corrected above):** + +- **A1** — no delegated per-chat `delta`; chats use list + `lastModifiedDateTime` + filter (with matching `$orderby`). +- **B2** — channel delta caps at ~8 months; backfill must use list + replies. +- **C1** — no chat→meeting nav and no meeting id on the chat; resolve via + `joinWebUrl` filter. +- **D3** — identities carry AAD id, not email; need `/users/{id}` resolution. + +**Residual risks — RESOLVED via live Graph Explorer probe (2026-06-19, +delegated, tenant "Ontempo NZ"):** + +- **LB-1 — VERIFIED.** `GET /me/chats/{id}/messages?$filter=createdDateTime lt 2025-09-01T00:00:00Z&$orderby=createdDateTime desc` + returned messages older than the ~8-month delta window (HTTP 200). The + per-chat list serves full history under delegated `Chat.Read`, so chat + backfill via the list endpoint is sound. +- **LB-2 — VERIFIED.** `GET /teams/{team}/channels/{channel}/messages/delta` + returned HTTP 200 with an `@odata.nextLink` under delegated + `ChannelMessage.Read.All` (the terminal `@odata.deltaLink` arrives after + paging through `nextLink` — standard delta behavior). The delegated delta + endpoint works; channel incremental sync can use it directly. (The + list+re-page fallback remains in the design for delta-token rot / 400-410 + recovery.) + +Consent learnings for the Entra app: enumerating channels needs +`Channel.ReadBasic.All` (in addition to `Team.ReadBasic.All` for teams); +`ChannelMessage.Read.All` is consented per the message/delta endpoints. + +(LB-3, delegated transcript access, moved with transcripts to the separate +spec.) + +## Out of scope + +- Meeting transcripts — separate spec `2026-06-18-teams-transcripts-design.md`. +- Tenant-wide / compliance export (metered app-only Graph APIs). +- Downloading shared SharePoint/OneDrive file bytes — kata `c0gf`. +- Downloading meeting recording video. 
diff --git a/docs/superpowers/specs/2026-06-18-teams-transcripts-design.md b/docs/superpowers/specs/2026-06-18-teams-transcripts-design.md new file mode 100644 index 000000000..65a0d9417 --- /dev/null +++ b/docs/superpowers/specs/2026-06-18-teams-transcripts-design.md @@ -0,0 +1,51 @@ +# Microsoft Teams Meeting Transcripts — Design (stub) + +Date: 2026-06-18 +Status: Deferred. Split out of the Teams ingestion spec +(`2026-06-18-teams-ingestion-design.md`) after the load-bearing pass. +Needs its own brainstorm + plan before implementation. + +## Why separate + +The main Teams ingestion build archives chats, channels, and meeting chats. +Transcripts were originally a final phase there, but the load-bearing pass +(2026-06-18, validated against Microsoft Learn docs) showed the delegated +transcript surface is materially more constrained and lower-certainty, so +it deserves its own design rather than riding along. + +## Validated facts (carry into the future design) + +- **No direct chat→meeting link, and no meeting id on the chat** (LB-C1, + confirmed). Resolution path: + 1. Read `chat.onlineMeetingInfo.joinWebUrl` from the meeting chat. + 2. `GET /me/onlineMeetings?$filter=JoinWebUrl eq '{url-encoded joinWebUrl}'` + (delegated `OnlineMeetings.Read`, no admin consent) → onlineMeeting + id. + 3. `GET /me/onlineMeetings/{id}/transcripts` then + `.../transcripts/{tid}/content?$format=text/vtt` + (delegated `OnlineMeetingTranscript.Read.All`, **admin consent**). +- **VTT** is the supported format (`.docx` deprecated 2023). + +## Coverage limits (the reason it's deferred) + +- **Organizer-only (LB-C3, leaning confirmed):** delegated transcript + *content* is effectively limited to meetings the signed-in user + organized. Attendee-token access is undocumented and reportedly 403s. + The application-permission model is organizer/RSC-centric. +- **Expired meetings:** transcripts are unavailable once a meeting has + expired — a real gap for a long-horizon archive. 
+- **Calendar-event association required:** ad-hoc meeting chats with no + scheduled event won't resolve. + +## Open questions for the future brainstorm + +- Is organizer-only coverage worth a delegated build, or should this use + **application permissions** (broader coverage, bigger consent/ops, and + the metered/payment caveat to re-verify)? +- LB-3 live probe: confirm whether an attendee token ever returns + transcript content, and the practical expiration window. +- Storage shape: transcript text as a message body on the meeting + conversation vs. a dedicated child record; link (not bytes) to recording. + +## Tracking + +kata: see the `teams` + `transcripts` labelled issue. diff --git a/internal/api/handlers.go b/internal/api/handlers.go index 4df02ff66..2bffd1724 100644 --- a/internal/api/handlers.go +++ b/internal/api/handlers.go @@ -151,6 +151,7 @@ type AttachmentInfo struct { Filename string `json:"filename"` MimeType string `json:"mime_type"` Size int64 `json:"size_bytes"` + URL string `json:"url,omitempty"` } // SearchResult represents search results. 
@@ -267,6 +268,7 @@ func messageDetailFromQuery(qMsg *query.MessageDetail) MessageDetail { Filename: att.Filename, MimeType: att.MimeType, Size: att.Size, + URL: att.URL, }) } diff --git a/internal/export/attachments.go b/internal/export/attachments.go index ff56be38a..dcf71aa81 100644 --- a/internal/export/attachments.go +++ b/internal/export/attachments.go @@ -63,6 +63,10 @@ func Attachments(zipFilename, attachmentsDir string, attachments []query.Attachm usedNames := make(map[string]int) for _, att := range attachments { + if att.URL != "" { + stats.Errors = append(stats.Errors, fmt.Sprintf("%s: URL-backed attachment is available at %s", att.Filename, att.URL)) + continue + } if err := ValidateContentHash(att.ContentHash); err != nil { stats.Errors = append(stats.Errors, fmt.Sprintf("%s: %v", att.Filename, err)) continue @@ -265,6 +269,10 @@ func AttachmentsToDir(outputDir, attachmentsDir string, attachments []query.Atta usedNames := make(map[string]int) for _, att := range attachments { + if att.URL != "" { + result.Errors = append(result.Errors, fmt.Sprintf("%s: URL-backed attachment is available at %s", att.Filename, att.URL)) + continue + } if err := ValidateContentHash(att.ContentHash); err != nil { result.Errors = append(result.Errors, fmt.Sprintf("%s: %v", att.Filename, err)) continue diff --git a/internal/export/attachments_test.go b/internal/export/attachments_test.go index 222e0ceec..c650fd0b8 100644 --- a/internal/export/attachments_test.go +++ b/internal/export/attachments_test.go @@ -162,6 +162,13 @@ func TestAttachmentsToDir(t *testing.T) { }, wantErrors: 1, }, + { + name: "url backed attachment is reported as link", + setup: func(_ *testing.T, _ string) []query.AttachmentInfo { + return []query.AttachmentInfo{{Filename: "deck.pptx", URL: "https://sp/deck.pptx"}} + }, + wantErrors: 1, + }, { name: "missing content file is reported", setup: func(_ *testing.T, _ string) []query.AttachmentInfo { @@ -444,6 +451,13 @@ func TestAttachments(t *testing.T) { 
}, wantSubstrings: []string{"file.txt: invalid content hash"}, }, + { + name: "url backed attachment reports link", + setup: func(_ *testing.T, _ string) []query.AttachmentInfo { + return []query.AttachmentInfo{{Filename: "deck.pptx", URL: "https://sp/deck.pptx"}} + }, + wantSubstrings: []string{"deck.pptx: URL-backed attachment is available at https://sp/deck.pptx"}, + }, { name: "single-char content hash is skipped", setup: func(_ *testing.T, _ string) []query.AttachmentInfo { diff --git a/internal/microsoft/graph_oauth.go b/internal/microsoft/graph_oauth.go new file mode 100644 index 000000000..a8171766a --- /dev/null +++ b/internal/microsoft/graph_oauth.go @@ -0,0 +1,285 @@ +package microsoft + +import ( + "context" + "encoding/json" + "fmt" + "log/slog" + "os" + "path/filepath" + "strings" + "sync" + "time" + + "go.kenn.io/msgvault/internal/fileutil" + "golang.org/x/oauth2" +) + +// Microsoft Graph delegated permission scopes for Teams ingestion. +const ( + scopeGraphChatRead = "https://graph.microsoft.com/Chat.Read" + scopeGraphChannelMessage = "https://graph.microsoft.com/ChannelMessage.Read.All" + scopeGraphTeamReadBasic = "https://graph.microsoft.com/Team.ReadBasic.All" + scopeGraphChannelBasic = "https://graph.microsoft.com/Channel.ReadBasic.All" + scopeGraphUserRead = "https://graph.microsoft.com/User.Read" + scopeGraphUserReadBasic = "https://graph.microsoft.com/User.ReadBasic.All" +) + +// GraphScopes returns the OAuth scopes requested for Microsoft Teams ingestion +// via the Graph API. Unlike the IMAP scopes, these are identical for personal +// and organizational accounts. 
+func GraphScopes() []string { + return []string{ + scopeGraphChatRead, scopeGraphChannelMessage, scopeGraphTeamReadBasic, + scopeGraphChannelBasic, scopeGraphUserRead, scopeGraphUserReadBasic, + scopeOfflineAccess, "openid", scopeEmail, + } +} + +// GraphManager is a sibling of Manager that runs the same interactive browser +// auth-code flow but requests Microsoft Graph scopes and persists tokens under +// a "teams_" filename prefix. It deliberately omits the IMAP scope-validation +// and IMAP-host logic of Manager. +// +// The heavy browser-flow and ID-token verification machinery is reused via an +// internal *Manager delegate; only token storage (filename prefix) and the +// scope set differ. This keeps Manager's external behavior unchanged. +type GraphManager struct { + clientID string + tenantID string + tokensDir string + logger *slog.Logger + + // Test hooks, mirrored onto the internal delegate. See Manager. + browserFlowFn func(ctx context.Context, email string, scopes []string) (*oauth2.Token, string, error) + verifyIDTokenFn func(ctx context.Context, rawIDToken string) (*idTokenClaims, error) +} + +// NewGraphManager constructs a GraphManager. An empty tenantID defaults to the +// multi-tenant "common" endpoint; a nil logger defaults to slog.Default(). +func NewGraphManager(clientID, tenantID, tokensDir string, logger *slog.Logger) *GraphManager { + if tenantID == "" { + tenantID = DefaultTenant + } + if logger == nil { + logger = slog.Default() + } + return &GraphManager{ + clientID: clientID, + tenantID: tenantID, + tokensDir: tokensDir, + logger: logger, + } +} + +// delegate builds an internal *Manager used only for its reusable browser-flow +// and ID-token verification logic. Token storage is handled by GraphManager +// itself (with the teams_ prefix), so the delegate's tokensDir is irrelevant. 
+func (m *GraphManager) delegate() *Manager { + return &Manager{ + clientID: m.clientID, + tenantID: m.tenantID, + tokensDir: m.tokensDir, + logger: m.logger, + browserFlowFn: m.browserFlowFn, + verifyIDTokenFn: m.verifyIDTokenFn, + } +} + +// TokenPath returns the on-disk location of the persisted Graph token for an +// account, namespaced with a "teams_" prefix to keep it distinct from the IMAP +// Manager's "microsoft_" tokens. +func (m *GraphManager) TokenPath(email string) string { + return filepath.Join(m.tokensDir, "teams_"+sanitizeEmail(email)+".json") +} + +// Authorize runs the interactive browser auth-code flow requesting Graph +// scopes, verifies the returned ID token matches the expected email, and +// persists the token. Unlike Manager.Authorize there is no IMAP scope +// correction step — Graph scopes are identical across account types. +func (m *GraphManager) Authorize(ctx context.Context, email string) error { + scopes := GraphScopes() + d := m.delegate() + token, nonce, err := d.doBrowserFlow(ctx, email, scopes) + if err != nil { + return err + } + _, claims, err := d.resolveTokenEmail(ctx, email, token, nonce) + if err != nil { + return err + } + tenantID := "" + if claims != nil { + tenantID = claims.TenantID + } + return m.saveToken(email, token, scopes, tenantID) +} + +// TokenSource loads the persisted Graph token and returns a function yielding a +// fresh (auto-refreshed) access token. The returned function is safe for +// concurrent use. There is NO IMAP scope validation and NO IMAP-host logic. +// +// Token refresh HTTP requests run against context.Background so they are not +// cancelled if the caller's context expires between calls; each attempt is +// bounded by tokenRefreshTimeout. 
+func (m *GraphManager) TokenSource(ctx context.Context, email string) (func(context.Context) (string, error), error) { + tf, err := m.loadTokenFile(email) + if err != nil { + return nil, fmt.Errorf("no valid token for %s: %w", email, err) + } + + scopes := tf.Scopes + if len(scopes) == 0 { + scopes = GraphScopes() + } else if missing := missingGraphScopes(scopes); len(missing) > 0 { + return nil, fmt.Errorf( + "token for %s is missing Microsoft Graph scopes %s — run 'msgvault add-teams %s' to re-authorize", + email, strings.Join(missing, ", "), email, + ) + } + + refreshTenant := m.tenantID + if tf.TenantID != "" { + refreshTenant = tf.TenantID + } + oauthCfg := m.delegate().oauthConfigWithTenant(refreshTenant, scopes) + // context.Background so refreshes outlive the caller's (sync-scoped) ctx. + ts := oauthCfg.TokenSource(context.Background(), &tf.Token) + + var ( + mu sync.Mutex + lastAccessToken = tf.AccessToken + lastRefreshToken = tf.RefreshToken + lastExpiry = tf.Expiry + ) + + return func(callCtx context.Context) (string, error) { + type tokenResult struct { + tok *oauth2.Token + err error + } + ch := make(chan tokenResult, 1) + go func() { + tok, err := ts.Token() + ch <- tokenResult{tok, err} + }() + + timer := time.NewTimer(tokenRefreshTimeout) + defer timer.Stop() + + var tok *oauth2.Token + select { + case res := <-ch: + if res.err != nil { + return "", fmt.Errorf("refresh Microsoft Graph token: %w", res.err) + } + tok = res.tok + case <-timer.C: + return "", fmt.Errorf("microsoft graph token refresh timed out after %s — check network connectivity", tokenRefreshTimeout) + case <-callCtx.Done(): + return "", fmt.Errorf("microsoft graph token refresh cancelled: %w", callCtx.Err()) + } + + mu.Lock() + changed := tok.AccessToken != lastAccessToken || + tok.RefreshToken != lastRefreshToken || + !tok.Expiry.Equal(lastExpiry) + if changed { + lastAccessToken = tok.AccessToken + lastRefreshToken = tok.RefreshToken + lastExpiry = tok.Expiry + } + mu.Unlock() + 
+ if changed { + if saveErr := m.saveToken(email, tok, scopes, tf.TenantID); saveErr != nil { + return "", fmt.Errorf("save refreshed microsoft graph token for %s: %w (token refreshed but not persisted — re-run may require re-authorization)", email, saveErr) + } + } + + return tok.AccessToken, nil + }, nil +} + +// HasToken reports whether a persisted Graph token exists for the account. +func (m *GraphManager) HasToken(email string) bool { + _, err := os.Stat(m.TokenPath(email)) + return err == nil +} + +// DeleteToken removes the local Graph token file. Missing files are not an +// error. (Graph refresh tokens expire naturally; no remote revocation here.) +func (m *GraphManager) DeleteToken(email string) error { + err := os.Remove(m.TokenPath(email)) + if os.IsNotExist(err) { + return nil + } + return err +} + +// saveToken atomically persists the token in the same on-disk JSON format as +// the IMAP Manager (tokenFile), under the teams_ filename. +func (m *GraphManager) saveToken(email string, token *oauth2.Token, scopes []string, tenantID string) error { + if err := fileutil.SecureMkdirAll(m.tokensDir, 0700); err != nil { + return err + } + + tf := tokenFile{Token: *token, Scopes: scopes, TenantID: tenantID} + data, err := json.MarshalIndent(tf, "", " ") + if err != nil { + return err + } + + path := m.TokenPath(email) + tmpFile, err := os.CreateTemp(m.tokensDir, ".teams-token-*.tmp") + if err != nil { + return fmt.Errorf("create temp token file: %w", err) + } + tmpPath := tmpFile.Name() + + if _, err := tmpFile.Write(data); err != nil { + _ = tmpFile.Close() + _ = os.Remove(tmpPath) + return fmt.Errorf("write temp token file: %w", err) + } + if err := tmpFile.Close(); err != nil { + _ = os.Remove(tmpPath) + return fmt.Errorf("close temp token file: %w", err) + } + if err := fileutil.SecureChmod(tmpPath, 0600); err != nil { + _ = os.Remove(tmpPath) + return fmt.Errorf("chmod temp token file: %w", err) + } + if err := os.Rename(tmpPath, path); err != nil { + _ = 
os.Remove(tmpPath) + return fmt.Errorf("rename temp token file: %w", err) + } + return nil +} + +func (m *GraphManager) loadTokenFile(email string) (*tokenFile, error) { + path := m.TokenPath(email) + data, err := os.ReadFile(path) + if err != nil { + return nil, err + } + var tf tokenFile + if err := json.Unmarshal(data, &tf); err != nil { + return nil, err + } + return &tf, nil +} + +func missingGraphScopes(scopes []string) []string { + have := make(map[string]struct{}, len(scopes)) + for _, scope := range scopes { + have[scope] = struct{}{} + } + var missing []string + for _, scope := range GraphScopes() { + if _, ok := have[scope]; !ok { + missing = append(missing, scope) + } + } + return missing +} diff --git a/internal/microsoft/graph_oauth_test.go b/internal/microsoft/graph_oauth_test.go new file mode 100644 index 000000000..a1390629b --- /dev/null +++ b/internal/microsoft/graph_oauth_test.go @@ -0,0 +1,172 @@ +package microsoft + +import ( + "context" + "log/slog" + "path/filepath" + "sync" + "testing" + + assertpkg "github.com/stretchr/testify/assert" + requirepkg "github.com/stretchr/testify/require" + "golang.org/x/oauth2" +) + +func TestGraphTokenPath(t *testing.T) { + dir := filepath.Join("tmp", "tokens") + m := &GraphManager{tokensDir: dir} + assertpkg.Equal(t, filepath.Join(dir, "teams_user@example.com.json"), m.TokenPath("user@example.com")) +} + +func TestGraphScopes(t *testing.T) { + assert := assertpkg.New(t) + got := GraphScopes() + assert.Contains(got, "https://graph.microsoft.com/Chat.Read") + assert.Contains(got, "https://graph.microsoft.com/ChannelMessage.Read.All") + assert.Contains(got, "https://graph.microsoft.com/Team.ReadBasic.All") + assert.Contains(got, "https://graph.microsoft.com/Channel.ReadBasic.All") + assert.Contains(got, "https://graph.microsoft.com/User.Read") + assert.Contains(got, "https://graph.microsoft.com/User.ReadBasic.All") + assert.Contains(got, scopeOfflineAccess) +} + +func TestNewGraphManager_DefaultsTenant(t 
*testing.T) { + m := NewGraphManager("client", "", "tmp/tokens", nil) + assertpkg.Equal(t, DefaultTenant, m.tenantID, "tenantID should default to common") + requirepkg.NotNil(t, m.logger, "logger should default") +} + +func TestGraphManager_SaveLoadHasToken(t *testing.T) { + require := requirepkg.New(t) + assert := assertpkg.New(t) + dir := t.TempDir() + m := NewGraphManager("client", "common", dir, slog.Default()) + + assert.False(m.HasToken("user@example.com"), "HasToken false before save") + + token := &oauth2.Token{AccessToken: "a", RefreshToken: "r", TokenType: "Bearer"} + require.NoError(m.saveToken("user@example.com", token, GraphScopes(), "tid-1")) + + assert.True(m.HasToken("user@example.com"), "HasToken true after save") + + tf, err := m.loadTokenFile("user@example.com") + require.NoError(err) + assert.Equal("a", tf.AccessToken, "AccessToken") + assert.Equal("tid-1", tf.TenantID, "TenantID") + assert.Contains(tf.Scopes, "https://graph.microsoft.com/Chat.Read", "Graph scope persisted") + + // The IMAP Manager looks up a microsoft_-prefixed file, so it must not find the teams_ token saved above. + imap := &Manager{tokensDir: dir} + imapTf, err := imap.loadTokenFile("user@example.com") + require.Error(err, "IMAP Manager uses microsoft_ prefix, should not find teams_ file") + _ = imapTf +} + +func TestGraphManager_Authorize_PersistsGraphToken(t *testing.T) { + require := requirepkg.New(t) + assert := assertpkg.New(t) + dir := t.TempDir() + m := NewGraphManager("test-client", "common", dir, slog.Default()) + m.verifyIDTokenFn = testVerifyFn + + var gotScopes []string + m.browserFlowFn = func(_ context.Context, email string, scopes []string) (*oauth2.Token, string, error) { + gotScopes = scopes + idToken := makeIDToken(t, map[string]any{"email": email, "tid": "org-tid"}) + tok := (&oauth2.Token{AccessToken: "graph-access", RefreshToken: "graph-refresh", TokenType: "Bearer"}). 
+ WithExtra(map[string]any{"id_token": idToken}) + return tok, "test-nonce", nil + } + + require.NoError(m.Authorize(t.Context(), "user@company.com")) + + // Graph scopes requested (no IMAP scope correction logic). + assert.Contains(gotScopes, "https://graph.microsoft.com/Chat.Read", "requested Graph scope") + assert.NotContains(gotScopes, ScopeIMAPOrg, "must not request IMAP scope") + + tf, err := m.loadTokenFile("user@company.com") + require.NoError(err) + assert.Equal("graph-access", tf.AccessToken, "AccessToken") + assert.Equal("org-tid", tf.TenantID, "TenantID persisted") + assert.Contains(tf.Scopes, "https://graph.microsoft.com/Chat.Read", "Graph scope persisted") +} + +func TestGraphManager_Authorize_Mismatch(t *testing.T) { + dir := t.TempDir() + m := NewGraphManager("test-client", "common", dir, slog.Default()) + m.verifyIDTokenFn = testVerifyFn + m.browserFlowFn = func(_ context.Context, _ string, _ []string) (*oauth2.Token, string, error) { + idToken := makeIDToken(t, map[string]any{"email": "other@example.com"}) + tok := (&oauth2.Token{AccessToken: "x", TokenType: "Bearer"}). + WithExtra(map[string]any{"id_token": idToken}) + return tok, "nonce", nil + } + err := m.Authorize(t.Context(), "user@company.com") + requirepkg.Error(t, err, "expected mismatch error") + mismatch := &TokenMismatchError{} + assertpkg.ErrorAs(t, err, &mismatch, "expected *TokenMismatchError") +} + +func TestGraphManager_TokenSource_NoIMAPValidation(t *testing.T) { + dir := t.TempDir() + m := NewGraphManager("test-client", "common", dir, slog.Default()) + + // Save a Graph token. There is no IMAP scope; the IMAP Manager would + // reject this, but GraphManager must accept it. 
+ token := &oauth2.Token{AccessToken: "graph-access", RefreshToken: "graph-refresh", TokenType: "Bearer"} + requirepkg.NoError(t, m.saveToken("user@company.com", token, GraphScopes(), "org-tid")) + + ts, err := m.TokenSource(t.Context(), "user@company.com") + requirepkg.NoError(t, err) + requirepkg.NotNil(t, ts, "TokenSource returned nil") +} + +func TestGraphManager_TokenSource_StaleGraphScopesReturnsError(t *testing.T) { + require := requirepkg.New(t) + dir := t.TempDir() + m := NewGraphManager("test-client", "common", dir, slog.Default()) + + token := &oauth2.Token{AccessToken: "graph-access", RefreshToken: "graph-refresh", TokenType: "Bearer"} + oldScopes := []string{ + "https://graph.microsoft.com/Chat.Read", + "https://graph.microsoft.com/ChannelMessage.Read.All", + "https://graph.microsoft.com/Team.ReadBasic.All", + "https://graph.microsoft.com/Channel.ReadBasic.All", + "https://graph.microsoft.com/User.Read", + scopeOfflineAccess, + "openid", + scopeEmail, + } + require.NoError(m.saveToken("user@company.com", token, oldScopes, "org-tid")) + + _, err := m.TokenSource(t.Context(), "user@company.com") + require.Error(err, "expected stale Graph scope error") + require.ErrorContains(err, "missing Microsoft Graph scopes") + require.ErrorContains(err, "User.ReadBasic.All") + require.ErrorContains(err, "msgvault add-teams user@company.com") +} + +func TestGraphManager_TokenSource_MissingToken(t *testing.T) { + m := NewGraphManager("test-client", "common", t.TempDir(), slog.Default()) + _, err := m.TokenSource(t.Context(), "nobody@example.com") + requirepkg.Error(t, err, "expected error for missing token") + assertpkg.ErrorContains(t, err, "no valid token") +} + +func TestGraphManager_TokenSource_Concurrent(t *testing.T) { + dir := t.TempDir() + m := NewGraphManager("test-client", "common", dir, slog.Default()) + token := &oauth2.Token{AccessToken: "graph-access", RefreshToken: "graph-refresh", TokenType: "Bearer"} + requirepkg.NoError(t, 
m.saveToken("user@company.com", token, GraphScopes(), "org-tid")) + + fn, err := m.TokenSource(t.Context(), "user@company.com") + requirepkg.NoError(t, err) + + var wg sync.WaitGroup + for range 10 { + wg.Go(func() { + _, _ = fn(t.Context()) + }) + } + wg.Wait() +} diff --git a/internal/query/duckdb_text.go b/internal/query/duckdb_text.go index 6031236d6..d867a8944 100644 --- a/internal/query/duckdb_text.go +++ b/internal/query/duckdb_text.go @@ -14,7 +14,7 @@ var _ TextEngine = (*DuckDBEngine)(nil) // textTypeFilter returns a SQL condition restricting to text message types. func textTypeFilter() string { - return "msg.message_type IN ('whatsapp','imessage','sms','mms','google_voice_text')" + return "msg.message_type IN ('whatsapp','imessage','sms','mms','google_voice_text','teams')" } // textSenderJoin resolves the sending participant (p_sender) for each text @@ -444,7 +444,7 @@ func (e *DuckDBEngine) TextSearch( LEFT JOIN participants p ON p.id = m.sender_id LEFT JOIN conversations c ON c.id = m.conversation_id WHERE messages_fts MATCH ? - AND m.message_type IN ('whatsapp','imessage','sms','mms','google_voice_text') + AND m.message_type IN ('whatsapp','imessage','sms','mms','google_voice_text','teams') AND %s ORDER BY m.sent_at DESC LIMIT ? OFFSET ? diff --git a/internal/query/models.go b/internal/query/models.go index 586b0307f..8015de5f6 100644 --- a/internal/query/models.go +++ b/internal/query/models.go @@ -89,6 +89,7 @@ type AttachmentInfo struct { MimeType string Size int64 ContentHash string + URL string } // ViewType represents the type of aggregate view. diff --git a/internal/query/shared.go b/internal/query/shared.go index 5fea99158..b06b6ca67 100644 --- a/internal/query/shared.go +++ b/internal/query/shared.go @@ -240,7 +240,7 @@ func fetchParticipantsShared(ctx context.Context, db *sql.DB, rebind rebindFunc, // rebind rewrites the ? placeholders for the driver in use. 
func fetchAttachmentsShared(ctx context.Context, db *sql.DB, rebind rebindFunc, tablePrefix string, msg *MessageDetail) error { rows, err := db.QueryContext(ctx, rebind(fmt.Sprintf(` - SELECT id, COALESCE(filename, ''), COALESCE(mime_type, ''), COALESCE(size, 0), COALESCE(content_hash, '') + SELECT id, COALESCE(filename, ''), COALESCE(mime_type, ''), COALESCE(size, 0), COALESCE(content_hash, ''), COALESCE(storage_path, '') FROM %sattachments WHERE message_id = ? `, tablePrefix)), msg.ID) @@ -251,15 +251,24 @@ func fetchAttachmentsShared(ctx context.Context, db *sql.DB, rebind rebindFunc, for rows.Next() { var att AttachmentInfo - if err := rows.Scan(&att.ID, &att.Filename, &att.MimeType, &att.Size, &att.ContentHash); err != nil { + var storagePath string + if err := rows.Scan(&att.ID, &att.Filename, &att.MimeType, &att.Size, &att.ContentHash, &storagePath); err != nil { return err } + if isURLStoragePath(storagePath) { + att.URL = storagePath + att.ContentHash = "" + } msg.Attachments = append(msg.Attachments, att) } return rows.Err() } +func isURLStoragePath(path string) bool { + return strings.HasPrefix(path, "http://") || strings.HasPrefix(path, "https://") +} + // extractBodyFromRawShared extracts text body from compressed MIME data. // tablePrefix is "" for direct SQLite or "sqlite_db." for DuckDB's sqlite_scan. // rebind rewrites the ? placeholders for the driver in use. diff --git a/internal/query/sqlite.go b/internal/query/sqlite.go index 9670df1ff..cc03e6298 100644 --- a/internal/query/sqlite.go +++ b/internal/query/sqlite.go @@ -975,17 +975,22 @@ func (e *SQLiteEngine) getMessageByQuery(ctx context.Context, whereClause string // GetAttachment retrieves attachment metadata by ID. 
func (e *SQLiteEngine) GetAttachment(ctx context.Context, id int64) (*AttachmentInfo, error) { var att AttachmentInfo + var storagePath string err := e.queryRowContext(ctx, ` - SELECT id, COALESCE(filename, ''), COALESCE(mime_type, ''), COALESCE(size, 0), COALESCE(content_hash, '') + SELECT id, COALESCE(filename, ''), COALESCE(mime_type, ''), COALESCE(size, 0), COALESCE(content_hash, ''), COALESCE(storage_path, '') FROM attachments WHERE id = ? - `, id).Scan(&att.ID, &att.Filename, &att.MimeType, &att.Size, &att.ContentHash) + `, id).Scan(&att.ID, &att.Filename, &att.MimeType, &att.Size, &att.ContentHash, &storagePath) if err == sql.ErrNoRows { return nil, nil //nolint:nilnil // Engine.GetAttachment uses (nil, nil) for not-found; callers branch on the nil result } if err != nil { return nil, fmt.Errorf("get attachment: %w", err) } + if isURLStoragePath(storagePath) { + att.URL = storagePath + att.ContentHash = "" + } return &att, nil } diff --git a/internal/query/sqlite_crud_test.go b/internal/query/sqlite_crud_test.go index 13e0ea665..0977a2097 100644 --- a/internal/query/sqlite_crud_test.go +++ b/internal/query/sqlite_crud_test.go @@ -256,6 +256,45 @@ func TestGetMessageWithAttachments(t *testing.T) { assert.True(found, "expected to find doc.pdf attachment") } +func TestGetMessageWithURLBackedAttachment(t *testing.T) { + require := requirepkg.New(t) + assert := assertpkg.New(t) + env := newTestEnv(t) + + _, err := env.DB.Exec(` + INSERT INTO attachments (message_id, filename, mime_type, size, content_hash, storage_path) + VALUES (1, 'deck.pptx', 'reference', 0, '0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef', 'https://sp/deck.pptx') + `) + require.NoError(err, "insert URL-backed attachment") + + msg, err := env.Engine.GetMessage(env.Ctx, 1) + require.NoError(err, "GetMessage") + require.Len(msg.Attachments, 1) + assert.Equal("deck.pptx", msg.Attachments[0].Filename) + assert.Empty(msg.Attachments[0].ContentHash) + 
assert.Equal("https://sp/deck.pptx", msg.Attachments[0].URL) +} + +func TestGetAttachmentClearsURLBackedContentHash(t *testing.T) { + require := requirepkg.New(t) + assert := assertpkg.New(t) + env := newTestEnv(t) + + result, err := env.DB.Exec(` + INSERT INTO attachments (message_id, filename, mime_type, size, content_hash, storage_path) + VALUES (1, 'recording.mp4', 'video/mp4', 0, 'abcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcd', 'https://sp/recording.mp4') + `) + require.NoError(err, "insert URL-backed attachment") + attID, err := result.LastInsertId() + require.NoError(err, "LastInsertId") + + att, err := env.Engine.GetAttachment(env.Ctx, attID) + require.NoError(err, "GetAttachment") + require.NotNil(att) + assert.Empty(att.ContentHash) + assert.Equal("https://sp/recording.mp4", att.URL) +} + func TestGetMessageBySourceID(t *testing.T) { env := newTestEnv(t) diff --git a/internal/query/sqlite_text.go b/internal/query/sqlite_text.go index 8396e8f4f..fab643dca 100644 --- a/internal/query/sqlite_text.go +++ b/internal/query/sqlite_text.go @@ -44,13 +44,13 @@ func parseSQLiteTimestamp(s string) (time.Time, error) { // textMsgTypeFilter returns a SQL condition restricting to text message types. // Uses the m. table alias used in text query methods. func textMsgTypeFilter() string { - return "m.message_type IN ('whatsapp','imessage','sms','mms','google_voice_text')" + return "m.message_type IN ('whatsapp','imessage','sms','mms','google_voice_text','teams')" } // textMsgTypeFilterAlias returns a SQL condition restricting to text message types // using the given table alias. 
func textMsgTypeFilterAlias(alias string) string { - return alias + ".message_type IN ('whatsapp','imessage','sms','mms','google_voice_text')" + return alias + ".message_type IN ('whatsapp','imessage','sms','mms','google_voice_text','teams')" } func sqliteDirection(d SortDirection) string { @@ -453,7 +453,7 @@ func (e *SQLiteEngine) TextSearch( LEFT JOIN participants p ON p.id = m.sender_id LEFT JOIN conversations c ON c.id = m.conversation_id WHERE fts.messages_fts MATCH ? - AND m.message_type IN ('whatsapp','imessage','sms','mms','google_voice_text') + AND m.message_type IN ('whatsapp','imessage','sms','mms','google_voice_text','teams') AND %s ORDER BY m.sent_at DESC LIMIT ? OFFSET ? diff --git a/internal/query/text_models.go b/internal/query/text_models.go index dc9d7572d..2e6b1eea8 100644 --- a/internal/query/text_models.go +++ b/internal/query/text_models.go @@ -95,7 +95,7 @@ type TextStatsOptions struct { // TextMessageTypes lists the message_type values included in Texts mode. var TextMessageTypes = []string{ - "whatsapp", "imessage", "sms", "mms", "google_voice_text", + "whatsapp", "imessage", "sms", "mms", "google_voice_text", "teams", } // textSortFieldToSortField converts a TextSortField to the generic SortField diff --git a/internal/query/text_search_live_test.go b/internal/query/text_search_live_test.go index 3637e0d08..530fbee6e 100644 --- a/internal/query/text_search_live_test.go +++ b/internal/query/text_search_live_test.go @@ -106,7 +106,7 @@ func TestSQLiteEngine_TextSearch_ExcludesSourceDeleted(t *testing.T) { assertpkg.Empty(t, results, "want 0 results after source delete") } -func TestTextModeIncludesMMSAndExcludesSynctechCalls(t *testing.T) { +func TestTextModeIncludesTeamsAndMMSAndExcludesSynctechCalls(t *testing.T) { assert := assertpkg.New(t) db, _ := openTextSearchDB(t) engine := NewSQLiteEngine(db) @@ -114,7 +114,8 @@ func TestTextModeIncludesMMSAndExcludesSynctechCalls(t *testing.T) { insertTextSearchMessage(t, db, 2, "sms", "sms body") 
insertTextSearchMessage(t, db, 3, "mms", "mms body") - insertTextSearchMessage(t, db, 4, "synctech_sms_call", "missed call body") + insertTextSearchMessage(t, db, 4, "teams", "teams body") + insertTextSearchMessage(t, db, 5, "synctech_sms_call", "missed call body") results, err := engine.TextSearch(ctx, "body", 10, 0) requirepkg.NoError(t, err, "TextSearch") @@ -124,11 +125,13 @@ func TestTextModeIncludesMMSAndExcludesSynctechCalls(t *testing.T) { } assert.Contains(types, "sms", "text mode should include sms") assert.Contains(types, "mms", "text mode should include mms") + assert.Contains(types, "teams", "text mode should include teams") assert.NotContains(types, "synctech_sms_call", "text mode should not include call log") } -func TestIsTextMessageTypeIncludesMMSAndExcludesSynctechCalls(t *testing.T) { +func TestIsTextMessageTypeIncludesTeamsAndMMSAndExcludesSynctechCalls(t *testing.T) { assertpkg.True(t, IsTextMessageType("mms"), "mms should be a text message type") + assertpkg.True(t, IsTextMessageType("teams"), "teams should be a text message type") assertpkg.False(t, IsTextMessageType("synctech_sms_call"), "synctech_sms_call should not be a text message type") } diff --git a/internal/remote/engine.go b/internal/remote/engine.go index d0fbe27fa..735a02874 100644 --- a/internal/remote/engine.go +++ b/internal/remote/engine.go @@ -515,6 +515,7 @@ func (e *Engine) GetMessage(ctx context.Context, id int64) (*query.MessageDetail Filename: att.Filename, MimeType: att.MimeType, Size: att.Size, + URL: att.URL, }) } diff --git a/internal/remote/store.go b/internal/remote/store.go index a63904b74..de07c652f 100644 --- a/internal/remote/store.go +++ b/internal/remote/store.go @@ -192,6 +192,7 @@ type attachmentResponse struct { Filename string `json:"filename"` MimeType string `json:"mime_type"` Size int64 `json:"size_bytes"` + URL string `json:"url,omitempty"` } // listMessagesResponse matches the API list messages response. 
@@ -301,6 +302,7 @@ func (s *Store) GetMessage(id int64) (*store.APIMessage, error) { Filename: a.Filename, MimeType: a.MimeType, Size: a.Size, + URL: a.URL, } } msg.Attachments = attachments diff --git a/internal/search/parser.go b/internal/search/parser.go index 9d0c536f6..d960451f8 100644 --- a/internal/search/parser.go +++ b/internal/search/parser.go @@ -23,7 +23,7 @@ type Query struct { LargerThan *int64 // larger: filter (bytes) SmallerThan *int64 // smaller: filter (bytes) AccountIDs []int64 // in: account filter (one or more source IDs) - MessageTypes []string // message_type filter (e.g. sms, mms, whatsapp) + MessageTypes []string // message_type filter (e.g. sms, mms, whatsapp, teams) HideDeleted bool // exclude messages where deleted_from_source_at IS NOT NULL } diff --git a/internal/store/api.go b/internal/store/api.go index 639bcf1c6..6a468a5fa 100644 --- a/internal/store/api.go +++ b/internal/store/api.go @@ -62,6 +62,7 @@ type APIAttachment struct { Filename string MimeType string Size int64 + URL string } // ListMessages returns a paginated list of messages with batch-loaded recipients and labels. 
@@ -209,12 +210,16 @@ func (s *Store) GetMessage(id int64) (*APIMessage, error) { } // Get attachments - attRows, err := s.db.Query("SELECT filename, mime_type, size FROM attachments WHERE message_id = ?", id) + attRows, err := s.db.Query("SELECT filename, mime_type, size, storage_path FROM attachments WHERE message_id = ?", id) if err == nil { defer func() { _ = attRows.Close() }() for attRows.Next() { var att APIAttachment - if err := attRows.Scan(&att.Filename, &att.MimeType, &att.Size); err == nil { + var storagePath string + if err := attRows.Scan(&att.Filename, &att.MimeType, &att.Size, &storagePath); err == nil { + if strings.HasPrefix(storagePath, "http://") || strings.HasPrefix(storagePath, "https://") { + att.URL = storagePath + } m.Attachments = append(m.Attachments, att) } } diff --git a/internal/store/messages.go b/internal/store/messages.go index 60e6aba4f..57d138bce 100644 --- a/internal/store/messages.go +++ b/internal/store/messages.go @@ -10,6 +10,7 @@ import ( "io" "log/slog" "math/rand" + "regexp" "strings" "time" @@ -127,6 +128,73 @@ func (s *Store) UpdateMessageOnDedup( }) } +// MigrateSourceMessageID rewrites a legacy source_message_id to a new value +// for one conversation. If the new ID already exists, dependents are repointed +// and the legacy row is removed so future imports converge on the new key. +func (s *Store) MigrateSourceMessageID(sourceID, conversationID int64, legacySourceMessageID, newSourceMessageID string) error { + if legacySourceMessageID == "" || legacySourceMessageID == newSourceMessageID { + return nil + } + return s.withTx(func(tx *loggedTx) error { + var newID int64 + err := tx.QueryRow( + `SELECT id FROM messages WHERE source_id = ? 
AND source_message_id = ?`, + sourceID, newSourceMessageID, + ).Scan(&newID) + if err != nil && !errors.Is(err, sql.ErrNoRows) { + return fmt.Errorf("find migrated message id: %w", err) + } + if err == nil { + if _, err = tx.Exec( + `UPDATE messages SET deleted_from_source_at = NULL WHERE id = ?`, + newID, + ); err != nil { + return fmt.Errorf("clear migrated message deletion marker: %w", err) + } + + var legacyID int64 + legacyErr := tx.QueryRow( + `SELECT id FROM messages + WHERE source_id = ? AND conversation_id = ? AND source_message_id = ?`, + sourceID, conversationID, legacySourceMessageID, + ).Scan(&legacyID) + if legacyErr != nil && !errors.Is(legacyErr, sql.ErrNoRows) { + return fmt.Errorf("find legacy message id: %w", legacyErr) + } + if legacyErr == nil { + if _, err = tx.Exec( + `UPDATE messages SET reply_to_message_id = ? + WHERE reply_to_message_id = ?`, + newID, legacyID, + ); err != nil { + return fmt.Errorf("repoint legacy replies: %w", err) + } + } + + _, err = tx.Exec( + `DELETE FROM messages + WHERE source_id = ? AND conversation_id = ? AND source_message_id = ?`, + sourceID, conversationID, legacySourceMessageID, + ) + if err != nil { + return fmt.Errorf("delete legacy source_message_id: %w", err) + } + return nil + } + + _, err = tx.Exec( + `UPDATE messages + SET source_message_id = ?, deleted_from_source_at = NULL + WHERE source_id = ? AND conversation_id = ? AND source_message_id = ?`, + newSourceMessageID, sourceID, conversationID, legacySourceMessageID, + ) + if err != nil { + return fmt.Errorf("migrate source_message_id: %w", err) + } + return nil + }) +} + // MessageExistsWithRawBatch checks which message IDs already exist in the database // and have raw MIME data stored. // Returns a map of source_message_id -> internal message_id. @@ -806,6 +874,17 @@ func (s *Store) RemoveMessageLabels(messageID int64, labelIDs []int64) error { `DELETE FROM message_labels WHERE message_id = ? 
AND label_id IN (%s)`) } +// SetReplyTo links a channel reply to its parent by resolving the parent's +// source_message_id to its internal messages.id within the same source. +func (s *Store) SetReplyTo(sourceID int64, childSourceMessageID, parentSourceMessageID string) error { + _, err := s.db.Exec(s.dialect.Rebind(` + UPDATE messages SET reply_to_message_id = + (SELECT id FROM messages WHERE source_id = ? AND source_message_id = ?) + WHERE source_id = ? AND source_message_id = ?`), + sourceID, parentSourceMessageID, sourceID, childSourceMessageID) + return err +} + // MarkMessageDeleted marks a message as deleted from the source. func (s *Store) MarkMessageDeleted(sourceID int64, sourceMessageID string) error { _, err := s.db.Exec(fmt.Sprintf(` @@ -1260,6 +1339,134 @@ func (s *Store) RecomputeConversationStats(sourceID int64) error { return nil } +// ForEachTeamsHostedContentBody invokes fn with (messageID, bodyHTML) for every +// message of the given source whose HTML body contains a hostedContents URL, so +// callers can re-fetch inline media. +func (s *Store) ForEachTeamsHostedContentBody(sourceID int64, fn func(messageID int64, bodyHTML string) error) error { + return s.forEachHostedContentBody(` + SELECT mb.message_id, mb.body_html + FROM message_bodies mb + JOIN messages m ON m.id = mb.message_id + WHERE m.source_id = ? AND mb.body_html LIKE '%hostedContents%' + `, sourceID, fn) +} + +// ForEachTeamsIncompleteHostedContentBody is like ForEachTeamsHostedContentBody +// but yields only messages whose number of distinct hostedContents references +// in body_html exceeds the count of inline image files already stored for them +// — i.e. messages whose inline media was not fully downloaded (transient fetch +// failures). Used to retry just the gaps instead of re-fetching everything. 
+func (s *Store) ForEachTeamsIncompleteHostedContentBody(sourceID int64, fn func(messageID int64, bodyHTML string) error) error { + type bodyRow struct { + id int64 + body string + } + var buf []bodyRow + + rows, err := s.db.Query(` + SELECT mb.message_id, mb.body_html, + (SELECT COUNT(*) FROM attachments a + WHERE a.message_id = mb.message_id + AND a.storage_path NOT LIKE 'http%' AND a.storage_path != '' + AND a.content_hash != '') + FROM message_bodies mb + JOIN messages m ON m.id = mb.message_id + WHERE m.source_id = ? AND mb.body_html LIKE '%hostedContents%' + `, sourceID) + if err != nil { + return err + } + for rows.Next() { + var messageID int64 + var bodyHTML sql.NullString + var localAttachmentRows int + if err := rows.Scan(&messageID, &bodyHTML, &localAttachmentRows); err != nil { + _ = rows.Close() + return err + } + if !bodyHTML.Valid || bodyHTML.String == "" { + continue + } + if countDistinctHostedContentRefs(bodyHTML.String) > localAttachmentRows { + buf = append(buf, bodyRow{id: messageID, body: bodyHTML.String}) + } + } + if err := rows.Err(); err != nil { + _ = rows.Close() + return err + } + if err := rows.Close(); err != nil { + return err + } + for _, r := range buf { + if err := fn(r.id, r.body); err != nil { + return err + } + } + return nil +} + +var teamsHostedContentURLRe = regexp.MustCompile(`https?://[^"'\s)]+/hostedContents/[^"'\s)]+/\$value`) + +func countDistinctHostedContentRefs(bodyHTML string) int { + refs := teamsHostedContentURLRe.FindAllString(bodyHTML, -1) + if len(refs) == 0 { + return 0 + } + seen := make(map[string]struct{}, len(refs)) + for _, ref := range refs { + seen[ref] = struct{}{} + } + return len(seen) +} + +// forEachHostedContentBody runs query (a single ? = sourceID, selecting +// message_id, body_html) and invokes fn per row. The matching rows are read +// fully and the read cursor is closed BEFORE any callback runs: callers +// typically write (e.g. 
UpsertAttachment) inside fn, and holding a streaming +// read cursor open across those writes pins a second pooled connection and +// contends for SQLite's single writer ("database is locked"). Returning an +// error from fn stops iteration and is returned. +func (s *Store) forEachHostedContentBody(query string, sourceID int64, fn func(messageID int64, bodyHTML string) error) error { + type bodyRow struct { + id int64 + body string + } + var buf []bodyRow + + rows, err := s.db.Query(query, sourceID) + if err != nil { + return err + } + for rows.Next() { + var messageID int64 + var bodyHTML sql.NullString + if err := rows.Scan(&messageID, &bodyHTML); err != nil { + _ = rows.Close() + return err + } + if !bodyHTML.Valid || bodyHTML.String == "" { + continue + } + buf = append(buf, bodyRow{id: messageID, body: bodyHTML.String}) + } + if err := rows.Err(); err != nil { + _ = rows.Close() + return err + } + // Release the read cursor (and its connection) before the write callbacks. + if err := rows.Close(); err != nil { + return err + } + + for _, r := range buf { + if err := fn(r.id, r.body); err != nil { + return err + } + } + return nil +} + // EnsureConversationWithType gets or creates a conversation with an // explicit conversation_type. Unlike EnsureConversation (which hardcodes // 'email_thread'), this accepts the type as a parameter, making it @@ -1735,6 +1942,32 @@ func (s *Store) UpsertReaction(messageID, participantID int64, reactionType, rea return err } +type ReactionRef struct { + ParticipantID int64 + Type string + Value string + CreatedAt time.Time +} + +// ReplaceReactions replaces all reactions for a message atomically. 
+func (s *Store) ReplaceReactions(messageID int64, reactions []ReactionRef) error { + return s.withTx(func(tx *loggedTx) error { + if _, err := tx.Exec(`DELETE FROM reactions WHERE message_id = ?`, messageID); err != nil { + return err + } + for _, r := range reactions { + if r.ParticipantID == 0 { + continue + } + if _, err := tx.Exec(s.dialect.InsertOrIgnore(`INSERT OR IGNORE INTO reactions (message_id, participant_id, reaction_type, reaction_value, created_at) + VALUES (?, ?, ?, ?, ?)`), messageID, r.ParticipantID, r.Type, r.Value, r.CreatedAt); err != nil { + return err + } + } + return nil + }) +} + // UpsertMessageRawWithFormat stores compressed raw data with an explicit format. // Unlike UpsertMessageRaw (which hardcodes 'mime'), this accepts the format as a parameter. func (s *Store) UpsertMessageRawWithFormat(messageID int64, rawData []byte, format string) error { @@ -1776,8 +2009,11 @@ func (s *Store) AttachmentPathsUniqueToSource(sourceID int64) ([]string, error) WHERE m.id = a.message_id AND m.source_id = ? ) AND a.content_hash IS NOT NULL + AND a.content_hash != '' AND a.storage_path IS NOT NULL AND a.storage_path != '' + AND a.storage_path NOT LIKE 'http://%' + AND a.storage_path NOT LIKE 'https://%' AND NOT EXISTS ( SELECT 1 FROM attachments a2 WHERE a2.content_hash = a.content_hash @@ -1858,3 +2094,93 @@ func (s *Store) UpsertAttachment(messageID int64, filename, mimeType, storagePat `, s.dialect.Now()), messageID, filename, mimeType, storagePath, contentHash, int64(size)) return err } + +// RecomputeMessageAttachmentStats refreshes the denormalized attachment flags +// on one message from its current attachment rows. +func (s *Store) RecomputeMessageAttachmentStats(messageID int64) error { + _, err := s.db.Exec(` + UPDATE messages + SET has_attachments = (SELECT COUNT(*) FROM attachments WHERE message_id = ?) > 0, + attachment_count = (SELECT COUNT(*) FROM attachments WHERE message_id = ?) + WHERE id = ? 
+ `, messageID, messageID, messageID) + return err +} + +type AttachmentRef struct { + Filename string + MimeType string + StoragePath string + ContentHash string + Size int + SourceAttachmentID string +} + +// ReplaceMessageInlineAttachments replaces Teams-managed inline media rows for +// a message. It removes both rows marked by the current source_attachment_id +// scheme and legacy unmarked Teams inline rows produced before that marker was +// added, while leaving URL-backed reference/recording attachments untouched. +func (s *Store) ReplaceMessageInlineAttachments(messageID int64, refs []AttachmentRef) error { + return s.withTx(func(tx *loggedTx) error { + if _, err := tx.Exec(` + DELETE FROM attachments + WHERE message_id = ? + AND ( + source_attachment_id LIKE 'teams:inline:%' + OR ( + (source_attachment_id IS NULL OR source_attachment_id = '') + AND storage_path != '' + AND storage_path NOT LIKE 'http://%' + AND storage_path NOT LIKE 'https://%' + AND content_hash IS NOT NULL + AND content_hash != '' + AND COALESCE(filename, '') = '' + AND COALESCE(mime_type, '') = '' + ) + ) + `, messageID); err != nil { + return err + } + for _, ref := range refs { + if ref.StoragePath == "" || ref.ContentHash == "" { + continue + } + if _, err := tx.Exec(fmt.Sprintf(` + INSERT INTO attachments (message_id, filename, mime_type, storage_path, content_hash, size, source_attachment_id, created_at) + VALUES (?, ?, ?, ?, ?, ?, ?, %s) + ON CONFLICT (message_id, content_hash) WHERE content_hash IS NOT NULL AND content_hash != '' DO NOTHING + `, s.dialect.Now()), messageID, ref.Filename, ref.MimeType, ref.StoragePath, ref.ContentHash, int64(ref.Size), ref.SourceAttachmentID); err != nil { + return err + } + } + return nil + }) +} + +// ReplaceMessageLinkAttachments replaces URL-backed attachment rows for a message. +// It intentionally leaves content-addressed local attachment paths (for example +// downloaded inline media) untouched. 
+func (s *Store) ReplaceMessageLinkAttachments(messageID int64, refs []AttachmentRef) error { + return s.withTx(func(tx *loggedTx) error { + if _, err := tx.Exec(` + DELETE FROM attachments + WHERE message_id = ? + AND (storage_path LIKE 'http://%' OR storage_path LIKE 'https://%') + `, messageID); err != nil { + return err + } + for _, ref := range refs { + if ref.StoragePath == "" { + continue + } + if _, err := tx.Exec(fmt.Sprintf(` + INSERT INTO attachments (message_id, filename, mime_type, storage_path, content_hash, size, source_attachment_id, created_at) + VALUES (?, ?, ?, ?, ?, ?, ?, %s) + ON CONFLICT (message_id, content_hash) WHERE content_hash IS NOT NULL AND content_hash != '' DO NOTHING + `, s.dialect.Now()), messageID, ref.Filename, ref.MimeType, ref.StoragePath, ref.ContentHash, int64(ref.Size), ref.SourceAttachmentID); err != nil { + return err + } + } + return nil + }) +} diff --git a/internal/store/messages_hostedcontent_test.go b/internal/store/messages_hostedcontent_test.go new file mode 100644 index 000000000..a0f75b149 --- /dev/null +++ b/internal/store/messages_hostedcontent_test.go @@ -0,0 +1,171 @@ +package store_test + +import ( + "database/sql" + "fmt" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.kenn.io/msgvault/internal/store" + "go.kenn.io/msgvault/internal/testutil" +) + +// TestForEachTeamsHostedContentBody verifies that the iterator streams only the +// message_bodies rows whose body_html contains a hostedContents URL for the +// given source, and skips rows without one (and NULL/empty bodies). +func TestForEachTeamsHostedContentBody(t *testing.T) { + require := require.New(t) + assert := assert.New(t) + st := testutil.NewTestStore(t) + + src, err := st.GetOrCreateSource("teams", "me@example.com") + require.NoError(err) + + convID, err := st.EnsureConversationWithType(src.ID, "19:x@thread.v2", "oneOnOne", "DM") + require.NoError(err) + + // Message WITH a hostedContents URL. 
+ withID, err := st.UpsertMessage(&store.Message{ + ConversationID: convID, + SourceID: src.ID, + SourceMessageID: "m_with", + MessageType: "teams", + }) + require.NoError(err) + hostedHTML := `
` + require.NoError(st.UpsertMessageBody(withID, + sql.NullString{String: "with image", Valid: true}, + sql.NullString{String: hostedHTML, Valid: true})) + + // Message WITHOUT a hostedContents URL. + withoutID, err := st.UpsertMessage(&store.Message{ + ConversationID: convID, + SourceID: src.ID, + SourceMessageID: "m_without", + MessageType: "teams", + }) + require.NoError(err) + require.NoError(st.UpsertMessageBody(withoutID, + sql.NullString{String: "plain", Valid: true}, + sql.NullString{String: "
no images here
", Valid: true})) + + // Message with NULL body_html — should be skipped. + nullID, err := st.UpsertMessage(&store.Message{ + ConversationID: convID, + SourceID: src.ID, + SourceMessageID: "m_null", + MessageType: "teams", + }) + require.NoError(err) + require.NoError(st.UpsertMessageBody(nullID, + sql.NullString{String: "text only", Valid: true}, + sql.NullString{})) + + var seen []int64 + var seenBodies []string + err = st.ForEachTeamsHostedContentBody(src.ID, func(messageID int64, bodyHTML string) error { + seen = append(seen, messageID) + seenBodies = append(seenBodies, bodyHTML) + return nil + }) + require.NoError(err) + + require.Len(seen, 1, "only the hostedContents row should be streamed") + assert.Equal(withID, seen[0]) + assert.Equal(hostedHTML, seenBodies[0]) +} + +// TestForEachTeamsIncompleteHostedContentBody verifies the iterator yields only +// messages whose hostedContents reference count exceeds their stored on-disk +// inline images — i.e. messages still missing media — and skips fully-downloaded +// ones. +func TestForEachTeamsIncompleteHostedContentBody(t *testing.T) { + require := require.New(t) + assert := assert.New(t) + st := testutil.NewTestStore(t) + src, err := st.GetOrCreateSource("teams", "me@example.com") + require.NoError(err) + convID, err := st.EnsureConversationWithType(src.ID, "19:x@thread.v2", "oneOnOne", "DM") + require.NoError(err) + + mk := func(smid, html string) int64 { + id, err := st.UpsertMessage(&store.Message{ + ConversationID: convID, SourceID: src.ID, SourceMessageID: smid, MessageType: "teams", + }) + require.NoError(err) + require.NoError(st.UpsertMessageBody(id, + sql.NullString{String: "x", Valid: true}, sql.NullString{String: html, Valid: true})) + return id + } + + oneRef := `` + // Complete: one hostedContents ref, one stored on-disk image. 
+ complete := mk("m_complete", oneRef) + require.NoError(st.UpsertAttachment(complete, "", "", "ab/abc", "abc123", 10)) + // Complete with duplicate HTML references to the same hostedContent URL: + // the repair path should compare distinct hosted refs, not raw occurrences. + duplicateComplete := mk("m_duplicate_complete", oneRef+oneRef) + require.NoError(st.UpsertAttachment(duplicateComplete, "", "", "de/def", "def456", 10)) + // Incomplete: one hostedContents ref, no stored image. + incomplete := mk("m_incomplete", oneRef) + + var seen []int64 + require.NoError(st.ForEachTeamsIncompleteHostedContentBody(src.ID, func(id int64, _ string) error { + seen = append(seen, id) + return nil + })) + require.Len(seen, 1, "only the message still missing media should be yielded") + assert.Equal(incomplete, seen[0]) + assert.NotContains(seen, duplicateComplete) +} + +// TestForEachTeamsHostedContentBody_WriteInsideCallback verifies that the +// callback can write to the store without the iterator's read cursor causing +// contention — the iterator must read all matching rows and close the cursor +// before invoking callbacks, since callers write (UpsertAttachment) inside fn. +func TestForEachTeamsHostedContentBody_WriteInsideCallback(t *testing.T) { + require := require.New(t) + assert := assert.New(t) + st := testutil.NewTestStore(t) + + src, err := st.GetOrCreateSource("teams", "me@example.com") + require.NoError(err) + convID, err := st.EnsureConversationWithType(src.ID, "19:x@thread.v2", "oneOnOne", "DM") + require.NoError(err) + + // Seed several hostedContents messages so the callback writes many times + // while iterating (the pattern that previously deadlocked on a live DB). 
+ ids := make([]int64, 0, 5) + for _, smid := range []string{"a", "b", "c", "d", "e"} { + id, err := st.UpsertMessage(&store.Message{ + ConversationID: convID, + SourceID: src.ID, + SourceMessageID: "m_" + smid, + MessageType: "teams", + }) + require.NoError(err) + html := `` + require.NoError(st.UpsertMessageBody(id, + sql.NullString{String: "x", Valid: true}, + sql.NullString{String: html, Valid: true})) + ids = append(ids, id) + } + + err = st.ForEachTeamsHostedContentBody(src.ID, func(messageID int64, bodyHTML string) error { + // Write inside the callback — must not error/deadlock. Use the message + // id in the content hash so each row is distinct. + hash := fmt.Sprintf("hash-%d", messageID) + return st.UpsertAttachment(messageID, "img", "image/png", "abc/"+hash, hash, 1) + }) + require.NoError(err) + + for _, id := range ids { + var n int + require.NoError(st.DB().QueryRow( + st.Rebind(`SELECT COUNT(*) FROM attachments WHERE message_id = ?`), + id, + ).Scan(&n)) + assert.Equal(1, n, "callback write should have persisted for message %d", id) + } +} diff --git a/internal/store/messages_test.go b/internal/store/messages_test.go index 7aa3bf985..196b49e08 100644 --- a/internal/store/messages_test.go +++ b/internal/store/messages_test.go @@ -163,6 +163,112 @@ func TestEmbedGen_OrphanImpossibleAndCoverage(t *testing.T) { assert.False(embedGen.Valid, "subject change must clear embed_gen") } +func TestMigrateSourceMessageIDRepointsRepliesBeforeDeletingDuplicate(t *testing.T) { + require := requirepkg.New(t) + assert := assertpkg.New(t) + st := testutil.NewTestStore(t) + + source, err := st.GetOrCreateSource("teams", "user@example.com") + require.NoError(err, "GetOrCreateSource") + convID, err := st.EnsureConversationWithType(source.ID, "team-1/channel-1", "channel", "General") + require.NoError(err, "EnsureConversationWithType") + + legacyParentID := insertStoreTestMessage(t, st, source.ID, convID, "m1") + scopedParentID := insertStoreTestMessage(t, st, source.ID, 
convID, "channel:team-1:channel-1:m1") + childID := insertStoreTestMessage(t, st, source.ID, convID, "m2") + _, err = st.DB().Exec( + st.Rebind(`UPDATE messages SET reply_to_message_id = ? WHERE id = ?`), + legacyParentID, childID, + ) + require.NoError(err, "seed reply") + + require.NoError( + st.MigrateSourceMessageID(source.ID, convID, "m1", "channel:team-1:channel-1:m1"), + "MigrateSourceMessageID", + ) + + var replyTo sql.NullInt64 + err = st.DB().QueryRow( + st.Rebind(`SELECT reply_to_message_id FROM messages WHERE id = ?`), + childID, + ).Scan(&replyTo) + require.NoError(err, "scan reply_to_message_id") + require.True(replyTo.Valid, "reply_to_message_id should remain set") + assert.Equal(scopedParentID, replyTo.Int64, "reply should point at scoped parent") + + var legacyCount int + err = st.DB().QueryRow( + st.Rebind(`SELECT COUNT(*) FROM messages WHERE id = ?`), + legacyParentID, + ).Scan(&legacyCount) + require.NoError(err, "legacy count") + assert.Equal(0, legacyCount, "legacy duplicate should be deleted") +} + +func TestMigrateSourceMessageIDClearsTombstoneWhenRenamingLegacyRow(t *testing.T) { + require := requirepkg.New(t) + st := testutil.NewTestStore(t) + + source, err := st.GetOrCreateSource("teams", "user@example.com") + require.NoError(err, "GetOrCreateSource") + convID, err := st.EnsureConversationWithType(source.ID, "19:x@thread.v2", "direct_chat", "DM") + require.NoError(err, "EnsureConversationWithType") + _ = insertStoreTestMessage(t, st, source.ID, convID, "m1") + require.NoError(st.MarkMessageDeleted(source.ID, "m1"), "MarkMessageDeleted") + + require.NoError( + st.MigrateSourceMessageID(source.ID, convID, "m1", "chat:19:x@thread.v2:m1"), + "MigrateSourceMessageID", + ) + + assertSourceMessageIDNotDeleted(t, st, source.ID, "chat:19:x@thread.v2:m1") +} + +func TestMigrateSourceMessageIDClearsTombstoneOnExistingScopedRow(t *testing.T) { + require := requirepkg.New(t) + st := testutil.NewTestStore(t) + + source, err := 
st.GetOrCreateSource("teams", "user@example.com") + require.NoError(err, "GetOrCreateSource") + convID, err := st.EnsureConversationWithType(source.ID, "19:x@thread.v2", "direct_chat", "DM") + require.NoError(err, "EnsureConversationWithType") + _ = insertStoreTestMessage(t, st, source.ID, convID, "chat:19:x@thread.v2:m1") + require.NoError(st.MarkMessageDeleted(source.ID, "chat:19:x@thread.v2:m1"), "MarkMessageDeleted") + + require.NoError( + st.MigrateSourceMessageID(source.ID, convID, "m1", "chat:19:x@thread.v2:m1"), + "MigrateSourceMessageID", + ) + + assertSourceMessageIDNotDeleted(t, st, source.ID, "chat:19:x@thread.v2:m1") +} + +func insertStoreTestMessage(t *testing.T, st *store.Store, sourceID, convID int64, sourceMessageID string) int64 { + t.Helper() + msg := &store.Message{ + SourceID: sourceID, + SourceMessageID: sourceMessageID, + ConversationID: convID, + MessageType: "teams", + SentAt: sql.NullTime{Time: time.Date(2024, 1, 15, 10, 0, 0, 0, time.UTC), Valid: true}, + Snippet: sql.NullString{String: sourceMessageID, Valid: true}, + } + id, err := st.UpsertMessage(msg) + requirepkg.NoError(t, err, "UpsertMessage "+sourceMessageID) + return id +} + +func assertSourceMessageIDNotDeleted(t *testing.T, st *store.Store, sourceID int64, sourceMessageID string) { + t.Helper() + var deletedAt sql.NullTime + err := st.DB().QueryRow( + st.Rebind(`SELECT deleted_from_source_at FROM messages WHERE source_id = ? 
AND source_message_id = ?`), + sourceID, sourceMessageID, + ).Scan(&deletedAt) + requirepkg.NoError(t, err, "scan deleted_from_source_at") + assertpkg.False(t, deletedAt.Valid, "deleted_from_source_at should be cleared") +} + func TestEnsureParticipantByPhone_IdentifierType(t *testing.T) { require := requirepkg.New(t) assert := assertpkg.New(t) diff --git a/internal/store/schema.sql b/internal/store/schema.sql index 60969efea..852d850ca 100644 --- a/internal/store/schema.sql +++ b/internal/store/schema.sql @@ -113,7 +113,7 @@ CREATE TABLE IF NOT EXISTS messages ( rfc822_message_id TEXT, -- Message classification - message_type TEXT NOT NULL, -- 'email', 'imessage', 'sms', 'mms', 'rcs', 'whatsapp', 'fbmessenger' + message_type TEXT NOT NULL, -- 'email', 'imessage', 'sms', 'mms', 'rcs', 'whatsapp', 'fbmessenger', 'teams' -- Timestamps (sent_at is canonical, others platform-specific) sent_at DATETIME, diff --git a/internal/store/sources_test.go b/internal/store/sources_test.go index 09ce6cdea..8fc663772 100644 --- a/internal/store/sources_test.go +++ b/internal/store/sources_test.go @@ -277,6 +277,15 @@ func TestStore_AttachmentPathsUniqueToSource(t *testing.T) { "", "emptypathhash", 40) require.NoError(err, "upsert empty-path attachment") + // URL-backed attachment rows are links, not local files to clean up. + urlBackedMsg := f.CreateMessage("msg-url-backed") + _, err = f.Store.DB().Exec( + f.Store.Rebind(`INSERT INTO attachments (message_id, filename, mime_type, storage_path, content_hash, size, created_at) + VALUES (?, 'deck.pptx', 'reference', 'https://sp/deck.pptx', '', 0, CURRENT_TIMESTAMP)`), + urlBackedMsg, + ) + require.NoError(err, "insert URL-backed attachment") + // Two messages in the default source referencing the same unique hash // should collapse to a single storage_path in the result. 
dupMsg := f.CreateMessage("msg-dup-hash") diff --git a/internal/teams/client.go b/internal/teams/client.go new file mode 100644 index 000000000..17f19a971 --- /dev/null +++ b/internal/teams/client.go @@ -0,0 +1,238 @@ +package teams + +import ( + "context" + "encoding/json" + "fmt" + "io" + "net/http" + "net/url" + "strconv" + "strings" + "time" + + "golang.org/x/time/rate" +) + +const maxRetries = 8 + +// TokenFunc returns a bearer token for a Graph API request. +type TokenFunc func(context.Context) (string, error) + +// Client is a minimal Microsoft Graph REST client supporting paging and +// Retry-After back-off. +type Client struct { + baseURL string + token TokenFunc + http *http.Client + limiter *rate.Limiter +} + +// NewClient creates a Client. baseURL is injected so tests can point at +// httptest servers. qps controls the token-bucket rate limit (default 5). +func NewClient(baseURL string, token TokenFunc, qps float64) *Client { + if qps <= 0 { + qps = 5 + } + return &Client{ + baseURL: strings.TrimRight(baseURL, "/"), + token: token, + http: &http.Client{Timeout: 60 * time.Second}, + limiter: rate.NewLimiter(rate.Limit(qps), 1), + } +} + +// get fetches rawURL, respecting the rate limiter and retrying on 429/5xx with +// Retry-After or exponential back-off. 
+func (c *Client) get(ctx context.Context, rawURL string) ([]byte, error) { + reqURL, err := c.resolveRequestURL(rawURL) + if err != nil { + return nil, err + } + for attempt := range maxRetries { + if err := c.limiter.Wait(ctx); err != nil { + return nil, fmt.Errorf("wait for graph rate limit: %w", err) + } + tok, err := c.token(ctx) + if err != nil { + return nil, err + } + req, err := http.NewRequestWithContext(ctx, http.MethodGet, reqURL, nil) + if err != nil { + return nil, err + } + req.Header.Set("Authorization", "Bearer "+tok) + req.Header.Set("Accept", "application/json") + resp, err := c.http.Do(req) + if err != nil { + return nil, err + } + body, readErr := io.ReadAll(resp.Body) + closeErr := resp.Body.Close() + if readErr != nil { + return nil, fmt.Errorf("graph GET %s: read body: %w", reqURL, readErr) + } + if closeErr != nil { + return nil, fmt.Errorf("graph GET %s: close body: %w", reqURL, closeErr) + } + + switch { + case resp.StatusCode == http.StatusOK: + return body, nil + case resp.StatusCode == http.StatusTooManyRequests || resp.StatusCode >= 500: + wait := retryAfter(resp.Header.Get("Retry-After"), attempt) + timer := time.NewTimer(wait) + select { + case <-ctx.Done(): + timer.Stop() + return nil, ctx.Err() + case <-timer.C: + } + continue + default: + return nil, fmt.Errorf("graph GET %s: status %d: %s", reqURL, resp.StatusCode, string(body)) + } + } + return nil, fmt.Errorf("graph GET %s: exhausted %d retries", reqURL, maxRetries) +} + +func (c *Client) resolveRequestURL(rawURL string) (string, error) { + u, err := url.Parse(rawURL) + if err != nil { + return "", fmt.Errorf("graph GET %q: parse URL: %w", rawURL, err) + } + if !u.IsAbs() { + return c.baseURL + rawURL, nil + } + base, err := url.Parse(c.baseURL) + if err != nil { + return "", fmt.Errorf("graph base URL %q: %w", c.baseURL, err) + } + if !strings.EqualFold(u.Scheme, base.Scheme) || !strings.EqualFold(u.Host, base.Host) { + return "", fmt.Errorf("graph GET %s: off-origin absolute 
URL", rawURL) + } + return u.String(), nil +} + +// retryAfter parses a Retry-After header value (seconds) or falls back to +// exponential back-off capped at 60 s. +func retryAfter(header string, attempt int) time.Duration { + if header != "" { + if secs, err := strconv.Atoi(strings.TrimSpace(header)); err == nil { + return time.Duration(secs) * time.Second + } + } + d := min(time.Duration(1< 0 && convCount >= opts.Limit { + break + } + gm := &msgs[i] + messageID, added, perr := imp.persistMessage(ctx, convID, sourceID, chatSourceMessageID(ch.ID, gm.ID), gm, opts, sum, toRecips) + if perr != nil { + return perr + } + if messageID != 0 { + persistedIDs = append(persistedIDs, messageID) + } + if added { + sum.MessagesAdded++ + } + sum.MessagesProcessed++ + convCount++ + // Track the latest lastModifiedDateTime across persisted messages using + // time.Time comparison to avoid any lexicographic-width hazard with + // variable-precision fractional seconds. + if t := gm.LastModifiedDateTime.UTC(); t.After(maxTime) { + maxTime = t + } + } + truncated := opts.Limit > 0 && convCount < len(msgs) + if chatComplete && !truncated && !maxTime.IsZero() { + state.SetChatCursor(ch.ID, maxTime.Format(time.RFC3339Nano)) + } + imp.enqueueEmbeddings(ctx, opts, sum, persistedIDs) + sum.ChatsProcessed++ + + // Emit per-conversation progress (1-based index). + if opts.Progress != nil { + opts.Progress(fmt.Sprintf("chat %d/%d (%s): %d messages", idx+1, total, conversationType(ch.ChatType), convCount)) + } + + // Flush checkpoint so an interrupted run can resume from this point. 
+ if blob, merr := state.Marshal(); merr == nil { + _ = imp.store.UpdateSyncCheckpoint(syncID, &store.Checkpoint{ + PageToken: blob, + MessagesProcessed: sum.MessagesProcessed, + MessagesAdded: sum.MessagesAdded, + ErrorsCount: sum.Errors, + }) + } + } + return nil +} + +func (imp *Importer) syncChannels(ctx context.Context, sourceID, syncID int64, opts ImportOptions, state *SyncState, sum *ImportSummary) error { + teams, err := imp.client.ListJoinedTeams(ctx) + if err != nil { + return err + } + for _, team := range teams { + if ctx.Err() != nil { + return ctx.Err() + } + channels, cerr := imp.client.ListChannels(ctx, team.ID) + if cerr != nil { + sum.Errors++ + continue + } + for _, ch := range channels { + if ctx.Err() != nil { + return ctx.Err() + } + key := team.ID + "/" + ch.ID + title := team.DisplayName + " / " + ch.DisplayName + convID, err := imp.store.EnsureConversationWithType(sourceID, key, "channel", title) + if err != nil { + return err + } + + prevDelta := state.ChannelDelta(key) + var newDelta string + channelComplete := true + + // Phase 0: collect all messages for this channel into a single slice, + // deduped by ID. This ensures that when we link replies in phase 2, + // the parent is already persisted regardless of page order. + seen := make(map[string]int) + var collected []ChatMessage + + addMsg := func(gm ChatMessage) { + if idx, dup := seen[gm.ID]; dup { + collected[idx] = gm + return + } + seen[gm.ID] = len(collected) + collected = append(collected, gm) + } + + if prevDelta == "" { + // First run: backfill root messages + replies, then prime delta cursor. 
+ roots, lerr := imp.client.ListChannelMessages(ctx, team.ID, ch.ID) + if lerr != nil { + sum.Errors++ + continue + } + for i := range roots { + addMsg(roots[i]) + replies, rerr := imp.client.ListReplies(ctx, team.ID, ch.ID, roots[i].ID) + if rerr != nil { + sum.Errors++ + channelComplete = false + continue + } + for j := range replies { + addMsg(replies[j]) + } + } + if channelComplete { + // Prime the delta cursor only after a complete roots+replies backfill. + deltaMessages, dl, derr := imp.client.ChannelMessagesDelta(ctx, team.ID, ch.ID, "") + if derr != nil { + sum.Errors++ + } else { + for i := range deltaMessages { + addMsg(deltaMessages[i]) + } + newDelta = dl + } + } + } else { + // Subsequent run: use stored delta link. + deltaMessages, dl, derr := imp.client.ChannelMessagesDelta(ctx, team.ID, ch.ID, prevDelta) + if derr != nil { + // On 400/410, fall back to full re-page + re-prime. + roots, lerr := imp.client.ListChannelMessages(ctx, team.ID, ch.ID) + if lerr != nil { + sum.Errors++ + continue + } + for i := range roots { + addMsg(roots[i]) + replies, rerr := imp.client.ListReplies(ctx, team.ID, ch.ID, roots[i].ID) + if rerr != nil { + sum.Errors++ + channelComplete = false + continue + } + for j := range replies { + addMsg(replies[j]) + } + } + if channelComplete { + primeMessages, pdl, perr := imp.client.ChannelMessagesDelta(ctx, team.ID, ch.ID, "") + if perr != nil { + sum.Errors++ + } else { + for i := range primeMessages { + addMsg(primeMessages[i]) + } + newDelta = pdl + } + } + } else { + for i := range deltaMessages { + addMsg(deltaMessages[i]) + } + newDelta = dl + } + } + + // Phase 1: persist collected messages, respecting the per-conversation + // limit. Track messages with ReplyToID for the linking phase. 
+ var toLink []ChatMessage + convCount := 0 + var persistedIDs []int64 + for i := range collected { + if opts.Limit > 0 && convCount >= opts.Limit { + break + } + gm := &collected[i] + messageID, added, perr := imp.persistMessage(ctx, convID, sourceID, channelSourceMessageID(team.ID, ch.ID, gm.ID), gm, opts, sum, nil) + if perr != nil { + return perr + } + if messageID != 0 { + persistedIDs = append(persistedIDs, messageID) + } + if added { + sum.MessagesAdded++ + } + sum.MessagesProcessed++ + convCount++ + if gm.ReplyToID != "" { + toLink = append(toLink, *gm) + } + } + + // Phase 2: link replies to their parents. All persisted messages are + // now in the store, so SetReplyTo will always find the parent regardless + // of the order they appeared in the collected batch. + for i := range toLink { + if serr := imp.store.SetReplyTo(sourceID, + channelSourceMessageID(team.ID, ch.ID, toLink[i].ID), + channelSourceMessageID(team.ID, ch.ID, toLink[i].ReplyToID)); serr != nil { + sum.Errors++ + } + } + + truncated := opts.Limit > 0 && convCount < len(collected) + if !truncated && newDelta != "" { + state.SetChannelDelta(key, newDelta) + } + imp.enqueueEmbeddings(ctx, opts, sum, persistedIDs) + sum.ChannelsProcessed++ + + // Emit per-conversation progress. + if opts.Progress != nil { + opts.Progress(fmt.Sprintf("channel %s: %d messages", team.DisplayName+" / "+ch.DisplayName, convCount)) + } + + // Flush checkpoint so an interrupted run can resume from this point. + if blob, merr := state.Marshal(); merr == nil { + _ = imp.store.UpdateSyncCheckpoint(syncID, &store.Checkpoint{ + PageToken: blob, + MessagesProcessed: sum.MessagesProcessed, + MessagesAdded: sum.MessagesAdded, + ErrorsCount: sum.Errors, + }) + } + } + } + return nil +} + +// persistMessage writes a single message via the granular store path. +// Returns the internal message ID and true if persisted (best-effort; UpsertMessage upserts). 
+func (imp *Importer) persistMessage(ctx context.Context, convID, sourceID int64, sourceMessageID string, gm *ChatMessage, opts ImportOptions, sum *ImportSummary, toRecips []recipientRef) (int64, bool, error) { + if err := imp.store.MigrateSourceMessageID(sourceID, convID, gm.ID, sourceMessageID); err != nil { + return 0, false, err + } + if gm.DeletedDateTime != nil { + if err := imp.store.MarkMessageDeleted(sourceID, sourceMessageID); err != nil { + sum.Errors++ + } + return 0, false, nil + } + msg, text := mapMessage(gm, convID, sourceID, sourceMessageID) + if gm.From != nil { + pid, rerr := imp.res.resolve(ctx, identityOf(gm.From)) + if rerr != nil { + return 0, false, rerr + } + if pid != 0 { + msg.SenderID = sql.NullInt64{Int64: pid, Valid: true} + } + } + if msg.SenderID.Valid { + if err := imp.store.EnsureConversationParticipant(convID, msg.SenderID.Int64, "member"); err != nil { + sum.Errors++ + } + } + messageID, err := imp.store.UpsertMessage(&msg) + if err != nil { + return 0, false, err + } + bodyHTML := sql.NullString{} + if gm.Body.ContentType == "html" { + bodyHTML = sql.NullString{String: gm.Body.Content, Valid: true} + } + if err := imp.store.UpsertMessageBody(messageID, sql.NullString{String: text, Valid: text != ""}, bodyHTML); err != nil { + return 0, false, err + } + inlineImagesChanged := imp.downloadInlineImages(ctx, messageID, gm.Body.Content, opts.AttachmentsDir, sum) + // Archive the exact original message JSON. gm.Raw is captured verbatim at + // decode time (ChatMessage.UnmarshalJSON), so it preserves every Graph field + // including ones we do not model; fall back to re-marshalling only if a + // message was constructed without going through a decode. 
+ raw := []byte(gm.Raw) + if len(raw) == 0 { + marshaled, marshalErr := json.Marshal(gm) + if marshalErr != nil { + return 0, false, fmt.Errorf("marshal teams message raw archive: %w", marshalErr) + } + raw = marshaled + } + if len(raw) > 0 { + _ = imp.store.UpsertMessageRawWithFormat(messageID, raw, "teams_json") + } + senderName := "" + if id := identityOf(gm.From); id != nil { + senderName = id.DisplayName + } + if err := imp.store.UpsertFTS(messageID, msg.Subject.String, text, senderName, "", ""); err != nil { + sum.Errors++ + } + + // Capture the sender participant ID for filtering "to" rows. + senderPID := msg.SenderID.Int64 // 0 if not set + var fromIDs []int64 + var fromNames []string + if msg.SenderID.Valid { + fromIDs = append(fromIDs, msg.SenderID.Int64) + if id := identityOf(gm.From); id != nil { + fromNames = append(fromNames, id.DisplayName) + } else { + fromNames = append(fromNames, "") + } + } + if err := imp.store.ReplaceMessageRecipients(messageID, "from", fromIDs, fromNames); err != nil { + sum.Errors++ + } + + // Write "to" rows (all members except the sender). nil means member lookup + // failed and the importer should preserve prior rows; empty means known empty. + if toRecips != nil { + var toIDs []int64 + var toNames []string + for _, r := range toRecips { + if r.ID == 0 || r.ID == senderPID { + continue + } + toIDs = append(toIDs, r.ID) + toNames = append(toNames, r.Name) + } + toIDs, toNames = dedupRecipients(toIDs, toNames) + if err := imp.store.ReplaceMessageRecipients(messageID, "to", toIDs, toNames); err != nil { + sum.Errors++ + } + } + + // Write "mention" rows. 
+ var mentionIDs []int64 + var mentionNames []string + for i := range gm.Mentions { + m := &gm.Mentions[i] + if m.Mentioned == nil { + continue + } + id := identityOf(m.Mentioned) + if id == nil { + continue + } + pid, rerr := imp.res.resolve(ctx, id) + if rerr != nil || pid == 0 { + continue + } + mentionIDs = append(mentionIDs, pid) + mentionNames = append(mentionNames, id.DisplayName) + } + mentionIDs, mentionNames = dedupRecipients(mentionIDs, mentionNames) + if err := imp.store.ReplaceMessageRecipients(messageID, "mention", mentionIDs, mentionNames); err != nil { + sum.Errors++ + } + + reactions := make([]store.ReactionRef, 0, len(gm.Reactions)) + for _, rc := range gm.Reactions { + pid, _ := imp.res.resolve(ctx, identityOf(rc.User)) + if pid != 0 { + reactions = append(reactions, store.ReactionRef{ + ParticipantID: pid, + Type: rc.ReactionType, + Value: rc.ReactionType, + CreatedAt: rc.CreatedDateTime, + }) + } + } + if err := imp.store.ReplaceReactions(messageID, reactions); err != nil { + sum.Errors++ + } else { + sum.ReactionsAdded += int64(len(reactions)) + } + + var linkAttachments []store.AttachmentRef + // Store the call-recording link (systemEventMessage eventDetail) as an attachment. + if recURL, recName, ok := gm.callRecording(); ok { + linkAttachments = append(linkAttachments, store.AttachmentRef{ + Filename: recName, + StoragePath: recURL, + }) + } + // Store attachment[] refs (reference/file/card) that carry a content URL. 
+ for _, att := range gm.Attachments { + if att.ContentURL == "" { + continue + } + linkAttachments = append(linkAttachments, store.AttachmentRef{ + Filename: att.Name, + MimeType: att.ContentType, + StoragePath: att.ContentURL, + }) + } + if err := imp.store.ReplaceMessageLinkAttachments(messageID, linkAttachments); err != nil { + sum.Errors++ + } else { + sum.AttachmentsFound += int64(len(linkAttachments)) + } + if inlineImagesChanged { + if err := imp.store.RecomputeMessageAttachmentStats(messageID); err != nil { + sum.Errors++ + } + } + return messageID, true, nil +} + +func (imp *Importer) enqueueEmbeddings(ctx context.Context, opts ImportOptions, sum *ImportSummary, messageIDs []int64) { + if opts.EmbedEnqueuer == nil || len(messageIDs) == 0 { + return + } + if err := opts.EmbedEnqueuer.EnqueueMessages(ctx, messageIDs); err != nil { + sum.Errors++ + } +} + +// dedupRecipients removes duplicate participant IDs from ids/names slices, +// preserving first-seen order and skipping zero IDs. ids and names must be +// the same length. +func dedupRecipients(ids []int64, names []string) ([]int64, []string) { + seen := make(map[int64]struct{}, len(ids)) + outIDs := make([]int64, 0, len(ids)) + outNames := make([]string, 0, len(ids)) + for i, id := range ids { + if id == 0 { + continue + } + if _, ok := seen[id]; ok { + continue + } + seen[id] = struct{}{} + outIDs = append(outIDs, id) + n := "" + if i < len(names) { + n = names[i] + } + outNames = append(outNames, n) + } + return outIDs, outNames +} + +// hostedRe matches absolute hostedContents $value URLs embedded in Teams HTML bodies. +var hostedRe = regexp.MustCompile(`https?://[^"'\s)]+/hostedContents/[^"'\s)]+/\$value`) + +// hostedFetchPath rewrites an absolute Graph hostedContents URL to a path +// relative to baseURL, so the client fetches it against the configured host +// (production Graph or an httptest server) WITHOUT duplicating baseURL's +// version segment. 
The stored URLs are absolute and version-qualified +// (".../v1.0/chats/.../hostedContents/.../$value"); since the client already +// prepends baseURL (".../v1.0"), passing u.Path verbatim yields +// ".../v1.0/v1.0/..." and 404s every fetch. Returns "" if rawURL is unparseable. +func hostedFetchPath(baseURL, rawURL string) string { + u, err := url.Parse(rawURL) + if err != nil { + return "" + } + b, err := url.Parse(baseURL) + if err != nil { + return "" + } + if !u.IsAbs() || !strings.EqualFold(u.Scheme, b.Scheme) || !strings.EqualFold(u.Host, b.Host) { + return "" + } + p := u.Path + basePath := strings.TrimRight(b.Path, "/") + if basePath != "" { + if p != basePath && !strings.HasPrefix(p, basePath+"/") { + return "" + } + p = strings.TrimPrefix(p, basePath) + if p == "" { + p = "/" + } + } + if u.RawQuery != "" { + p += "?" + u.RawQuery + } + return p +} + +// downloadInlineImages scans bodyHTML for Graph hostedContents $value URLs and +// replaces the message's Teams-managed inline attachment rows with the current +// set. If any current hosted image cannot be fetched, existing rows are +// preserved so a transient Graph failure does not erase already-downloaded +// media. +func (imp *Importer) downloadInlineImages(ctx context.Context, messageID int64, bodyHTML, attachmentsDir string, sum *ImportSummary) bool { + raws := hostedRe.FindAllString(bodyHTML, -1) + if len(raws) == 0 { + if err := imp.store.ReplaceMessageInlineAttachments(messageID, nil); err != nil { + sum.Errors++ + return false + } + return true + } + if attachmentsDir == "" { + return false + } + + seen := make(map[string]struct{}, len(raws)) + refs := make([]store.AttachmentRef, 0, len(raws)) + for _, raw := range raws { + if _, ok := seen[raw]; ok { + continue + } + seen[raw] = struct{}{} + // Rewrite the absolute graph.microsoft.com URL to a path relative to + // the client's configured base URL so the request hits the correct host + // (e.g. 
an httptest server in tests, or production Graph in production) + // without duplicating the version segment. + fetchPath := hostedFetchPath(imp.client.BaseURL(), raw) + if fetchPath == "" { + sum.Errors++ + return false + } + data, derr := imp.client.GetRaw(ctx, fetchPath) + if derr != nil || len(data) == 0 { + sum.Errors++ + return false + } + att := &internalmime.Attachment{ + Filename: "", + ContentType: "", + Content: data, + } + storagePath, serr := export.StoreAttachmentFile(attachmentsDir, att) + if serr != nil || storagePath == "" { + sum.Errors++ + return false + } + refs = append(refs, store.AttachmentRef{ + StoragePath: storagePath, + ContentHash: att.ContentHash, + Size: len(data), + SourceAttachmentID: "teams:inline:" + fetchPath, + }) + } + if err := imp.store.ReplaceMessageInlineAttachments(messageID, refs); err != nil { + sum.Errors++ + return false + } + sum.InlineImagesCopied += int64(len(refs)) + return true +} + +// identityOf extracts the primary Identity from an IdentitySet, +// preferring the User field over Application. +func identityOf(set *IdentitySet) *Identity { + if set == nil { + return nil + } + if set.User != nil { + return set.User + } + return set.Application +} diff --git a/internal/teams/importer_test.go b/internal/teams/importer_test.go new file mode 100644 index 000000000..e6981d4db --- /dev/null +++ b/internal/teams/importer_test.go @@ -0,0 +1,1412 @@ +package teams + +import ( + "context" + "database/sql" + "encoding/json" + "net/http" + "net/http/httptest" + "strings" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.kenn.io/msgvault/internal/store" + "go.kenn.io/msgvault/internal/testutil" +) + +type recordingEnqueuer struct { + ids []int64 +} + +func (e *recordingEnqueuer) EnqueueMessages(_ context.Context, ids []int64) error { + e.ids = append(e.ids, ids...) 
// fakeChatGraph starts an httptest server that imitates the chat side of the
// Graph API: one one-on-one chat, an empty member list, and a single text
// message from Alice. Every other path answers 404. The caller owns the
// returned server and must Close it.
func fakeChatGraph(t *testing.T) *httptest.Server {
	t.Helper()

	const (
		chatsPage    = `{"value":[{"id":"19:x@thread.v2","chatType":"oneOnOne","topic":"DM"}]}`
		membersPage  = `{"value":[]}`
		messagesPage = `{"value":[
				{"id":"m1","createdDateTime":"2025-01-01T00:00:00Z","lastModifiedDateTime":"2025-01-01T00:00:00Z",
				 "from":{"user":{"id":"alice@outlook.com","displayName":"Alice","userIdentityType":"emailUser"}},
				 "body":{"contentType":"text","content":"hello world"}}
			]}`
	)

	// route maps a request path to its canned JSON payload; ok is false for
	// paths the fake does not serve. Checks run in priority order.
	route := func(path string) (payload string, ok bool) {
		if path == "/me/chats" {
			return chatsPage, true
		}
		if strings.HasSuffix(path, "/members") {
			return membersPage, true
		}
		if strings.Contains(path, "/messages") {
			return messagesPage, true
		}
		return "", false
	}

	handler := func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "application/json")
		payload, ok := route(r.URL.Path)
		if !ok {
			http.Error(w, "404", http.StatusNotFound)
			return
		}
		_, _ = w.Write([]byte(payload))
	}
	return httptest.NewServer(http.HandlerFunc(handler))
}
strings.HasSuffix(r.URL.Path, "/members"): + _, _ = w.Write([]byte(`{"value":[ + {"id":"mem1","userId":"aad-alice","email":"alice@example.com","displayName":"Alice"}, + {"id":"mem2","userId":"aad-bob","email":"bob@example.com","displayName":"Bob"} + ]}`)) + case strings.Contains(r.URL.Path, "/messages"): + _, _ = w.Write([]byte(`{"value":[ + {"id":"m1","createdDateTime":"2025-01-01T00:00:00Z","lastModifiedDateTime":"2025-01-01T00:00:00Z", + "from":{"user":{"id":"alice@example.com","displayName":"Alice","userIdentityType":"emailUser"}}, + "body":{"contentType":"text","content":"hello stats"}} + ]}`)) + default: + http.Error(w, "404", http.StatusNotFound) + } + })) + defer srv.Close() + st := testutil.NewTestStore(t) + + imp := NewImporter(st, NewClient(srv.URL, func(context.Context) (string, error) { return "t", nil }, 50)) + _, err := imp.Import(context.Background(), ImportOptions{Email: "me@example.com", IncludeChannels: false}) + require.NoError(err) + + var conversationID int64 + var messageCount, participantCount int + var lastMessageAt sql.NullTime + var preview sql.NullString + require.NoError(st.DB().QueryRow(st.Rebind(` + SELECT id, message_count, participant_count, last_message_at, last_message_preview + FROM conversations + WHERE source_conversation_id = ? + `), "19:stats@thread.v2").Scan(&conversationID, &messageCount, &participantCount, &lastMessageAt, &preview)) + assert.Equal(1, messageCount) + assert.Equal(2, participantCount) + assert.True(lastMessageAt.Valid) + assert.Equal("hello stats", preview.String) + + var linkedParticipants int + require.NoError(st.DB().QueryRow(st.Rebind(` + SELECT COUNT(*) + FROM conversation_participants cp + JOIN participants p ON p.id = cp.participant_id + WHERE cp.conversation_id = ? 
// fakeChannelGraph starts an httptest server that imitates the channel side of
// the Graph API: no chats, one joined team ("team1") with one standard channel
// ("chanA"), an empty roots listing, and a delta page holding a single post
// plus a deltaLink that points back at this server. Every other path answers
// 404. The caller owns the returned server and must Close it.
func fakeChannelGraph(t *testing.T) *httptest.Server {
	t.Helper()

	const (
		emptyPage    = `{"value":[]}`
		teamsPage    = `{"value":[{"id":"team1","displayName":"Acme"}]}`
		channelsPage = `{"value":[{"id":"chanA","displayName":"General","membershipType":"standard"}]}`
		deltaHead    = `{"value":[{"id":"c1","createdDateTime":"2025-02-01T00:00:00Z","lastModifiedDateTime":"2025-02-01T00:00:00Z","body":{"contentType":"text","content":"channel post"}}],"@odata.deltaLink":"`
		deltaTail    = `/delta?token=next"}`
	)

	// graph is assigned after the handler is built; the handler only reads
	// graph.URL while serving, i.e. once the server exists.
	var graph *httptest.Server
	handler := func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "application/json")

		var payload string
		switch path := r.URL.Path; {
		case path == "/me/chats":
			payload = emptyPage
		case path == "/me/joinedTeams":
			payload = teamsPage
		case strings.HasSuffix(path, "/channels"):
			payload = channelsPage
		case strings.HasSuffix(path, "/messages/delta"):
			payload = deltaHead + graph.URL + deltaTail
		case strings.HasSuffix(path, "/messages"):
			payload = emptyPage
		default:
			http.Error(w, "404", http.StatusNotFound)
			return
		}
		_, _ = w.Write([]byte(payload))
	}

	graph = httptest.NewServer(http.HandlerFunc(handler))
	return graph
}
w.Header().Set("Content-Type", "application/json") + body := `
` + _, _ = w.Write([]byte(`{"value":[{"id":"m1","createdDateTime":"2025-01-01T00:00:00Z","body":{"contentType":"html","content":` + jsonString(t, body) + `}}]}`)) + default: + http.Error(w, "404", http.StatusNotFound) + } + })) + serverURL = srv.URL + defer srv.Close() + st := testutil.NewTestStore(t) + dir := t.TempDir() + + imp := NewImporter(st, NewClient(srv.URL, func(context.Context) (string, error) { return "t", nil }, 50)) + sum, err := imp.Import(context.Background(), ImportOptions{Email: "me@example.com", AttachmentsDir: dir}) + require.NoError(t, err) + assert.EqualValues(t, 1, sum.InlineImagesCopied) + assert.EqualValues(t, 0, sum.Errors) +} + +func TestTeamsReimportRemovesStaleInlineAttachments(t *testing.T) { + require := require.New(t) + assert := assert.New(t) + + includeImage := true + serverURL := "" + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + switch { + case strings.Contains(r.URL.Path, "/hostedContents/") && strings.HasSuffix(r.URL.Path, "/$value"): + w.Header().Set("Content-Type", "image/png") + _, _ = w.Write([]byte("PNGDATA")) + case r.URL.Path == "/me/chats": + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{"value":[{"id":"19:inline-edit@thread.v2","chatType":"oneOnOne","topic":"DM"}]}`)) + case strings.HasSuffix(r.URL.Path, "/members"): + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{"value":[]}`)) + case strings.Contains(r.URL.Path, "/messages"): + w.Header().Set("Content-Type", "application/json") + body := `

edited

` + if includeImage { + body = `
` + } + _, _ = w.Write([]byte(`{"value":[{"id":"m1","createdDateTime":"2025-01-01T00:00:00Z","lastModifiedDateTime":"2025-01-01T00:00:00Z","body":{"contentType":"html","content":` + jsonString(t, body) + `}}]}`)) + default: + http.Error(w, "404", http.StatusNotFound) + } + })) + serverURL = srv.URL + defer srv.Close() + st := testutil.NewTestStore(t) + attachmentsDir := t.TempDir() + + imp := NewImporter(st, NewClient(srv.URL, func(context.Context) (string, error) { return "t", nil }, 50)) + _, err := imp.Import(context.Background(), ImportOptions{ + Email: "me@example.com", + IncludeChannels: false, + AttachmentsDir: attachmentsDir, + Full: true, + }) + require.NoError(err) + + var messageID int64 + require.NoError(st.DB().QueryRow(st.Rebind(` + SELECT id FROM messages WHERE source_message_id = ? + `), chatSourceMessageID("19:inline-edit@thread.v2", "m1")).Scan(&messageID)) + var attachmentCount int + require.NoError(st.DB().QueryRow( + st.Rebind(`SELECT COUNT(*) FROM attachments WHERE message_id = ?`), + messageID, + ).Scan(&attachmentCount)) + require.Equal(1, attachmentCount) + + includeImage = false + _, err = imp.Import(context.Background(), ImportOptions{ + Email: "me@example.com", + IncludeChannels: false, + AttachmentsDir: attachmentsDir, + Full: true, + }) + require.NoError(err) + + var hasAttachments bool + var denormalizedCount int + require.NoError(st.DB().QueryRow(st.Rebind(` + SELECT m.has_attachments, m.attachment_count, COUNT(a.id) + FROM messages m + LEFT JOIN attachments a ON a.message_id = m.id + WHERE m.id = ? 
+ GROUP BY m.id, m.has_attachments, m.attachment_count + `), messageID).Scan(&hasAttachments, &denormalizedCount, &attachmentCount)) + assert.False(hasAttachments) + assert.Equal(0, denormalizedCount) + assert.Equal(0, attachmentCount) +} + +func jsonString(t *testing.T, s string) string { + t.Helper() + + b, err := json.Marshal(s) + require.NoError(t, err) + return string(b) +} + +// TestBackfillInlineMedia exercises the path fix end-to-end: it pre-seeds a +// message whose stored HTML body contains a hostedContents URL, then runs +// BackfillInlineMedia and asserts the inline image was fetched (with the +// correct non-doubled version path) and recorded as an attachment. +func TestBackfillInlineMedia(t *testing.T) { + require := require.New(t) + assert := assert.New(t) + + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + // Regression guard: a doubled version segment ("/v1.0/v1.0") must 404. + if strings.Contains(r.URL.Path, "/v1.0/v1.0") { + http.Error(w, "404", http.StatusNotFound) + return + } + if r.URL.Path == "/v1.0/chats/19:x@thread.v2/messages/m1/hostedContents/1/$value" { + w.Header().Set("Content-Type", "image/png") + _, _ = w.Write([]byte("PNGDATA")) + return + } + http.Error(w, "404", http.StatusNotFound) + })) + defer srv.Close() + + st := testutil.NewTestStore(t) + + // Pre-seed: source, conversation, message, and an HTML body that contains a + // hostedContents inline-image URL. + src, err := st.GetOrCreateSource("teams", "me@example.com") + require.NoError(err) + convID, err := st.EnsureConversationWithType(src.ID, "19:x@thread.v2", "oneOnOne", "DM") + require.NoError(err) + msgID, err := st.UpsertMessage(&store.Message{ + ConversationID: convID, + SourceID: src.ID, + SourceMessageID: "m1", + MessageType: "teams", + }) + require.NoError(err) + bodyHTML := `
` + require.NoError(st.UpsertMessageBody(msgID, + sql.NullString{String: "hello", Valid: true}, + sql.NullString{String: bodyHTML, Valid: true})) + + // baseURL carries the version segment, exactly like production, so the fix + // (stripping baseURL's path) is what makes the fetch path resolve correctly. + client := NewClient(srv.URL+"/v1.0", func(context.Context) (string, error) { return "t", nil }, 50) + imp := NewImporter(st, client) + + sum, err := imp.BackfillInlineMedia(context.Background(), ImportOptions{ + Email: "me@example.com", + AttachmentsDir: t.TempDir(), + }) + require.NoError(err) + assert.EqualValues(1, sum.InlineImagesCopied) + assert.EqualValues(0, sum.Errors) + + var attCount int + require.NoError(st.DB().QueryRow( + st.Rebind(`SELECT COUNT(*) FROM attachments WHERE message_id = ?`), + msgID, + ).Scan(&attCount)) + assert.Equal(1, attCount, "an inline-image attachment row should exist for the message") + + var hasAttachments bool + var messageAttachmentCount int + require.NoError(st.DB().QueryRow( + st.Rebind(`SELECT has_attachments, attachment_count FROM messages WHERE id = ?`), + msgID, + ).Scan(&hasAttachments, &messageAttachmentCount)) + assert.True(hasAttachments, "backfill should refresh the message attachment flag") + assert.Equal(1, messageAttachmentCount, "backfill should refresh the message attachment count") +} + +// TestHostedFetchPath verifies that an absolute Graph hostedContents URL is +// rewritten to a path relative to the client baseURL WITHOUT duplicating the +// API-version segment. Production baseURL carries "/v1.0"; using u.Path +// verbatim would yield "/v1.0/v1.0/..." and 404 every inline fetch. +func TestHostedFetchPath(t *testing.T) { + assert := assert.New(t) + const hosted = "https://graph.microsoft.com/v1.0/chats/19:x@thread.v2/messages/m1/hostedContents/abc/$value" + + // Production: baseURL includes /v1.0 — the version must not be doubled. 
+ got := hostedFetchPath("https://graph.microsoft.com/v1.0", hosted) + assert.Equal("/chats/19:x@thread.v2/messages/m1/hostedContents/abc/$value", got) + assert.NotContains(got, "/v1.0", "version segment must be stripped, not doubled") + + // httptest: baseURL has no path — keep the full path so the fake server matches. + gotTest := hostedFetchPath("http://127.0.0.1:1234", "http://127.0.0.1:1234/v1.0/chats/19:x@thread.v2/messages/m1/hostedContents/abc/$value") + assert.Equal("/v1.0/chats/19:x@thread.v2/messages/m1/hostedContents/abc/$value", gotTest) + + // Query string is preserved. + gotQ := hostedFetchPath("https://graph.microsoft.com/v1.0", hosted+"?foo=bar") + assert.Equal("/chats/19:x@thread.v2/messages/m1/hostedContents/abc/$value?foo=bar", gotQ) + + malicious := "https://graph.microsoft.com/v1.0https://attacker.example/hostedContents/1/$value" + assert.Empty(hostedFetchPath("https://graph.microsoft.com/v1.0", malicious)) +} + +// fakeLimitChatGraph returns a fake Graph server that serves a single chat +// with 3 messages, so the --limit flag can be tested against it. 
func fakeLimitChatGraph(t *testing.T) *httptest.Server {
	t.Helper()

	// Canned responses: one chat, and a three-message page for any path that
	// mentions /messages. Nothing else (members included) is served.
	const (
		chatsPage    = `{"value":[{"id":"19:limit@thread.v2","chatType":"oneOnOne","topic":"LimitTest"}]}`
		messagesPage = `{"value":[
				{"id":"lm1","createdDateTime":"2025-03-01T00:00:00Z","lastModifiedDateTime":"2025-03-01T00:00:00Z","body":{"contentType":"text","content":"msg one"}},
				{"id":"lm2","createdDateTime":"2025-03-01T00:00:01Z","lastModifiedDateTime":"2025-03-01T00:00:01Z","body":{"contentType":"text","content":"msg two"}},
				{"id":"lm3","createdDateTime":"2025-03-01T00:00:02Z","lastModifiedDateTime":"2025-03-01T00:00:02Z","body":{"contentType":"text","content":"msg three"}}
			]}`
	)

	handler := func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "application/json")

		path := r.URL.Path
		if path == "/me/chats" {
			_, _ = w.Write([]byte(chatsPage))
			return
		}
		if strings.Contains(path, "/messages") {
			_, _ = w.Write([]byte(messagesPage))
			return
		}
		http.Error(w, "404", http.StatusNotFound)
	}
	return httptest.NewServer(http.HandlerFunc(handler))
}
50)) + _, err := imp.Import(context.Background(), ImportOptions{ + Email: "me@example.com", + IncludeChannels: false, + Limit: 2, + }) + require.NoError(err) + + src, err := st.GetOrCreateSource("teams", "me@example.com") + require.NoError(err) + run, err := st.GetLastSuccessfulSync(src.ID) + require.NoError(err) + require.True(run.CursorAfter.Valid) + state, err := LoadSyncState(run.CursorAfter.String) + require.NoError(err) + assert.Empty(state.ChatCursor("19:limit@thread.v2")) +} + +func TestChatMemberFetchFailureDoesNotAdvanceCursor(t *testing.T) { + require := require.New(t) + assert := assert.New(t) + + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + switch { + case r.URL.Path == "/me/chats": + _, _ = w.Write([]byte(`{"value":[{"id":"19:memberfail@thread.v2","chatType":"group","topic":"Chat"}]}`)) + case strings.HasSuffix(r.URL.Path, "/members"): + http.Error(w, "members unavailable", http.StatusBadRequest) + case strings.Contains(r.URL.Path, "/messages"): + _, _ = w.Write([]byte(`{"value":[{"id":"m1","createdDateTime":"2025-01-01T00:00:00Z","lastModifiedDateTime":"2025-01-01T00:00:00Z","body":{"contentType":"text","content":"hello"}}]}`)) + default: + http.Error(w, "404", http.StatusNotFound) + } + })) + defer srv.Close() + st := testutil.NewTestStore(t) + + imp := NewImporter(st, NewClient(srv.URL, func(context.Context) (string, error) { return "t", nil }, 50)) + sum, err := imp.Import(context.Background(), ImportOptions{ + Email: "me@example.com", + IncludeChannels: false, + }) + require.NoError(err) + assert.EqualValues(1, sum.Errors) + assert.EqualValues(1, sum.MessagesAdded) + + src, err := st.GetOrCreateSource("teams", "me@example.com") + require.NoError(err) + run, err := st.GetLastSuccessfulSync(src.ID) + require.NoError(err) + require.True(run.CursorAfter.Valid) + state, err := LoadSyncState(run.CursorAfter.String) + require.NoError(err) + 
assert.Empty(state.ChatCursor("19:memberfail@thread.v2")) +} + +func TestChatMessageIDsAreNamespacedByConversation(t *testing.T) { + require := require.New(t) + assert := assert.New(t) + + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + switch { + case r.URL.Path == "/me/chats": + _, _ = w.Write([]byte(`{"value":[` + + `{"id":"chatA","chatType":"group","topic":"A"},` + + `{"id":"chatB","chatType":"group","topic":"B"}` + + `]}`)) + case strings.HasSuffix(r.URL.Path, "/members"): + _, _ = w.Write([]byte(`{"value":[]}`)) + case strings.Contains(r.URL.Path, "/messages"): + _, _ = w.Write([]byte(`{"value":[{"id":"same","createdDateTime":"2025-01-01T00:00:00Z","lastModifiedDateTime":"2025-01-01T00:00:00Z","body":{"contentType":"text","content":"hello"}}]}`)) + default: + http.Error(w, "404", http.StatusNotFound) + } + })) + defer srv.Close() + st := testutil.NewTestStore(t) + + imp := NewImporter(st, NewClient(srv.URL, func(context.Context) (string, error) { return "t", nil }, 50)) + _, err := imp.Import(context.Background(), ImportOptions{Email: "me@example.com", IncludeChannels: false}) + require.NoError(err) + + var count int + require.NoError(st.DB().QueryRow( + st.Rebind(`SELECT COUNT(*) FROM messages WHERE source_message_id IN (?, ?)`), + chatSourceMessageID("chatA", "same"), chatSourceMessageID("chatB", "same"), + ).Scan(&count)) + assert.Equal(2, count) +} + +func TestImportChannelsEndToEnd(t *testing.T) { + assert := assert.New(t) + srv := fakeChannelGraph(t) + defer srv.Close() + st := testutil.NewTestStore(t) + + imp := NewImporter(st, NewClient(srv.URL, func(context.Context) (string, error) { return "t", nil }, 50)) + sum, err := imp.Import(context.Background(), ImportOptions{Email: "me@example.com", IncludeChannels: true}) + require.NoError(t, err) + assert.EqualValues(1, sum.ChannelsProcessed) + assert.EqualValues(1, sum.MessagesAdded) + + src, _ := 
st.GetOrCreateSource("teams", "me@example.com") + prev, _ := st.GetLastSuccessfulSync(src.ID) + state, _ := LoadSyncState(prev.CursorAfter.String) + assert.Contains(state.ChannelDelta("team1/chanA"), "token=next") +} + +func TestLimitedChannelImportDoesNotAdvanceDelta(t *testing.T) { + require := require.New(t) + assert := assert.New(t) + + serverURL := "" + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + switch { + case r.URL.Path == "/me/chats": + _, _ = w.Write([]byte(`{"value":[]}`)) + case r.URL.Path == "/me/joinedTeams": + _, _ = w.Write([]byte(`{"value":[{"id":"team1","displayName":"Acme"}]}`)) + case strings.HasSuffix(r.URL.Path, "/channels"): + _, _ = w.Write([]byte(`{"value":[{"id":"chanA","displayName":"General"}]}`)) + case strings.HasSuffix(r.URL.Path, "/messages"): + _, _ = w.Write([]byte(`{"value":[` + + `{"id":"c1","createdDateTime":"2025-02-01T00:00:00Z","lastModifiedDateTime":"2025-02-01T00:00:00Z","body":{"contentType":"text","content":"one"}},` + + `{"id":"c2","createdDateTime":"2025-02-01T00:00:01Z","lastModifiedDateTime":"2025-02-01T00:00:01Z","body":{"contentType":"text","content":"two"}}` + + `]}`)) + case strings.HasSuffix(r.URL.Path, "/messages/delta"): + _, _ = w.Write([]byte(`{"value":[],"@odata.deltaLink":"` + serverURL + `/delta?token=next"}`)) + default: + http.Error(w, "404", http.StatusNotFound) + } + })) + serverURL = srv.URL + defer srv.Close() + st := testutil.NewTestStore(t) + + imp := NewImporter(st, NewClient(srv.URL, func(context.Context) (string, error) { return "t", nil }, 50)) + _, err := imp.Import(context.Background(), ImportOptions{Email: "me@example.com", IncludeChannels: true, Limit: 1}) + require.NoError(err) + + src, err := st.GetOrCreateSource("teams", "me@example.com") + require.NoError(err) + run, err := st.GetLastSuccessfulSync(src.ID) + require.NoError(err) + state, err := LoadSyncState(run.CursorAfter.String) + 
require.NoError(err) + assert.Empty(state.ChannelDelta("team1/chanA")) +} + +func TestChannelReplyFetchErrorDoesNotAdvanceDelta(t *testing.T) { + require := require.New(t) + assert := assert.New(t) + + serverURL := "" + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + switch { + case r.URL.Path == "/me/chats": + _, _ = w.Write([]byte(`{"value":[]}`)) + case r.URL.Path == "/me/joinedTeams": + _, _ = w.Write([]byte(`{"value":[{"id":"team1","displayName":"Acme"}]}`)) + case strings.HasSuffix(r.URL.Path, "/channels"): + _, _ = w.Write([]byte(`{"value":[{"id":"chanA","displayName":"General"}]}`)) + case strings.HasSuffix(r.URL.Path, "/messages"): + _, _ = w.Write([]byte(`{"value":[{"id":"root","createdDateTime":"2025-02-01T00:00:00Z","lastModifiedDateTime":"2025-02-01T00:00:00Z","body":{"contentType":"text","content":"root"}}]}`)) + case strings.Contains(r.URL.Path, "/replies"): + http.Error(w, "reply failure", http.StatusBadRequest) + case strings.HasSuffix(r.URL.Path, "/messages/delta"): + _, _ = w.Write([]byte(`{"value":[],"@odata.deltaLink":"` + serverURL + `/delta?token=next"}`)) + default: + http.Error(w, "404", http.StatusNotFound) + } + })) + serverURL = srv.URL + defer srv.Close() + st := testutil.NewTestStore(t) + + imp := NewImporter(st, NewClient(srv.URL, func(context.Context) (string, error) { return "t", nil }, 50)) + sum, err := imp.Import(context.Background(), ImportOptions{Email: "me@example.com", IncludeChannels: true}) + require.NoError(err) + assert.EqualValues(1, sum.Errors) + + src, err := st.GetOrCreateSource("teams", "me@example.com") + require.NoError(err) + run, err := st.GetLastSuccessfulSync(src.ID) + require.NoError(err) + state, err := LoadSyncState(run.CursorAfter.String) + require.NoError(err) + assert.Empty(state.ChannelDelta("team1/chanA")) +} + +func TestChannelDeltaPrimeErrorStillPersistsBackfill(t *testing.T) { + require := require.New(t) 
+ assert := assert.New(t) + + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + switch { + case r.URL.Path == "/me/chats": + _, _ = w.Write([]byte(`{"value":[]}`)) + case r.URL.Path == "/me/joinedTeams": + _, _ = w.Write([]byte(`{"value":[{"id":"team1","displayName":"Acme"}]}`)) + case strings.HasSuffix(r.URL.Path, "/channels"): + _, _ = w.Write([]byte(`{"value":[{"id":"chanA","displayName":"General"}]}`)) + case strings.HasSuffix(r.URL.Path, "/messages"): + _, _ = w.Write([]byte(`{"value":[{"id":"root","createdDateTime":"2025-02-01T00:00:00Z","lastModifiedDateTime":"2025-02-01T00:00:00Z","body":{"contentType":"text","content":"root"}}]}`)) + case strings.Contains(r.URL.Path, "/replies"): + _, _ = w.Write([]byte(`{"value":[{"id":"reply","replyToId":"root","createdDateTime":"2025-02-01T00:00:01Z","lastModifiedDateTime":"2025-02-01T00:00:01Z","body":{"contentType":"text","content":"reply"}}]}`)) + case strings.HasSuffix(r.URL.Path, "/messages/delta"): + http.Error(w, "delta unavailable", http.StatusBadRequest) + default: + http.Error(w, "404", http.StatusNotFound) + } + })) + defer srv.Close() + st := testutil.NewTestStore(t) + + imp := NewImporter(st, NewClient(srv.URL, func(context.Context) (string, error) { return "t", nil }, 50)) + sum, err := imp.Import(context.Background(), ImportOptions{Email: "me@example.com", IncludeChannels: true}) + require.NoError(err) + assert.EqualValues(1, sum.Errors) + assert.EqualValues(2, sum.MessagesAdded) + + src, err := st.GetOrCreateSource("teams", "me@example.com") + require.NoError(err) + run, err := st.GetLastSuccessfulSync(src.ID) + require.NoError(err) + state, err := LoadSyncState(run.CursorAfter.String) + require.NoError(err) + assert.Empty(state.ChannelDelta("team1/chanA")) + + var count int + require.NoError(st.DB().QueryRow(st.Rebind(` + SELECT COUNT(*) FROM messages + WHERE source_id = ? AND source_message_id IN (?, ?) 
+ `), src.ID, + channelSourceMessageID("team1", "chanA", "root"), + channelSourceMessageID("team1", "chanA", "reply"), + ).Scan(&count)) + assert.Equal(2, count) +} + +func TestChannelDeltaPrimeMessageReplacesBackfilledVersion(t *testing.T) { + require := require.New(t) + assert := assert.New(t) + + serverURL := "" + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + switch { + case r.URL.Path == "/me/chats": + _, _ = w.Write([]byte(`{"value":[]}`)) + case r.URL.Path == "/me/joinedTeams": + _, _ = w.Write([]byte(`{"value":[{"id":"team1","displayName":"Acme"}]}`)) + case strings.HasSuffix(r.URL.Path, "/channels"): + _, _ = w.Write([]byte(`{"value":[{"id":"chanA","displayName":"General"}]}`)) + case strings.Contains(r.URL.Path, "/replies"): + _, _ = w.Write([]byte(`{"value":[]}`)) + case strings.HasSuffix(r.URL.Path, "/messages/delta"): + _, _ = w.Write([]byte(`{"value":[{"id":"root","createdDateTime":"2025-02-01T00:00:00Z","lastModifiedDateTime":"2025-02-01T00:00:02Z","body":{"contentType":"text","content":"edited root"}}],"@odata.deltaLink":"` + serverURL + `/delta?token=next"}`)) + case strings.HasSuffix(r.URL.Path, "/messages"): + _, _ = w.Write([]byte(`{"value":[{"id":"root","createdDateTime":"2025-02-01T00:00:00Z","lastModifiedDateTime":"2025-02-01T00:00:00Z","body":{"contentType":"text","content":"original root"}}]}`)) + default: + http.Error(w, "404", http.StatusNotFound) + } + })) + serverURL = srv.URL + defer srv.Close() + st := testutil.NewTestStore(t) + + imp := NewImporter(st, NewClient(srv.URL, func(context.Context) (string, error) { return "t", nil }, 50)) + _, err := imp.Import(context.Background(), ImportOptions{Email: "me@example.com", IncludeChannels: true}) + require.NoError(err) + + var bodyText string + require.NoError(st.DB().QueryRow(st.Rebind(` + SELECT mb.body_text + FROM message_bodies mb + JOIN messages m ON m.id = mb.message_id + WHERE 
m.source_message_id = ? + `), channelSourceMessageID("team1", "chanA", "root")).Scan(&bodyText)) + assert.Equal("edited root", bodyText) +} + +// TestReplyBeforeRoot verifies that channel reply threading is preserved even +// when a delta page returns a reply (c2) before its root (c1). The old +// inline-SetReplyTo approach would silently drop the link because the root +// was not yet persisted. The two-phase collect-then-link approach fixes this. +func TestReplyBeforeRoot(t *testing.T) { + require := require.New(t) + assert := assert.New(t) + + serverURL := "" + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + switch { + case r.URL.Path == "/me/chats": + _, _ = w.Write([]byte(`{"value":[]}`)) + case r.URL.Path == "/me/joinedTeams": + _, _ = w.Write([]byte(`{"value":[{"id":"team1","displayName":"Acme"}]}`)) + case strings.HasSuffix(r.URL.Path, "/channels"): + _, _ = w.Write([]byte(`{"value":[{"id":"chanA","displayName":"General","membershipType":"standard"}]}`)) + case strings.HasSuffix(r.URL.Path, "/messages/delta"): + // Reply (c2) arrives BEFORE its root (c1) in a single delta page. + _, _ = w.Write([]byte(`{"value":[` + + `{"id":"c2","replyToId":"c1","createdDateTime":"2025-02-01T00:00:01Z","lastModifiedDateTime":"2025-02-01T00:00:01Z","body":{"contentType":"text","content":"a reply"}},` + + `{"id":"c1","createdDateTime":"2025-02-01T00:00:00Z","lastModifiedDateTime":"2025-02-01T00:00:00Z","body":{"contentType":"text","content":"the root"}}` + + `],"@odata.deltaLink":"` + serverURL + `/delta?token=x"}`)) + case strings.HasSuffix(r.URL.Path, "/messages"): + // Backfill roots endpoint returns empty — all messages come via delta. 
+ _, _ = w.Write([]byte(`{"value":[]}`)) + default: + http.Error(w, "404", http.StatusNotFound) + } + })) + serverURL = srv.URL + defer srv.Close() + st := testutil.NewTestStore(t) + + imp := NewImporter(st, NewClient(srv.URL, func(context.Context) (string, error) { return "t", nil }, 50)) + _, err := imp.Import(context.Background(), ImportOptions{Email: "me@example.com", IncludeChannels: true}) + require.NoError(err) + + // The reply (c2) must be linked to the root (c1). + var replyTo, rootID sql.NullInt64 + require.NoError(st.DB().QueryRow( + st.Rebind(`SELECT reply_to_message_id FROM messages WHERE source_message_id = ?`), + channelSourceMessageID("team1", "chanA", "c2"), + ).Scan(&replyTo)) + require.NoError(st.DB().QueryRow( + st.Rebind(`SELECT id FROM messages WHERE source_message_id = ?`), + channelSourceMessageID("team1", "chanA", "c1"), + ).Scan(&rootID)) + require.True(replyTo.Valid, "reply_to_message_id should be set on c2") + assert.Equal(rootID.Int64, replyTo.Int64) +} + +func TestRecipientAndMentionRows(t *testing.T) { + require := require.New(t) + assert := assert.New(t) + + // Fake Graph server: + // - /me/chats → one oneOnOne chat + // - /chats/{id}/members → two members: alice (sender) and bob + // - chat /messages → one message from alice @mentioning bob + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + switch { + case r.URL.Path == "/me/chats": + _, _ = w.Write([]byte(`{"value":[{"id":"19:chat1@thread.v2","chatType":"oneOnOne","topic":"Chat"}]}`)) + case strings.HasSuffix(r.URL.Path, "/members"): + _, _ = w.Write([]byte(`{"value":[ + {"id":"mem1","userId":"aad-alice","email":"alice@x.com","displayName":"Alice"}, + {"id":"mem2","userId":"aad-bob","email":"bob@x.com","displayName":"Bob"} + ]}`)) + case strings.Contains(r.URL.Path, "/messages"): + _, _ = w.Write([]byte(`{"value":[{ + "id":"msg1", + "createdDateTime":"2025-01-01T00:00:00Z", + 
"lastModifiedDateTime":"2025-01-01T00:00:00Z", + "from":{"user":{"id":"alice@x.com","displayName":"Alice","userIdentityType":"emailUser"}}, + "body":{"contentType":"text","content":"hey @Bob"}, + "mentions":[{"id":0,"mentionText":"Bob","mentioned":{"user":{"id":"aad-bob","displayName":"Bob","userIdentityType":"aadUser"}}}] + }]}`)) + default: + http.Error(w, "not found", http.StatusNotFound) + } + })) + defer srv.Close() + + st := testutil.NewTestStore(t) + c := NewClient(srv.URL, func(context.Context) (string, error) { return "t", nil }, 50) + imp := NewImporter(st, c) + + _, err := imp.Import(context.Background(), ImportOptions{Email: "me@example.com", IncludeChannels: false}) + require.NoError(err) + + // Get the message ID + var msgID int64 + require.NoError(st.DB().QueryRow( + st.Rebind(`SELECT id FROM messages WHERE source_message_id = ?`), + chatSourceMessageID("19:chat1@thread.v2", "msg1"), + ).Scan(&msgID)) + + // Should have a "to" row for bob but NOT alice (the sender) + var toCount int + require.NoError(st.DB().QueryRow(st.Rebind(` + SELECT COUNT(*) FROM message_recipients mr + JOIN participants p ON p.id = mr.participant_id + WHERE mr.message_id = ? AND mr.recipient_type = 'to' AND p.email_address = 'bob@x.com' + `), msgID).Scan(&toCount)) + assert.Equal(1, toCount, "should have a 'to' row for bob") + + var aliceToCount int + require.NoError(st.DB().QueryRow(st.Rebind(` + SELECT COUNT(*) FROM message_recipients mr + JOIN participants p ON p.id = mr.participant_id + WHERE mr.message_id = ? AND mr.recipient_type = 'to' AND p.email_address = 'alice@x.com' + `), msgID).Scan(&aliceToCount)) + assert.Equal(0, aliceToCount, "alice is the sender so should NOT appear in 'to' rows") + + var fromCount int + require.NoError(st.DB().QueryRow(st.Rebind(` + SELECT COUNT(*) FROM message_recipients mr + JOIN participants p ON p.id = mr.participant_id + WHERE mr.message_id = ? 
AND mr.recipient_type = 'from' AND p.email_address = 'alice@x.com' + `), msgID).Scan(&fromCount)) + assert.Equal(1, fromCount, "should have a 'from' row for the message sender") + + // Should have a "mention" row for bob + var mentionCount int + require.NoError(st.DB().QueryRow(st.Rebind(` + SELECT COUNT(*) FROM message_recipients mr + JOIN participants p ON p.id = mr.participant_id + WHERE mr.message_id = ? AND mr.recipient_type = 'mention' AND p.email_address = 'bob@x.com' + `), msgID).Scan(&mentionCount)) + assert.Equal(1, mentionCount, "should have a 'mention' row for bob") +} + +// TestImportProgressCallback verifies that ImportOptions.Progress is called at least +// once per conversation and that the message contains the word "messages". +func TestImportProgressCallback(t *testing.T) { + require := require.New(t) + assert := assert.New(t) + + srv := fakeChatGraph(t) + defer srv.Close() + st := testutil.NewTestStore(t) + + var lines []string + c := NewClient(srv.URL, func(context.Context) (string, error) { return "t", nil }, 50) + imp := NewImporter(st, c) + sum, err := imp.Import(context.Background(), ImportOptions{ + Email: "me@example.com", + IncludeChannels: false, + Progress: func(msg string) { lines = append(lines, msg) }, + }) + require.NoError(err) + assert.EqualValues(1, sum.ChatsProcessed) + assert.NotEmpty(lines, "Progress should have been called at least once") + // Each progress line should mention messages + for _, l := range lines { + assert.Contains(l, "messages", "progress line should mention messages count: %q", l) + } +} + +// TestImportChannelProgressCallback verifies progress is called for channel conversations. 
+func TestImportChannelProgressCallback(t *testing.T) { + require := require.New(t) + assert := assert.New(t) + + srv := fakeChannelGraph(t) + defer srv.Close() + st := testutil.NewTestStore(t) + + var lines []string + imp := NewImporter(st, NewClient(srv.URL, func(context.Context) (string, error) { return "t", nil }, 50)) + sum, err := imp.Import(context.Background(), ImportOptions{ + Email: "me@example.com", + IncludeChannels: true, + Progress: func(msg string) { lines = append(lines, msg) }, + }) + require.NoError(err) + assert.EqualValues(1, sum.ChannelsProcessed) + assert.NotEmpty(lines, "Progress should have been called for channel conversations") +} + +// TestCheckpointFlushedAfterEachConversation verifies that after a successful Import, +// GetLatestCheckpointedSync does NOT return a stale failed checkpoint (since the run +// completed successfully), but that a checkpoint WAS written mid-run (visible via the +// completed sync_run's cursor_before column, or by checking an interrupted run). +// We test the happy path: after completion, cursor_before on the completed run is set. +func TestCheckpointFlushedAfterEachConversation(t *testing.T) { + require := require.New(t) + + srv := fakeChatGraph(t) + defer srv.Close() + st := testutil.NewTestStore(t) + + c := NewClient(srv.URL, func(context.Context) (string, error) { return "t", nil }, 50) + imp := NewImporter(st, c) + _, err := imp.Import(context.Background(), ImportOptions{ + Email: "me@example.com", + IncludeChannels: false, + }) + require.NoError(err) + + src, err := st.GetOrCreateSource("teams", "me@example.com") + require.NoError(err) + + // After a successful run, the latest run is completed. Its cursor_before should + // have been written by the per-conversation checkpoint flush. 
+ run, err := st.GetLastSuccessfulSync(src.ID) + require.NoError(err) + require.True(run.CursorBefore.Valid, "cursor_before should be set after per-conversation checkpoint flush") + + // The stored checkpoint must parse as a SyncState containing the synced chat cursor. + state, err := LoadSyncState(run.CursorBefore.String) + require.NoError(err) + cursor := state.ChatCursor("19:x@thread.v2") + require.NotEmpty(cursor, "SyncState in cursor_before should have a cursor for the synced chat") +} + +// TestResumeFromCheckpoint verifies that after a failed sync (simulated by writing a +// checkpoint without completing), a fresh Import merges the checkpoint cursor so +// conversations already covered by the checkpoint start from their advanced cursor. +func TestResumeFromCheckpoint(t *testing.T) { + require := require.New(t) + assert := assert.New(t) + + // Server that returns one chat with one message newer than our pre-seeded cursor. + var requestedSince string + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + switch { + case r.URL.Path == "/me/chats": + _, _ = w.Write([]byte(`{"value":[{"id":"19:x@thread.v2","chatType":"oneOnOne","topic":"DM"}]}`)) + case strings.Contains(r.URL.Path, "/members"): + _, _ = w.Write([]byte(`{"value":[]}`)) + case strings.Contains(r.URL.Path, "/messages"): + requestedSince = r.URL.Query().Get("$filter") + _, _ = w.Write([]byte(`{"value":[ + {"id":"m2","createdDateTime":"2025-06-01T00:00:00Z","lastModifiedDateTime":"2025-06-01T00:00:00Z", + "body":{"contentType":"text","content":"second message"}} + ]}`)) + default: + http.Error(w, "404", http.StatusNotFound) + } + })) + defer srv.Close() + + st := testutil.NewTestStore(t) + src, err := st.GetOrCreateSource("teams", "me@example.com") + require.NoError(err) + + // Simulate a prior interrupted sync: start a sync run, write a checkpoint with a + // SyncState that already has a cursor for the chat, 
then fail the run (not complete). + // This is what would happen if the importer checkpointed mid-run then crashed. + checkpointState := NewSyncState() + checkpointState.SetChatCursor("19:x@thread.v2", "2025-03-01T00:00:00.000000000Z") + blob, _ := checkpointState.Marshal() + + syncID, err := st.StartSync(src.ID, "teams") + require.NoError(err) + require.NoError(st.UpdateSyncCheckpoint(syncID, &store.Checkpoint{ + PageToken: blob, + MessagesProcessed: 5, + })) + require.NoError(st.FailSync(syncID, "simulated crash")) + + // Now run a fresh import. It should pick up the checkpoint cursor. + c := NewClient(srv.URL, func(context.Context) (string, error) { return "t", nil }, 50) + imp := NewImporter(st, c) + sum, err := imp.Import(context.Background(), ImportOptions{ + Email: "me@example.com", + IncludeChannels: false, + }) + require.NoError(err) + assert.EqualValues(1, sum.ChatsProcessed) + + // The ListChatMessages request should have been made with the checkpoint cursor, + // meaning the since parameter was non-empty (the fake server captures it). + assert.NotEmpty(requestedSince, "import should have requested messages since the checkpoint cursor") +} + +// TestFullIgnoresCursor verifies that ImportOptions.Full forces a full backfill: +// even when a prior completed sync left a cursor for the chat, the messages +// request is made with no $filter (since), so already-seen messages are +// re-fetched and re-persisted (repair path). 
+func TestFullIgnoresCursor(t *testing.T) { + require := require.New(t) + assert := assert.New(t) + + var requestedSince string + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + switch { + case r.URL.Path == "/me/chats": + _, _ = w.Write([]byte(`{"value":[{"id":"19:x@thread.v2","chatType":"oneOnOne","topic":"DM"}]}`)) + case strings.Contains(r.URL.Path, "/members"): + _, _ = w.Write([]byte(`{"value":[]}`)) + case strings.Contains(r.URL.Path, "/messages"): + requestedSince = r.URL.Query().Get("$filter") + _, _ = w.Write([]byte(`{"value":[ + {"id":"m1","createdDateTime":"2025-01-01T00:00:00Z","lastModifiedDateTime":"2025-01-01T00:00:00Z", + "body":{"contentType":"text","content":"hello"}} + ]}`)) + default: + http.Error(w, "404", http.StatusNotFound) + } + })) + defer srv.Close() + + st := testutil.NewTestStore(t) + src, err := st.GetOrCreateSource("teams", "me@example.com") + require.NoError(err) + + // Seed a prior completed sync whose cursor would normally skip m1. 
+ priorState := NewSyncState() + priorState.SetChatCursor("19:x@thread.v2", "2025-03-01T00:00:00.000000000Z") + blob, _ := priorState.Marshal() + syncID, err := st.StartSync(src.ID, "teams") + require.NoError(err) + require.NoError(st.CompleteSync(syncID, blob)) + + c := NewClient(srv.URL, func(context.Context) (string, error) { return "t", nil }, 50) + imp := NewImporter(st, c) + sum, err := imp.Import(context.Background(), ImportOptions{ + Email: "me@example.com", + IncludeChannels: false, + Full: true, + }) + require.NoError(err) + assert.EqualValues(1, sum.ChatsProcessed) + assert.Empty(requestedSince, "Full=true should request a full backfill with no $filter cursor") + assert.EqualValues(1, sum.MessagesProcessed, "the previously-seen message should be re-fetched") +} + +func TestImportMigratesLegacyRawMessageID(t *testing.T) { + require := require.New(t) + assert := assert.New(t) + + srv := fakeChatGraph(t) + defer srv.Close() + st := testutil.NewTestStore(t) + src, err := st.GetOrCreateSource("teams", "me@example.com") + require.NoError(err) + convID, err := st.EnsureConversationWithType(src.ID, "19:x@thread.v2", "direct_chat", "DM") + require.NoError(err) + legacyID, err := st.UpsertMessage(&store.Message{ + ConversationID: convID, + SourceID: src.ID, + SourceMessageID: "m1", + MessageType: "teams", + }) + require.NoError(err) + + imp := NewImporter(st, NewClient(srv.URL, func(context.Context) (string, error) { return "t", nil }, 50)) + _, err = imp.Import(context.Background(), ImportOptions{ + Email: "me@example.com", + IncludeChannels: false, + Full: true, + }) + require.NoError(err) + + var rowCount int + require.NoError(st.DB().QueryRow( + st.Rebind(`SELECT COUNT(*) FROM messages WHERE source_id = ?`), + src.ID, + ).Scan(&rowCount)) + assert.Equal(1, rowCount) + + var gotID int64 + require.NoError(st.DB().QueryRow( + st.Rebind(`SELECT id FROM messages WHERE source_message_id = ?`), + chatSourceMessageID("19:x@thread.v2", "m1"), + ).Scan(&gotID)) + 
assert.Equal(legacyID, gotID) + + var rawCount int + require.NoError(st.DB().QueryRow(`SELECT COUNT(*) FROM messages WHERE source_message_id = 'm1'`).Scan(&rawCount)) + assert.Equal(0, rawCount) +} + +func TestImportMigratesLegacyRawMessageIDBeforeDelete(t *testing.T) { + require := require.New(t) + assert := assert.New(t) + + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + switch { + case r.URL.Path == "/me/chats": + _, _ = w.Write([]byte(`{"value":[{"id":"19:x@thread.v2","chatType":"oneOnOne","topic":"DM"}]}`)) + case strings.HasSuffix(r.URL.Path, "/members"): + _, _ = w.Write([]byte(`{"value":[]}`)) + case strings.Contains(r.URL.Path, "/messages"): + _, _ = w.Write([]byte(`{"value":[{"id":"m1","createdDateTime":"2025-01-01T00:00:00Z","lastModifiedDateTime":"2025-01-01T00:00:00Z","deletedDateTime":"2025-01-02T00:00:00Z"}]}`)) + default: + http.Error(w, "404", http.StatusNotFound) + } + })) + defer srv.Close() + + st := testutil.NewTestStore(t) + src, err := st.GetOrCreateSource("teams", "me@example.com") + require.NoError(err) + convID, err := st.EnsureConversationWithType(src.ID, "19:x@thread.v2", "direct_chat", "DM") + require.NoError(err) + _, err = st.UpsertMessage(&store.Message{ + ConversationID: convID, + SourceID: src.ID, + SourceMessageID: "m1", + MessageType: "teams", + }) + require.NoError(err) + + imp := NewImporter(st, NewClient(srv.URL, func(context.Context) (string, error) { return "t", nil }, 50)) + _, err = imp.Import(context.Background(), ImportOptions{ + Email: "me@example.com", + IncludeChannels: false, + Full: true, + }) + require.NoError(err) + + var deleted sql.NullTime + require.NoError(st.DB().QueryRow( + st.Rebind(`SELECT deleted_from_source_at FROM messages WHERE source_message_id = ?`), + chatSourceMessageID("19:x@thread.v2", "m1"), + ).Scan(&deleted)) + assert.True(deleted.Valid) + + var rawCount int + 
require.NoError(st.DB().QueryRow(`SELECT COUNT(*) FROM messages WHERE source_message_id = 'm1'`).Scan(&rawCount)) + assert.Equal(0, rawCount) +} + +// TestRawBlobPreservesEventDetail proves the raw archive blob (json.Marshal(gm)) +// retains the eventDetail field, since EventDetail is json.RawMessage. +func TestRawBlobPreservesEventDetail(t *testing.T) { + gm := &ChatMessage{ + ID: "sys1", + EventDetail: json.RawMessage([]byte(`{"@odata.type":"#microsoft.graph.callRecordingEventMessageDetail","callRecordingUrl":"https://sp/rec.mp4","callRecordingDisplayName":"Dev guild"}`)), + } + raw, err := json.Marshal(gm) + require.NoError(t, err) + assert.Contains(t, string(raw), "callRecordingUrl") + assert.Contains(t, string(raw), "https://sp/rec.mp4") +} + +func TestTeamsImportEnqueuesPersistedMessages(t *testing.T) { + require := require.New(t) + assert := assert.New(t) + + srv := fakeChatGraph(t) + defer srv.Close() + st := testutil.NewTestStore(t) + enqueuer := &recordingEnqueuer{} + + imp := NewImporter(st, NewClient(srv.URL, func(context.Context) (string, error) { return "t", nil }, 50)) + _, err := imp.Import(context.Background(), ImportOptions{ + Email: "me@example.com", + IncludeChannels: false, + EmbedEnqueuer: enqueuer, + }) + require.NoError(err) + require.Len(enqueuer.ids, 1) + + var storedID int64 + require.NoError(st.DB().QueryRow( + st.Rebind(`SELECT id FROM messages WHERE source_message_id = ?`), + chatSourceMessageID("19:x@thread.v2", "m1"), + ).Scan(&storedID)) + assert.Equal([]int64{storedID}, enqueuer.ids) +} + +func TestTeamsReimportReplacesRemovedChildCollections(t *testing.T) { + require := require.New(t) + assert := assert.New(t) + + includeChildren := true + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + switch { + case r.URL.Path == "/me/chats": + _, _ = w.Write([]byte(`{"value":[{"id":"19:child@thread.v2","chatType":"group","topic":"Chat"}]}`)) + case 
strings.HasSuffix(r.URL.Path, "/members"): + _, _ = w.Write([]byte(`{"value":[{"id":"mem1","userId":"aad-bob","email":"bob@example.com","displayName":"Bob"}]}`)) + case strings.Contains(r.URL.Path, "/messages"): + children := "" + if includeChildren { + children = `,"attachments":[{"id":"a1","contentType":"reference","contentUrl":"https://sp/file.docx","name":"file.docx"}]` + + `,"mentions":[{"id":0,"mentionText":"Bob","mentioned":{"user":{"id":"aad-bob","displayName":"Bob","userIdentityType":"aadUser"}}}]` + + `,"reactions":[{"reactionType":"like","createdDateTime":"2025-01-01T00:00:01Z","user":{"user":{"id":"aad-bob","displayName":"Bob","userIdentityType":"aadUser"}}}]` + } + _, _ = w.Write([]byte(`{"value":[{"id":"msg1","createdDateTime":"2025-01-01T00:00:00Z","lastModifiedDateTime":"2025-01-01T00:00:00Z","from":{"user":{"id":"alice@example.com","displayName":"Alice","userIdentityType":"emailUser"}},"body":{"contentType":"text","content":"hello"}` + children + `}]}`)) + default: + http.Error(w, "404", http.StatusNotFound) + } + })) + defer srv.Close() + st := testutil.NewTestStore(t) + imp := NewImporter(st, NewClient(srv.URL, func(context.Context) (string, error) { return "t", nil }, 50)) + + _, err := imp.Import(context.Background(), ImportOptions{Email: "me@example.com", IncludeChannels: false, Full: true}) + require.NoError(err) + includeChildren = false + _, err = imp.Import(context.Background(), ImportOptions{Email: "me@example.com", IncludeChannels: false, Full: true}) + require.NoError(err) + + var msgID int64 + require.NoError(st.DB().QueryRow( + st.Rebind(`SELECT id FROM messages WHERE source_message_id = ?`), + chatSourceMessageID("19:child@thread.v2", "msg1"), + ).Scan(&msgID)) + for table, query := range map[string]string{ + "mentions": `SELECT COUNT(*) FROM message_recipients WHERE message_id = ? 
AND recipient_type = 'mention'`, + "reactions": `SELECT COUNT(*) FROM reactions WHERE message_id = ?`, + "attachments": `SELECT COUNT(*) FROM attachments WHERE message_id = ? AND storage_path LIKE 'https://%'`, + } { + var count int + require.NoError(st.DB().QueryRow(st.Rebind(query), msgID).Scan(&count), table) + assert.Equal(0, count, table) + } +} + +// TestCallRecordingAndAttachmentsPersisted verifies that: +// - a systemEventMessage's eventDetail call-recording link is stored as an attachment, +// - a non-reference/reference attachment carrying a contentUrl is stored as an attachment, +// - the recording URL is indexed into the message body text. +func TestCallRecordingAndAttachmentsPersisted(t *testing.T) { + require := require.New(t) + assert := assert.New(t) + + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + switch { + case r.URL.Path == "/me/chats": + _, _ = w.Write([]byte(`{"value":[{"id":"19:rec@thread.v2","chatType":"oneOnOne","topic":"DM"}]}`)) + case strings.HasSuffix(r.URL.Path, "/members"): + _, _ = w.Write([]byte(`{"value":[]}`)) + case strings.Contains(r.URL.Path, "/messages"): + _, _ = w.Write([]byte(`{"value":[ + {"id":"m1","createdDateTime":"2025-01-01T00:00:00Z","lastModifiedDateTime":"2025-01-01T00:00:00Z", + "body":{"contentType":"html","content":"

here is the deck

"}, + "attachments":[{"id":"a1","contentType":"reference","contentUrl":"https://sp/deck.pptx","name":"deck.pptx"}]}, + {"id":"sys1","messageType":"unknownFutureValue","createdDateTime":"2025-01-02T00:00:00Z","lastModifiedDateTime":"2025-01-02T00:00:00Z", + "body":{"contentType":"html","content":""}, + "eventDetail":{"@odata.type":"#microsoft.graph.callRecordingEventMessageDetail","callRecordingUrl":"https://sp/rec.mp4","callRecordingDisplayName":"Dev guild"}} + ]}`)) + default: + http.Error(w, "404", http.StatusNotFound) + } + })) + defer srv.Close() + + st := testutil.NewTestStore(t) + c := NewClient(srv.URL, func(context.Context) (string, error) { return "t", nil }, 50) + imp := NewImporter(st, c) + _, err := imp.Import(context.Background(), ImportOptions{Email: "me@example.com", IncludeChannels: false}) + require.NoError(err) + + // Recording attachment row exists. + var recCount int + require.NoError(st.DB().QueryRow(` + SELECT COUNT(*) FROM attachments a + JOIN messages m ON m.id = a.message_id + WHERE a.storage_path = 'https://sp/rec.mp4'`).Scan(&recCount)) + assert.Equal(1, recCount, "recording attachment row should exist") + var recHash sql.NullString + require.NoError(st.DB().QueryRow(` + SELECT a.content_hash FROM attachments a + JOIN messages m ON m.id = a.message_id + WHERE a.storage_path = 'https://sp/rec.mp4'`).Scan(&recHash)) + assert.False(recHash.Valid && recHash.String != "", "URL-backed recording links should not look exportable by content hash") + + // Reference attachment row exists. 
+ var refCount int + require.NoError(st.DB().QueryRow(` + SELECT COUNT(*) FROM attachments a + JOIN messages m ON m.id = a.message_id + WHERE a.storage_path = 'https://sp/deck.pptx'`).Scan(&refCount)) + assert.Equal(1, refCount, "reference attachment row should exist") + var refHash sql.NullString + require.NoError(st.DB().QueryRow(` + SELECT a.content_hash FROM attachments a + JOIN messages m ON m.id = a.message_id + WHERE a.storage_path = 'https://sp/deck.pptx'`).Scan(&refHash)) + assert.False(refHash.Valid && refHash.String != "", "URL-backed reference links should not look exportable by content hash") + + // Body text for the system message contains the recording URL. + var bodyText sql.NullString + require.NoError(st.DB().QueryRow(st.Rebind(` + SELECT mb.body_text FROM message_bodies mb + JOIN messages m ON m.id = mb.message_id + WHERE m.source_message_id = ?`), chatSourceMessageID("19:rec@thread.v2", "sys1")).Scan(&bodyText)) + assert.True(bodyText.Valid) + assert.Contains(bodyText.String, "rec.mp4") +} + +func TestTeamsMixedInlineAndLinkAttachmentsRefreshMessageStats(t *testing.T) { + require := require.New(t) + assert := assert.New(t) + + serverURL := "" + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + switch { + case strings.Contains(r.URL.Path, "/hostedContents/") && strings.HasSuffix(r.URL.Path, "/$value"): + w.Header().Set("Content-Type", "image/png") + _, _ = w.Write([]byte("PNGDATA")) + case r.URL.Path == "/me/chats": + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{"value":[{"id":"19:mixed@thread.v2","chatType":"oneOnOne","topic":"DM"}]}`)) + case strings.HasSuffix(r.URL.Path, "/members"): + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{"value":[]}`)) + case strings.Contains(r.URL.Path, "/messages"): + w.Header().Set("Content-Type", "application/json") + body := `
` + _, _ = w.Write([]byte(`{"value":[{ + "id":"m1", + "createdDateTime":"2025-01-01T00:00:00Z", + "lastModifiedDateTime":"2025-01-01T00:00:00Z", + "body":{"contentType":"html","content":` + jsonString(t, body) + `}, + "attachments":[{"id":"a1","contentType":"reference","contentUrl":"https://sp/file.docx","name":"file.docx"}] + }]}`)) + default: + http.Error(w, "404", http.StatusNotFound) + } + })) + serverURL = srv.URL + defer srv.Close() + + st := testutil.NewTestStore(t) + imp := NewImporter(st, NewClient(srv.URL, func(context.Context) (string, error) { return "t", nil }, 50)) + _, err := imp.Import(context.Background(), ImportOptions{ + Email: "me@example.com", + IncludeChannels: false, + AttachmentsDir: t.TempDir(), + }) + require.NoError(err) + + var hasAttachments bool + var messageAttachmentCount int + var actualAttachmentRows int + require.NoError(st.DB().QueryRow(st.Rebind(` + SELECT m.has_attachments, m.attachment_count, COUNT(a.id) + FROM messages m + LEFT JOIN attachments a ON a.message_id = m.id + WHERE m.source_message_id = ? + GROUP BY m.id, m.has_attachments, m.attachment_count + `), chatSourceMessageID("19:mixed@thread.v2", "m1")).Scan(&hasAttachments, &messageAttachmentCount, &actualAttachmentRows)) + assert.True(hasAttachments) + assert.Equal(2, actualAttachmentRows) + assert.Equal(2, messageAttachmentCount) +} + +func TestDuplicateMentionDedup(t *testing.T) { + require := require.New(t) + assert := assert.New(t) + + // Message that @mentions bob twice should produce exactly one 'mention' row + // and sum.Errors should remain 0. 
+ srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + switch { + case r.URL.Path == "/me/chats": + _, _ = w.Write([]byte(`{"value":[{"id":"19:chat1@thread.v2","chatType":"oneOnOne","topic":"Chat"}]}`)) + case strings.HasSuffix(r.URL.Path, "/members"): + _, _ = w.Write([]byte(`{"value":[ + {"id":"mem1","userId":"aad-alice","email":"alice@x.com","displayName":"Alice"}, + {"id":"mem2","userId":"aad-bob","email":"bob@x.com","displayName":"Bob"} + ]}`)) + case strings.Contains(r.URL.Path, "/messages"): + // Two mention entries for bob (same aad id). + _, _ = w.Write([]byte(`{"value":[{ + "id":"msg1", + "createdDateTime":"2025-01-01T00:00:00Z", + "lastModifiedDateTime":"2025-01-01T00:00:00Z", + "from":{"user":{"id":"alice@x.com","displayName":"Alice","userIdentityType":"emailUser"}}, + "body":{"contentType":"text","content":"hey @Bob @Bob"}, + "mentions":[ + {"id":0,"mentionText":"Bob","mentioned":{"user":{"id":"aad-bob","displayName":"Bob","userIdentityType":"aadUser"}}}, + {"id":1,"mentionText":"Bob","mentioned":{"user":{"id":"aad-bob","displayName":"Bob","userIdentityType":"aadUser"}}} + ] + }]}`)) + default: + http.Error(w, "not found", http.StatusNotFound) + } + })) + defer srv.Close() + + st := testutil.NewTestStore(t) + c := NewClient(srv.URL, func(context.Context) (string, error) { return "t", nil }, 50) + imp := NewImporter(st, c) + + sum, err := imp.Import(context.Background(), ImportOptions{Email: "me@example.com", IncludeChannels: false}) + require.NoError(err) + assert.EqualValues(0, sum.Errors, "no errors expected") + + var msgID int64 + require.NoError(st.DB().QueryRow( + st.Rebind(`SELECT id FROM messages WHERE source_message_id = ?`), + chatSourceMessageID("19:chat1@thread.v2", "msg1"), + ).Scan(&msgID)) + + var mentionCount int + require.NoError(st.DB().QueryRow(st.Rebind(` + SELECT COUNT(*) FROM message_recipients mr + JOIN participants p ON p.id = 
mr.participant_id + WHERE mr.message_id = ? AND mr.recipient_type = 'mention' AND p.email_address = 'bob@x.com' + `), msgID).Scan(&mentionCount)) + assert.Equal(1, mentionCount, "duplicate @mention should produce exactly one mention row") +} diff --git a/internal/teams/mapping.go b/internal/teams/mapping.go new file mode 100644 index 000000000..2a80ff117 --- /dev/null +++ b/internal/teams/mapping.go @@ -0,0 +1,91 @@ +package teams + +import ( + "database/sql" + "net/url" + "strings" + + "go.kenn.io/msgvault/internal/mime" + "go.kenn.io/msgvault/internal/store" +) + +// htmlToText converts an HTML string to plain text by delegating to +// mime.StripHTML, which strips tags, decodes entities, and normalises +// whitespace. It is a thin wrapper so tests can target it directly. +func htmlToText(html string) string { + return mime.StripHTML(html) +} + +func snippet(text string) string { + r := []rune(text) + if len(r) > 100 { + return string(r[:100]) + } + return text +} + +// recordingLine renders a call-recording pointer for inclusion in the message +// body/snippet/FTS so the URL is visible and searchable. +func recordingLine(name, url string) string { + if name != "" { + return "📹 recording: " + name + " " + url + } + return "📹 recording: " + url +} + +// mapMessage converts a Graph API ChatMessage into a store.Message and the +// plain-text body. conversationID and sourceID are the internal DB IDs. 
+func mapMessage(gm *ChatMessage, conversationID, sourceID int64, sourceMessageID string) (store.Message, string) { + text := gm.Body.Content + if strings.EqualFold(gm.Body.ContentType, "html") { + text = htmlToText(gm.Body.Content) + } + attCount := len(gm.Attachments) + attCount += len(hostedRe.FindAllString(gm.Body.Content, -1)) + if url, name, ok := gm.callRecording(); ok { + line := recordingLine(name, url) + if text != "" { + text += "\n" + line + } else { + text = line + } + attCount++ + } + msg := store.Message{ + ConversationID: conversationID, + SourceID: sourceID, + SourceMessageID: sourceMessageID, + MessageType: "teams", + SentAt: sql.NullTime{Time: gm.CreatedDateTime, Valid: !gm.CreatedDateTime.IsZero()}, + ReceivedAt: sql.NullTime{Time: gm.CreatedDateTime, Valid: !gm.CreatedDateTime.IsZero()}, + Snippet: sql.NullString{String: snippet(text), Valid: text != ""}, + HasAttachments: attCount > 0, + AttachmentCount: attCount, + } + if gm.Subject != "" { + msg.Subject = sql.NullString{String: gm.Subject, Valid: true} + } + return msg, text +} + +func chatSourceMessageID(chatID, messageID string) string { + return "chat:" + escapeSourceIDPart(chatID) + ":" + escapeSourceIDPart(messageID) +} + +func channelSourceMessageID(teamID, channelID, messageID string) string { + return "channel:" + escapeSourceIDPart(teamID) + ":" + escapeSourceIDPart(channelID) + ":" + escapeSourceIDPart(messageID) +} + +func escapeSourceIDPart(part string) string { + return url.QueryEscape(part) +} + +// conversationType maps a Graph API chatType string to the msgvault +// conversation type. "oneOnOne" becomes "direct_chat"; everything else +// (group, meeting, unknownFutureValue, …) becomes "group_chat". 
+func conversationType(chatType string) string { + if chatType == "oneOnOne" { + return "direct_chat" + } + return "group_chat" +} diff --git a/internal/teams/mapping_test.go b/internal/teams/mapping_test.go new file mode 100644 index 000000000..e76fcbb60 --- /dev/null +++ b/internal/teams/mapping_test.go @@ -0,0 +1,122 @@ +package teams + +import ( + "encoding/json" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestHTMLToText(t *testing.T) { + got := htmlToText(`

Hello Bob see link

`) + assert.Contains(t, got, "Hello") + assert.Contains(t, got, "Bob") + assert.NotContains(t, got, "

") +} + +func TestMapMessageBasics(t *testing.T) { + assert := assert.New(t) + gm := &ChatMessage{ + ID: "m1", + CreatedDateTime: time.Date(2025, 1, 2, 3, 4, 5, 0, time.UTC), + LastModifiedDateTime: time.Date(2025, 1, 2, 3, 4, 5, 0, time.UTC), + Body: MessageBody{ContentType: "html", Content: "

hi there

"}, + Attachments: []Attachment{{ContentType: "reference", ContentURL: "http://sp/f", Name: "f.docx"}}, + } + msg, text := mapMessage(gm, 10, 20, chatSourceMessageID("chatA", gm.ID)) + assert.Equal("teams", msg.MessageType) + assert.Equal("chat:chatA:m1", msg.SourceMessageID) + assert.True(msg.SentAt.Valid) + assert.True(msg.HasAttachments) + assert.Equal(1, msg.AttachmentCount) + assert.Equal("hi there", text) + assert.Contains(msg.Snippet.String, "hi there") +} + +func TestCallRecordingParsing(t *testing.T) { + assert := assert.New(t) + + gm := &ChatMessage{ + EventDetail: json.RawMessage([]byte(`{"@odata.type":"#microsoft.graph.callRecordingEventMessageDetail","callRecordingUrl":"https://sp/rec.mp4","callRecordingDisplayName":"Dev guild"}`)), + } + url, name, ok := gm.callRecording() + assert.True(ok) + assert.Equal("https://sp/rec.mp4", url) + assert.Equal("Dev guild", name) + + // Negative: a membersAdded-style event has no callRecordingUrl. + gmNoRec := &ChatMessage{ + EventDetail: json.RawMessage([]byte(`{"@odata.type":"#microsoft.graph.membersAddedEventMessageDetail","members":["aad-bob"]}`)), + } + _, _, ok = gmNoRec.callRecording() + assert.False(ok) + + // Negative: empty EventDetail. 
+ gmEmpty := &ChatMessage{} + _, _, ok = gmEmpty.callRecording() + assert.False(ok) +} + +func TestMapMessageRecording(t *testing.T) { + assert := assert.New(t) + + gm := &ChatMessage{ + ID: "sys1", + CreatedDateTime: time.Date(2025, 1, 2, 0, 0, 0, 0, time.UTC), + LastModifiedDateTime: time.Date(2025, 1, 2, 0, 0, 0, 0, time.UTC), + Body: MessageBody{ContentType: "html", Content: ""}, + EventDetail: json.RawMessage([]byte(`{"@odata.type":"#microsoft.graph.callRecordingEventMessageDetail","callRecordingUrl":"https://sp/rec.mp4","callRecordingDisplayName":"Dev guild"}`)), + } + msg, text := mapMessage(gm, 10, 20, chatSourceMessageID("chatA", gm.ID)) + assert.Contains(text, "📹 recording") + assert.Contains(text, "https://sp/rec.mp4") + assert.True(msg.HasAttachments) + assert.Equal(1, msg.AttachmentCount) + assert.Contains(msg.Snippet.String, "📹 recording") + assert.Contains(msg.Snippet.String, "https://sp/rec.mp4") +} + +func TestMapMessageHostedContentCountsInlineImages(t *testing.T) { + assert := assert.New(t) + + gm := &ChatMessage{ + ID: "inline1", + Body: MessageBody{ + ContentType: "html", + Content: `

` + + `

`, + }, + } + + msg, _ := mapMessage(gm, 10, 20, chatSourceMessageID("chatA", gm.ID)) + assert.True(msg.HasAttachments) + assert.Equal(2, msg.AttachmentCount) +} + +func TestTeamsSourceMessageIDNamespacesConversations(t *testing.T) { + assert.Equal(t, "chat:chatA:m1", chatSourceMessageID("chatA", "m1")) + assert.Equal(t, "chat:chatB:m1", chatSourceMessageID("chatB", "m1")) + assert.Equal(t, "channel:team1:chanA:m1", channelSourceMessageID("team1", "chanA", "m1")) +} + +// TestChatMessageRawPreservesUnknownFields verifies that decoding a ChatMessage +// retains the full original JSON (including fields we do not model, e.g. webUrl), +// so the archived raw blob is truly lossless. +func TestChatMessageRawPreservesUnknownFields(t *testing.T) { + assert := assert.New(t) + src := `{"id":"m1","webUrl":"https://teams/msg/m1","summary":"s","body":{"contentType":"text","content":"hi"}}` + var gm ChatMessage + require.NoError(t, json.Unmarshal([]byte(src), &gm)) + assert.Equal("m1", gm.ID) + assert.Equal("hi", gm.Body.Content) + assert.Contains(string(gm.Raw), "webUrl") + assert.Contains(string(gm.Raw), "https://teams/msg/m1") +} + +func TestConversationType(t *testing.T) { + assert.Equal(t, "direct_chat", conversationType("oneOnOne")) + assert.Equal(t, "group_chat", conversationType("group")) + assert.Equal(t, "group_chat", conversationType("meeting")) +} diff --git a/internal/teams/participants.go b/internal/teams/participants.go new file mode 100644 index 000000000..b491a215f --- /dev/null +++ b/internal/teams/participants.go @@ -0,0 +1,107 @@ +package teams + +import ( + "context" + "strings" + + "go.kenn.io/msgvault/internal/store" +) + +type userLookup interface { + GetUser(ctx context.Context, id string) (*GraphUser, error) +} + +type participantResolver struct { + store *store.Store + lookup userLookup + cache map[string]int64 +} + +func newParticipantResolver(s *store.Store, lookup userLookup) *participantResolver { + return &participantResolver{store: s, lookup: 
lookup, cache: map[string]int64{}} +} + +func (r *participantResolver) resolve(ctx context.Context, id *Identity) (int64, error) { + if id == nil || id.ID == "" { + return 0, nil + } + if pid, ok := r.cache[id.ID]; ok { + return pid, nil + } + var pid int64 + var err error + switch id.UserIdentityType { + case "emailUser": + pid, err = r.byEmail(id.ID, id.DisplayName) + case "aadUser", "onPremiseAadUser": + email := r.lookupMail(ctx, id.ID) + if email != "" { + pid, err = r.byEmail(email, id.DisplayName) + } else { + pid, err = r.store.EnsureParticipantByIdentifier("teams", id.ID, id.DisplayName) + } + default: + pid, err = r.store.EnsureParticipantByIdentifier("teams", id.ID, id.DisplayName) + } + if err != nil { + return 0, err + } + r.cache[id.ID] = pid + return pid, nil +} + +func (r *participantResolver) byEmail(email, displayName string) (int64, error) { + domain := "" + if at := strings.LastIndex(email, "@"); at >= 0 { + domain = strings.ToLower(email[at+1:]) + } + return r.store.EnsureParticipant(strings.ToLower(email), displayName, domain) +} + +func (r *participantResolver) lookupMail(ctx context.Context, objectID string) string { + if r.lookup == nil { + return "" + } + u, err := r.lookup.GetUser(ctx, objectID) + if err != nil || u == nil { + return "" + } + if u.Mail != "" { + return u.Mail + } + if strings.Contains(u.UserPrincipalName, "@") && !strings.Contains(u.UserPrincipalName, "#EXT#") { + return u.UserPrincipalName + } + return "" +} + +// resolveMember resolves a ChatMember to a participant ID, using the member's +// email when available and falling back to identifier-based resolution otherwise. +// The resolved ID is cached by the member's user object ID to enable mention +// resolution to find the same participant via the participant cache. 
+func (r *participantResolver) resolveMember(ctx context.Context, m ChatMember) (int64, error) { + id := m.UserID + if id == "" { + id = m.ID + } + if id == "" { + return 0, nil + } + if pid, ok := r.cache[id]; ok { + return pid, nil + } + var pid int64 + var err error + if m.Email != "" { + pid, err = r.byEmail(m.Email, m.DisplayName) + } else { + pid, err = r.resolve(ctx, &Identity{ID: id, DisplayName: m.DisplayName, UserIdentityType: "aadUser"}) + } + if err != nil { + return 0, err + } + if pid != 0 { + r.cache[id] = pid + } + return pid, nil +} diff --git a/internal/teams/participants_test.go b/internal/teams/participants_test.go new file mode 100644 index 000000000..698160f63 --- /dev/null +++ b/internal/teams/participants_test.go @@ -0,0 +1,53 @@ +package teams + +import ( + "context" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.kenn.io/msgvault/internal/testutil" +) + +func TestResolveParticipant_EmailUserUsesIDAsEmail(t *testing.T) { + st := testutil.NewTestStore(t) + r := newParticipantResolver(st, nil) + id := &Identity{ID: "alice@outlook.com", DisplayName: "Alice", UserIdentityType: "emailUser"} + pid, err := r.resolve(context.Background(), id) + require.NoError(t, err) + assert.NotZero(t, pid) +} + +func TestResolveParticipant_AADUserResolvesMail(t *testing.T) { + require := require.New(t) + assert := assert.New(t) + st := testutil.NewTestStore(t) + fake := &fakeUserLookup{mail: map[string]string{"obj-1": "bob@example.com"}} + r := newParticipantResolver(st, fake) + id := &Identity{ID: "obj-1", DisplayName: "Bob", UserIdentityType: "aadUser"} + pid, err := r.resolve(context.Background(), id) + require.NoError(err) + assert.NotZero(pid) + + _, err = r.resolve(context.Background(), id) // cache hit + require.NoError(err) + assert.Equal(1, fake.calls) +} + +func TestResolveParticipant_NilReturnsZero(t *testing.T) { + st := testutil.NewTestStore(t) + r := newParticipantResolver(st, nil) + pid, err := 
r.resolve(context.Background(), nil) + require.NoError(t, err) + assert.Zero(t, pid) +} + +type fakeUserLookup struct { + mail map[string]string + calls int +} + +func (f *fakeUserLookup) GetUser(_ context.Context, id string) (*GraphUser, error) { + f.calls++ + return &GraphUser{ID: id, Mail: f.mail[id]}, nil +} diff --git a/internal/teams/syncstate.go b/internal/teams/syncstate.go new file mode 100644 index 000000000..b057056e5 --- /dev/null +++ b/internal/teams/syncstate.go @@ -0,0 +1,86 @@ +package teams + +import ( + "encoding/json" + "time" +) + +// SyncState holds per-conversation incremental cursors, persisted as JSON in +// sync_runs.cursor_after. Chats use a max-lastModifiedDateTime timestamp; +// channels use an @odata.deltaLink. +type SyncState struct { + Chats map[string]string `json:"chats"` // chatID -> max lastModifiedDateTime (RFC3339) + Channels map[string]string `json:"channels"` // "teamID/channelID" -> deltaLink +} + +func NewSyncState() *SyncState { + return &SyncState{Chats: map[string]string{}, Channels: map[string]string{}} +} + +func LoadSyncState(blob string) (*SyncState, error) { + s := NewSyncState() + if blob == "" { + return s, nil + } + if err := json.Unmarshal([]byte(blob), s); err != nil { + return nil, err + } + if s.Chats == nil { + s.Chats = map[string]string{} + } + if s.Channels == nil { + s.Channels = map[string]string{} + } + return s, nil +} + +func (s *SyncState) Marshal() (string, error) { + b, err := json.Marshal(s) + return string(b), err +} + +func (s *SyncState) ChatCursor(chatID string) string { return s.Chats[chatID] } +func (s *SyncState) SetChatCursor(chatID, cursor string) { s.Chats[chatID] = cursor } +func (s *SyncState) ChannelDelta(key string) string { return s.Channels[key] } +func (s *SyncState) SetChannelDelta(key, link string) { s.Channels[key] = link } + +// Merge incorporates cursors from other into s, keeping the more-advanced value for +// each conversation. If other is nil it is silently ignored. 
+// +// Chat cursors are RFC3339Nano timestamps. They are parsed before comparison +// because RFC3339Nano omits trailing fractional zeroes, so string ordering is +// not a reliable proxy for time ordering. +// +// Channel deltaLinks are opaque Graph tokens that cannot be compared by value; we +// always prefer other's link when it is non-empty, on the assumption that other +// represents a more recent (checkpoint) run whose cursor is at least as advanced. +func (s *SyncState) Merge(other *SyncState) { + if other == nil { + return + } + for chatID, cursor := range other.Chats { + if chatCursorAfter(cursor, s.Chats[chatID]) { + s.Chats[chatID] = cursor + } + } + for key, link := range other.Channels { + if link != "" { + s.Channels[key] = link + } + } +} + +func chatCursorAfter(candidate, existing string) bool { + if candidate == "" { + return false + } + if existing == "" { + return true + } + candidateTime, candidateErr := time.Parse(time.RFC3339Nano, candidate) + existingTime, existingErr := time.Parse(time.RFC3339Nano, existing) + if candidateErr == nil && existingErr == nil { + return candidateTime.After(existingTime) + } + return candidate > existing +} diff --git a/internal/teams/syncstate_test.go b/internal/teams/syncstate_test.go new file mode 100644 index 000000000..7a58272c5 --- /dev/null +++ b/internal/teams/syncstate_test.go @@ -0,0 +1,94 @@ +package teams + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestSyncStateRoundTrip(t *testing.T) { + require := require.New(t) + assert := assert.New(t) + s := NewSyncState() + s.SetChatCursor("19:abc@thread.v2", "2026-01-01T00:00:00Z") + s.SetChannelDelta("team1/chanA", "https://graph/delta?token=xyz") + + blob, err := s.Marshal() + require.NoError(err) + + got, err := LoadSyncState(blob) + require.NoError(err) + assert.Equal("2026-01-01T00:00:00Z", got.ChatCursor("19:abc@thread.v2")) + assert.Equal("https://graph/delta?token=xyz", 
got.ChannelDelta("team1/chanA")) + assert.Empty(got.ChatCursor("unknown")) +} + +func TestLoadSyncStateEmpty(t *testing.T) { + got, err := LoadSyncState("") + require.NoError(t, err) + assert.Empty(t, got.ChatCursor("anything")) + assert.Empty(t, got.ChannelDelta("anything")) +} + +func TestLoadSyncStateInvalid(t *testing.T) { + _, err := LoadSyncState("{not json") + require.Error(t, err) +} + +func TestSyncStateMerge(t *testing.T) { + assert := assert.New(t) + + // baseline: chatA=t1, channel key1=d1 + baseline := NewSyncState() + baseline.SetChatCursor("chatA", "2025-01-01T00:00:00.000000000Z") + baseline.SetChannelDelta("key1", "https://delta/d1") + + // other (checkpoint): chatA=t2 (later), chatB=t3, channel key1=d2 + other := NewSyncState() + other.SetChatCursor("chatA", "2025-06-01T00:00:00.000000000Z") + other.SetChatCursor("chatB", "2025-03-01T00:00:00.000000000Z") + other.SetChannelDelta("key1", "https://delta/d2") + + baseline.Merge(other) + + // chatA: other's value is lexicographically greater — use it + assert.Equal("2025-06-01T00:00:00.000000000Z", baseline.ChatCursor("chatA")) + // chatB: only in other — should be picked up + assert.Equal("2025-03-01T00:00:00.000000000Z", baseline.ChatCursor("chatB")) + // key1 channel: other's deltaLink preferred when present + assert.Equal("https://delta/d2", baseline.ChannelDelta("key1")) +} + +func TestSyncStateMergeNilOther(t *testing.T) { + s := NewSyncState() + s.SetChatCursor("chatA", "2025-01-01T00:00:00Z") + // Merging nil must not panic + s.Merge(nil) + assert.Equal(t, "2025-01-01T00:00:00Z", s.ChatCursor("chatA")) +} + +func TestSyncStateMergeBaselineWins(t *testing.T) { + assert := assert.New(t) + + baseline := NewSyncState() + baseline.SetChatCursor("chatA", "2025-06-01T00:00:00.000000000Z") // later + + other := NewSyncState() + other.SetChatCursor("chatA", "2025-01-01T00:00:00.000000000Z") // earlier + + baseline.Merge(other) + // baseline already has the later value — it should win + 
assert.Equal("2025-06-01T00:00:00.000000000Z", baseline.ChatCursor("chatA")) +} + +func TestSyncStateMergeParsesVariablePrecisionTimestamps(t *testing.T) { + baseline := NewSyncState() + baseline.SetChatCursor("chatA", "2025-01-01T00:00:00Z") + + other := NewSyncState() + other.SetChatCursor("chatA", "2025-01-01T00:00:00.1Z") + + baseline.Merge(other) + assert.Equal(t, "2025-01-01T00:00:00.1Z", baseline.ChatCursor("chatA")) +} diff --git a/internal/teams/types.go b/internal/teams/types.go new file mode 100644 index 000000000..3fb620109 --- /dev/null +++ b/internal/teams/types.go @@ -0,0 +1,195 @@ +package teams + +import ( + "context" + "encoding/json" + "time" +) + +// ---- Graph response envelopes ---- + +type listResponse[T any] struct { + Value []T `json:"value"` + NextLink string `json:"@odata.nextLink"` + DeltaLink string `json:"@odata.deltaLink"` +} + +// ---- Chats & channels ---- + +type Chat struct { + ID string `json:"id"` + ChatType string `json:"chatType"` // oneOnOne | group | meeting + Topic string `json:"topic"` + OnlineInfo *struct { + JoinWebURL string `json:"joinWebUrl"` + } `json:"onlineMeetingInfo"` +} + +type JoinedTeam struct { + ID string `json:"id"` + DisplayName string `json:"displayName"` +} + +type Channel struct { + ID string `json:"id"` + DisplayName string `json:"displayName"` + MembershipType string `json:"membershipType"` // standard | private | shared +} + +// ---- Messages ---- + +type ChatMessage struct { + ID string `json:"id"` + ReplyToID string `json:"replyToId"` + MessageType string `json:"messageType"` + CreatedDateTime time.Time `json:"createdDateTime"` + LastModifiedDateTime time.Time `json:"lastModifiedDateTime"` + DeletedDateTime *time.Time `json:"deletedDateTime"` + Subject string `json:"subject"` + Importance string `json:"importance"` + From *IdentitySet `json:"from"` + Body MessageBody `json:"body"` + Attachments []Attachment `json:"attachments"` + Mentions []Mention `json:"mentions"` + Reactions []Reaction 
`json:"reactions"` + // EventDetail is the polymorphic eventDetail payload (default-returned by + // Graph on systemEventMessage items). Kept as RawMessage so the typed lens + // below can parse the call-recording fields without modelling every subtype. + EventDetail json.RawMessage `json:"eventDetail,omitempty"` + // Raw holds the exact original JSON for this message, captured during decode + // (see UnmarshalJSON). It is archived verbatim so no Graph field is lost to + // our partial struct modelling. + Raw json.RawMessage `json:"-"` +} + +// UnmarshalJSON decodes a ChatMessage while retaining the exact original bytes +// in Raw, so the archived raw blob is lossless even for fields we do not model. +func (m *ChatMessage) UnmarshalJSON(b []byte) error { + type alias ChatMessage // avoid recursion into this method + var a alias + if err := json.Unmarshal(b, &a); err != nil { + return err + } + *m = ChatMessage(a) + m.Raw = append(json.RawMessage(nil), b...) + return nil +} + +// eventMessageDetail is the polymorphic eventDetail payload. Only the +// call-recording fields we surface are typed; the full JSON is preserved +// via ChatMessage.EventDetail. +type eventMessageDetail struct { + ODataType string `json:"@odata.type"` + CallRecordingURL string `json:"callRecordingUrl"` + CallRecordingDisplayName string `json:"callRecordingDisplayName"` +} + +// callRecording returns the recording URL and display name when this message's +// eventDetail is a callRecordingEventMessageDetail. ok is false otherwise. 
+func (m *ChatMessage) callRecording() (url, name string, ok bool) { + if len(m.EventDetail) == 0 { + return "", "", false + } + var d eventMessageDetail + if err := json.Unmarshal(m.EventDetail, &d); err != nil { + return "", "", false + } + if d.CallRecordingURL == "" { + return "", "", false + } + return d.CallRecordingURL, d.CallRecordingDisplayName, true +} + +type MessageBody struct { + ContentType string `json:"contentType"` // html | text + Content string `json:"content"` +} + +type IdentitySet struct { + User *Identity `json:"user"` + Application *Identity `json:"application"` +} + +type Identity struct { + ID string `json:"id"` + DisplayName string `json:"displayName"` + UserIdentityType string `json:"userIdentityType"` // aadUser | emailUser | anonymousGuest | skypeUser | ... +} + +type Attachment struct { + ID string `json:"id"` + ContentType string `json:"contentType"` // "reference" => shared file link + ContentURL string `json:"contentUrl"` + Name string `json:"name"` +} + +type Mention struct { + ID int `json:"id"` + MentionText string `json:"mentionText"` + Mentioned *IdentitySet `json:"mentioned"` +} + +type Reaction struct { + ReactionType string `json:"reactionType"` // like | heart | laugh | ... + CreatedDateTime time.Time `json:"createdDateTime"` + User *IdentitySet `json:"user"` +} + +// GraphUser is the subset of /users/{id} we resolve for participant email. 
type GraphUser struct {
	ID string `json:"id"`
	// Mail is preferred by the participant resolver when it is non-empty.
	Mail string `json:"mail"`
	// UserPrincipalName is the resolver's fallback when Mail is empty; it is
	// only used if it contains "@" and does not contain "#EXT#".
	UserPrincipalName string `json:"userPrincipalName"`
	DisplayName       string `json:"displayName"`
}

// ---- Importer options/summary ----

// ImportOptions carries the caller-supplied settings for an import run.
// NOTE(review): Email and AttachmentsDir are presumably the account being
// imported and the directory attachments are written to — confirm against the
// importer, which is not visible here.
type ImportOptions struct {
	Email          string
	AttachmentsDir string
	// NOTE(review): the Go zero value of IncludeChannels is false, so the
	// "default true" below must be applied by the caller (the CLI exposes a
	// --no-channels flag) — confirm.
	IncludeChannels bool      // default true; allows chats-only runs
	Limit           int       // 0 = no limit (per-conversation message cap, for scoped runs)
	After           time.Time // zero = no lower bound
	// Full forces a complete backfill: the prior sync cursor and any interrupted
	// checkpoint are ignored, so every chat/channel message is re-fetched and
	// re-persisted (upsert). Use to repair messages after an importer change.
	Full bool
	// OnlyIncomplete (BackfillInlineMedia only) restricts the run to messages
	// whose inline media was not fully downloaded, so transient fetch failures
	// can be retried without re-fetching everything.
	OnlyIncomplete bool
	// Progress, if non-nil, is called after each conversation with a human-readable
	// status line. Safe to leave nil (silent mode).
	// The `json:"-"` tag keeps this func-typed field out of any JSON encoding.
	Progress func(msg string) `json:"-"`
	// EmbedEnqueuer, if non-nil, queues persisted message IDs for vector embedding.
	// Enqueue failures are counted in the summary but do not abort the import.
	// The `json:"-"` tag keeps this interface field out of any JSON encoding.
	EmbedEnqueuer EmbedEnqueuer `json:"-"`
}

// EmbedEnqueuer is the consumer-side interface for queueing persisted message
// IDs for vector embedding; see ImportOptions.EmbedEnqueuer.
type EmbedEnqueuer interface {
	// EnqueueMessages queues the given message IDs and returns an error if
	// queueing fails.
	EnqueueMessages(ctx context.Context, messageIDs []int64) error
}

// ImportSummary holds the duration, source ID and counters reported for an
// import run. What increments each counter is decided by the importer, which
// is not visible here.
type ImportSummary struct {
	Duration           time.Duration
	SourceID           int64
	ChatsProcessed     int64
	ChannelsProcessed  int64
	MessagesProcessed  int64
	MessagesAdded      int64
	MessagesUpdated    int64
	ReactionsAdded     int64
	AttachmentsFound   int64
	InlineImagesCopied int64
	Participants       int64
	Errors             int64
}

// ChatMember is a member of a chat (direct or group), returned by /chats/{id}/members.
type ChatMember struct {
	// ID is used as the resolver's cache key only when UserID is empty.
	ID string `json:"id"`
	// UserID is the preferred key under which the participant resolver caches
	// the resolved participant.
	UserID string `json:"userId"`
	// Email, when non-empty, is used to resolve the participant directly.
	Email       string `json:"email"`
	DisplayName string `json:"displayName"`
}