// Content-addressing primitives: each helper below produces the lowercase
// hexadecimal form of a SHA-256 digest and nothing more. What the digest stands
// for (a source's identity, an artifact's identity, a cache key, ...) is left to
// the application; nothing here reads or writes Source.Hash.

// SHA256HexBytes hashes b with SHA-256 and returns the digest encoded as
// lowercase hex. A nil or empty slice yields the digest of the empty input.
func SHA256HexBytes(b []byte) string {
	digest := sha256.Sum256(b)
	return hex.EncodeToString(digest[:])
}

// SHA256HexString hashes the bytes of s with SHA-256 and returns the digest
// encoded as lowercase hex. It gives the same result as SHA256HexBytes([]byte(s)).
func SHA256HexString(s string) string {
	digest := sha256.Sum256([]byte(s))
	return hex.EncodeToString(digest[:])
}

// SHA256HexReader hashes everything r yields, copying it through the hash in
// chunks rather than loading it into memory first.
//
// It returns the lowercase-hex digest and the number of bytes consumed. ctx is
// consulted once up front and again before every Read; a Read that is already
// blocked is not interrupted. No size limit is applied, so callers that need
// one wrap r themselves (for example with io.LimitReader).
//
// On failure the digest is empty, n reports the bytes hashed before the failure
// (0 when ctx was already done on entry), and err wraps the cause so errors.Is
// still matches it.
func SHA256HexReader(ctx context.Context, r io.Reader) (hexDigest string, n int64, err error) {
	if ctxErr := ctx.Err(); ctxErr != nil {
		return "", 0, fmt.Errorf("content: hash aborted: %w", ctxErr)
	}

	hasher := sha256.New()
	src := &ctxReader{ctx: ctx, r: r}
	copied, copyErr := io.Copy(hasher, src)
	if copyErr != nil {
		return "", copied, fmt.Errorf("content: hash read: %w", copyErr)
	}
	return hex.EncodeToString(hasher.Sum(nil)), copied, nil
}

// ctxReader adapts an io.Reader so that a streaming copy can be abandoned when a
// context ends. The check is cooperative: it happens at the start of each Read,
// because io.Reader offers no way to pass a context to the read itself.
//
//nolint:containedctx // intentional ctx bridge for io.Reader, which has no ctx param
type ctxReader struct {
	ctx context.Context
	r   io.Reader
}

// Read returns the context's error without touching the wrapped reader once the
// context is done; otherwise it forwards the call (and its result, io.EOF
// included) unchanged.
func (cr *ctxReader) Read(p []byte) (int, error) {
	ctxErr := cr.ctx.Err()
	if ctxErr == nil {
		return cr.r.Read(p) //nolint:wrapcheck // pass-through of underlying reader, incl io.EOF
	}
	return 0, ctxErr //nolint:wrapcheck // sentinel ctx error; SHA256HexReader adds context
}
+const ( + hashABC = "ba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad" // "abc" + hashEmpty = "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855" // "" +) + +func TestSHA256HexBytesAndString(t *testing.T) { + if got := content.SHA256HexBytes([]byte("abc")); got != hashABC { + t.Fatalf("SHA256HexBytes = %q, want %q", got, hashABC) + } + if got := content.SHA256HexString("abc"); got != hashABC { + t.Fatalf("SHA256HexString = %q, want %q", got, hashABC) + } + if got := content.SHA256HexBytes(nil); got != hashEmpty { + t.Fatalf("SHA256HexBytes(nil) = %q, want empty-hash %q", got, hashEmpty) + } +} + +func TestSHA256HexReader(t *testing.T) { + h, n, err := content.SHA256HexReader(context.Background(), strings.NewReader("abc")) + if err != nil { + t.Fatalf("err: %v", err) + } + if n != 3 { + t.Fatalf("n = %d, want 3", n) + } + if h != hashABC { + t.Fatalf("hash = %q, want %q", h, hashABC) + } +} + +func TestSHA256HexReaderAgreesWithBytes(t *testing.T) { + const s = "the quick brown fox\n\nmultiple paragraphs" + want := content.SHA256HexString(s) + got, n, err := content.SHA256HexReader(context.Background(), strings.NewReader(s)) + if err != nil { + t.Fatalf("err: %v", err) + } + if got != want { + t.Fatalf("reader hash %q != string hash %q", got, want) + } + if n != int64(len(s)) { + t.Fatalf("n = %d, want %d", n, len(s)) + } +} + +func TestSHA256HexReaderCanceled(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + cancel() + _, _, err := content.SHA256HexReader(ctx, strings.NewReader("abc")) + if !errors.Is(err, context.Canceled) { + t.Fatalf("err = %v, want context.Canceled", err) + } +} diff --git a/extract/helpers.go b/extract/helpers.go new file mode 100644 index 0000000..b0bcc20 --- /dev/null +++ b/extract/helpers.go @@ -0,0 +1,76 @@ +package extract + +import ( + "errors" + "slices" + + "github.com/SnapdragonPartners/maestro-cms/content" +) + +// Errors returned by SingleTextArtifact. 
+var ( + // ErrNoTextArtifact indicates no text artifact was present. + ErrNoTextArtifact = errors.New("extract: no text artifact") + // ErrMultipleTextArtifacts indicates more than one text artifact was present, + // so picking one is an application policy decision, not the library's. + ErrMultipleTextArtifacts = errors.New("extract: multiple text artifacts") +) + +// Supports reports whether an extractor is registered for mediaType. It applies +// the same canonicalization as Extract (lowercased type, parameters dropped), so +// "text/plain", "Text/Plain", and "text/plain; charset=utf-8" all match alike. +func (r *Registry) Supports(mediaType content.MediaType) bool { + _, ok := r.Get(mediaType) + return ok +} + +// SupportedMediaTypes returns the canonical media types this registry can +// extract, sorted for deterministic output. The values are canonical (as stored +// by Register), so they round-trip through Get/Supports. +func (r *Registry) SupportedMediaTypes() []content.MediaType { + out := make([]content.MediaType, 0, len(r.extractors)) + for mt := range r.extractors { + out = append(out, mt) + } + slices.Sort(out) + return out +} + +// TextArtifacts returns the artifacts that carry an inline text payload — those +// with no Blob handle and non-empty Text. The test is structural (payload shape), +// not semantic: it does not inspect MediaType, so inline OCR, transcript, or +// caption text counts too. Callers that care about media type filter further. +// +// The returned artifacts are shallow copies: as with content.Artifact generally, +// each shares its Metadata map with the input (no deep clone). Use +// content.Artifact.Clone for an independent copy. 
+func TextArtifacts(artifacts []content.Artifact) []content.Artifact { + out := make([]content.Artifact, 0, len(artifacts)) + for i := range artifacts { + a := &artifacts[i] + if a.Blob == nil && a.Text != "" { + out = append(out, *a) + } + } + return out +} + +// SingleTextArtifact returns the one inline-text artifact in artifacts. It +// returns ErrNoTextArtifact when there is none and ErrMultipleTextArtifacts when +// there is more than one — the multi-artifact case is the application's policy to +// resolve, not a choice the library makes for it. It is the convenience for the +// common "this source yields exactly one text payload" path. +// +// As with TextArtifacts, the returned artifact is a shallow copy and shares its +// Metadata map with the input; use content.Artifact.Clone for an independent copy. +func SingleTextArtifact(artifacts []content.Artifact) (content.Artifact, error) { + texts := TextArtifacts(artifacts) + switch len(texts) { + case 0: + return content.Artifact{}, ErrNoTextArtifact + case 1: + return texts[0], nil + default: + return content.Artifact{}, ErrMultipleTextArtifacts + } +} diff --git a/extract/helpers_test.go b/extract/helpers_test.go new file mode 100644 index 0000000..61a2332 --- /dev/null +++ b/extract/helpers_test.go @@ -0,0 +1,68 @@ +package extract_test + +import ( + "errors" + "slices" + "testing" + + "github.com/SnapdragonPartners/maestro-cms/content" + "github.com/SnapdragonPartners/maestro-cms/extract" +) + +func TestRegistrySupports(t *testing.T) { + r := extract.NewRegistry() + r.Register("text/plain", extract.NewTextExtractor()) + + if !r.Supports("text/plain") { + t.Fatal("Supports(text/plain) = false") + } + // Same canonicalization as Extract: casing + parameters are normalized. 
+ if !r.Supports("Text/Plain; charset=utf-8") { + t.Fatal("Supports with casing/params = false") + } + if r.Supports("application/pdf") { + t.Fatal("Supports(application/pdf) = true for empty registry entry") + } +} + +func TestRegistrySupportedMediaTypesSorted(t *testing.T) { + r := extract.NewRegistry() + r.Register("text/plain", extract.NewTextExtractor()) + r.Register("application/json", extract.NewTextExtractor()) + + got := r.SupportedMediaTypes() + want := []content.MediaType{"application/json", "text/plain"} + if !slices.Equal(got, want) { + t.Fatalf("SupportedMediaTypes = %v, want %v (sorted)", got, want) + } +} + +func TestTextArtifacts(t *testing.T) { + in := []content.Artifact{ + {Text: "first"}, + {Text: ""}, // empty inline text: excluded + {Blob: &content.StoreHandle{Backend: "gcs"}}, // blob payload: excluded + {Text: "second", Blob: nil}, + } + got := extract.TextArtifacts(in) + if len(got) != 2 || got[0].Text != "first" || got[1].Text != "second" { + t.Fatalf("TextArtifacts = %+v, want the two non-empty inline-text artifacts", got) + } +} + +func TestSingleTextArtifact(t *testing.T) { + one, err := extract.SingleTextArtifact([]content.Artifact{{Text: "only"}}) + if err != nil || one.Text != "only" { + t.Fatalf("single = (%+v, %v), want the one artifact", one, err) + } + + _, err = extract.SingleTextArtifact([]content.Artifact{{Blob: &content.StoreHandle{}}}) + if !errors.Is(err, extract.ErrNoTextArtifact) { + t.Fatalf("no-text err = %v, want ErrNoTextArtifact", err) + } + + _, err = extract.SingleTextArtifact([]content.Artifact{{Text: "a"}, {Text: "b"}}) + if !errors.Is(err, extract.ErrMultipleTextArtifacts) { + t.Fatalf("multi err = %v, want ErrMultipleTextArtifacts", err) + } +} diff --git a/extract/preset/preset.go b/extract/preset/preset.go new file mode 100644 index 0000000..4b3408e --- /dev/null +++ b/extract/preset/preset.go @@ -0,0 +1,56 @@ +// Package preset wires the common document extractors into an extract.Registry +// in one call. 
It is a convenience bundle, not policy: it only registers +// format extractors (text, HTML, PDF, DOCX, Markdown) and chooses no app +// behavior. +// +// Importing this package pulls in every bundled format's dependencies (e.g. +// golang.org/x/net for HTML, dslipak/pdf for PDF) — that is the trade for the +// one-call convenience. Apps that want a leaner import tree register only the +// formats they use against extract.NewRegistry directly. A depguard rule keeps +// core packages from importing this bundle (see +// docs/adr/0006-optional-adapters-as-subpackages.md). +// +// reg := preset.NewDocumentRegistry(extract.WithMaxBytes(maxUploadBytes)) +package preset + +import ( + "github.com/SnapdragonPartners/maestro-cms/content" + "github.com/SnapdragonPartners/maestro-cms/extract" + "github.com/SnapdragonPartners/maestro-cms/extract/docx" + "github.com/SnapdragonPartners/maestro-cms/extract/html" + "github.com/SnapdragonPartners/maestro-cms/extract/markdown" + "github.com/SnapdragonPartners/maestro-cms/extract/pdf" +) + +// mediaTypeHTML is the media type for HTML sources; the html subpackage exposes +// no constant, so it is named here. +const mediaTypeHTML content.MediaType = "text/html" + +// RegisterDocuments registers the common document extractors on r: plain text, +// HTML, PDF, DOCX, and Markdown (both text/markdown and text/x-markdown). It +// follows extract.Registry.Register's semantics and so panics if a media type is +// already registered on r. +func RegisterDocuments(r *extract.Registry) { + r.Register(extract.MediaTypeText, extract.NewTextExtractor()) + r.Register(mediaTypeHTML, html.New()) + r.Register(pdf.MediaType, pdf.New()) + r.Register(docx.MediaType, docx.New()) + r.Register(markdown.MediaType, markdown.New()) + r.Register(markdown.MediaTypeX, markdown.New()) +} + +// NewDocumentRegistry returns a new extract.Registry with the common document +// extractors registered. Options (e.g. 
extract.WithMaxBytes) are passed through +// to extract.NewRegistry. +func NewDocumentRegistry(opts ...extract.Option) *extract.Registry { + r := extract.NewRegistry(opts...) + RegisterDocuments(r) + return r +} + +// SupportedDocumentMediaTypes returns the canonical media types the bundle +// registers, sorted. It delegates to the registry so the list cannot drift from +// what RegisterDocuments actually wires up. +func SupportedDocumentMediaTypes() []content.MediaType { + return NewDocumentRegistry().SupportedMediaTypes() +} diff --git a/extract/preset/preset_test.go b/extract/preset/preset_test.go new file mode 100644 index 0000000..a75edef --- /dev/null +++ b/extract/preset/preset_test.go @@ -0,0 +1,69 @@ +package preset_test + +import ( + "context" + "slices" + "strings" + "testing" + + "github.com/SnapdragonPartners/maestro-cms/content" + "github.com/SnapdragonPartners/maestro-cms/extract" + "github.com/SnapdragonPartners/maestro-cms/extract/preset" +) + +func TestNewDocumentRegistrySupportsAll(t *testing.T) { + reg := preset.NewDocumentRegistry() + for _, mt := range []content.MediaType{ + "text/plain", "text/html", "application/pdf", + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "text/markdown", "text/x-markdown", + } { + if !reg.Supports(mt) { + t.Errorf("registry does not support %q", mt) + } + } +} + +func TestSupportedDocumentMediaTypesSorted(t *testing.T) { + got := preset.SupportedDocumentMediaTypes() + want := []content.MediaType{ + "application/pdf", + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "text/html", + "text/markdown", + "text/plain", + "text/x-markdown", + } + if !slices.Equal(got, want) { + t.Fatalf("SupportedDocumentMediaTypes = %v, want %v", got, want) + } + // Delegation guarantee: the bundle list matches what a fresh registry reports. 
+ if !slices.Equal(got, preset.NewDocumentRegistry().SupportedMediaTypes()) { + t.Fatal("SupportedDocumentMediaTypes drifted from the registry") + } +} + +func TestNewDocumentRegistryExtracts(t *testing.T) { + reg := preset.NewDocumentRegistry() + // Markdown is wired and preserved verbatim (structure intact). + arts, err := reg.Extract(context.Background(), "text/markdown", strings.NewReader("# Title\n\nbody"), "src-1") + if err != nil { + t.Fatalf("Extract: %v", err) + } + got, err := extract.SingleTextArtifact(arts) + if err != nil { + t.Fatalf("SingleTextArtifact: %v", err) + } + if got.MediaType != "text/markdown" || !strings.Contains(got.Text, "# Title") { + t.Fatalf("unexpected artifact: %+v", got) + } +} + +func TestNewDocumentRegistryPassesOptions(t *testing.T) { + // WithMaxBytes flows through to the registry: an oversize source is rejected. + reg := preset.NewDocumentRegistry(extract.WithMaxBytes(4)) + _, err := reg.Extract(context.Background(), "text/plain", strings.NewReader("way too long"), "src-1") + if err == nil { + t.Fatal("expected ErrSourceTooLarge from WithMaxBytes, got nil") + } +} diff --git a/store/gcs/gcs.go b/store/gcs/gcs.go index 33ef72e..f7038ed 100644 --- a/store/gcs/gcs.go +++ b/store/gcs/gcs.go @@ -60,6 +60,19 @@ func New(ctx context.Context, bucket string, opts ...option.ClientOption) (*Stor return &Store{client: client, bucket: bucket, ownsClient: true}, nil } +// NewEmulator constructs a Store over bucket pointed at a GCS-compatible +// emulator endpoint (e.g. a fsouza/fake-gcs-server) with authentication +// disabled. It is a dev/test convenience equivalent to +// New(ctx, bucket, option.WithEndpoint(endpoint), option.WithoutAuthentication()); +// do not use it against real GCS. endpoint must be non-empty. (Alternatively, +// set STORAGE_EMULATOR_HOST and use New, which the SDK honors automatically.) 
+func NewEmulator(ctx context.Context, bucket, endpoint string) (*Store, error) { + if endpoint == "" { + return nil, errors.New("gcs: emulator endpoint must not be empty") + } + return New(ctx, bucket, option.WithEndpoint(endpoint), option.WithoutAuthentication()) +} + // NewWithClient wraps an existing *storage.Client as a Store over bucket — the // seam for callers that build and share their own client (one client across // several buckets, or a client pointed at an emulator in tests). diff --git a/store/gcs/gcs_integration_test.go b/store/gcs/gcs_integration_test.go index de9fff0..98acb9b 100644 --- a/store/gcs/gcs_integration_test.go +++ b/store/gcs/gcs_integration_test.go @@ -12,6 +12,7 @@ import ( "errors" "io" "os" + "strings" "testing" "cloud.google.com/go/storage" @@ -174,3 +175,31 @@ func TestGCSExistsMissingIsFalse(t *testing.T) { t.Fatalf("Exists missing = (%v, %v), want (false, nil)", ok, err) } } + +func TestGCSNewEmulatorRoundTrip(t *testing.T) { + _ = newStore(t) // skips if no emulator, and ensures the bucket exists + ctx := context.Background() + st, err := gcs.NewEmulator(ctx, testBucket, os.Getenv("STORAGE_EMULATOR_HOST")) + if err != nil { + t.Fatalf("NewEmulator: %v", err) + } + t.Cleanup(func() { _ = st.Close() }) // New-constructed: owns and closes its client + + const key = "emulator/obj.bin" + if err := st.Put(ctx, key, strings.NewReader("via emulator")); err != nil { + t.Fatalf("Put: %v", err) + } + rc, err := st.Get(ctx, key) + if err != nil { + t.Fatalf("Get: %v", err) + } + got, err := io.ReadAll(rc) + _ = rc.Close() + if err != nil { + t.Fatalf("read: %v", err) + } + if string(got) != "via emulator" { + t.Fatalf("Get = %q, want %q", got, "via emulator") + } + _ = st.Delete(ctx, key) +} diff --git a/store/gcs/gcs_test.go b/store/gcs/gcs_test.go new file mode 100644 index 0000000..080e297 --- /dev/null +++ b/store/gcs/gcs_test.go @@ -0,0 +1,36 @@ +package gcs_test + +import ( + "context" + "testing" + + 
"github.com/SnapdragonPartners/maestro-cms/store/gcs" +) + +func mustPanic(t *testing.T, name string, fn func()) { + t.Helper() + defer func() { + if recover() == nil { + t.Fatalf("%s: expected panic, got none", name) + } + }() + fn() +} + +func TestNewWithClientPanics(t *testing.T) { + // Bucket is checked before client, so an empty bucket panics even with a nil + // client; a nil client panics with a valid bucket. + mustPanic(t, "empty bucket", func() { gcs.NewWithClient("", nil) }) + mustPanic(t, "nil client", func() { gcs.NewWithClient("bucket", nil) }) +} + +func TestNewEmulatorValidates(t *testing.T) { + ctx := context.Background() + if _, err := gcs.NewEmulator(ctx, "bucket", ""); err == nil { + t.Fatal("NewEmulator with empty endpoint = nil error, want error") + } + // Empty bucket is rejected by New before any client/network work. + if _, err := gcs.NewEmulator(ctx, "", "http://localhost:4443"); err == nil { + t.Fatal("NewEmulator with empty bucket = nil error, want error") + } +} diff --git a/store/helpers.go b/store/helpers.go new file mode 100644 index 0000000..24f6533 --- /dev/null +++ b/store/helpers.go @@ -0,0 +1,24 @@ +package store + +import ( + "context" + "errors" +) + +// IsNotFound reports whether err indicates a missing object, i.e. it matches +// ErrObjectNotFound. It is sugar over errors.Is for the common check. +func IsNotFound(err error) bool { + return errors.Is(err, ErrObjectNotFound) +} + +// DeleteIfExists deletes key and treats a missing object as success. ObjectStore +// Delete is deliberately strict (it returns ErrObjectNotFound for an absent +// key); this helper is the idempotent variant for cleanup and rollback paths +// where "already gone" is the desired outcome. Any other error is returned +// unchanged. 
+func DeleteIfExists(ctx context.Context, s ObjectStore, key string) error { + if err := s.Delete(ctx, key); err != nil && !IsNotFound(err) { + return err //nolint:wrapcheck // pass-through of the store's own already-contextual Delete error + } + return nil +} diff --git a/store/helpers_test.go b/store/helpers_test.go new file mode 100644 index 0000000..ef17c42 --- /dev/null +++ b/store/helpers_test.go @@ -0,0 +1,66 @@ +package store_test + +import ( + "context" + "errors" + "strings" + "testing" + + "github.com/SnapdragonPartners/maestro-cms/store" + "github.com/SnapdragonPartners/maestro-cms/testcms" +) + +func TestIsNotFound(t *testing.T) { + if !store.IsNotFound(store.ErrObjectNotFound) { + t.Fatal("IsNotFound(ErrObjectNotFound) = false") + } + if !store.IsNotFound(errwrap(store.ErrObjectNotFound)) { + t.Fatal("IsNotFound(wrapped) = false") + } + if store.IsNotFound(nil) { + t.Fatal("IsNotFound(nil) = true") + } + if store.IsNotFound(errors.New("other")) { + t.Fatal("IsNotFound(other) = true") + } +} + +func errwrap(err error) error { return errors.Join(errors.New("ctx"), err) } + +func TestDeleteIfExistsMissingIsNil(t *testing.T) { + ms := testcms.NewMemoryStore() + if err := store.DeleteIfExists(context.Background(), ms, "absent"); err != nil { + t.Fatalf("DeleteIfExists(absent) = %v, want nil", err) + } +} + +func TestDeleteIfExistsPresentDeletes(t *testing.T) { + ms := testcms.NewMemoryStore() + ctx := context.Background() + if err := ms.Put(ctx, "k", strings.NewReader("v")); err != nil { + t.Fatalf("Put: %v", err) + } + if err := store.DeleteIfExists(ctx, ms, "k"); err != nil { + t.Fatalf("DeleteIfExists: %v", err) + } + if ok, _ := ms.Exists(ctx, "k"); ok { + t.Fatal("object still present after DeleteIfExists") + } +} + +// failStore returns a non-not-found error from Delete, to confirm DeleteIfExists +// passes real errors through. 
+type failStore struct { + store.ObjectStore + err error +} + +func (f failStore) Delete(context.Context, string) error { return f.err } + +func TestDeleteIfExistsPassesOtherErrors(t *testing.T) { + boom := errors.New("transport failure") + err := store.DeleteIfExists(context.Background(), failStore{err: boom}, "k") + if !errors.Is(err, boom) { + t.Fatalf("DeleteIfExists = %v, want %v", err, boom) + } +}