SnapdragonPartners · dratner · Jun 8, 2026 · Jun 8, 2026 · Jun 8, 2026
diff --git a/.golangci.yaml b/.golangci.yaml
@@ -252,6 +252,8 @@ linters-settings:
             desc: "core must not import format-adapter subpackages (ADR 0006); applications wire adapters in"
           - pkg: "github.com/SnapdragonPartners/maestro-cms/extract/markdown"
             desc: "core must not import format-adapter subpackages (ADR 0006); applications wire adapters in"
+          - pkg: "github.com/SnapdragonPartners/maestro-cms/extract/preset"
+            desc: "core must not import the format-bundle subpackage (ADR 0006); applications wire it in"
           - pkg: "github.com/SnapdragonPartners/maestro-cms/store/gcs"
             desc: "core must not import storage-adapter subpackages (ADR 0006); applications wire adapters in"
           - pkg: "github.com/SnapdragonPartners/maestro-cms/index"

diff --git a/content/hash.go b/content/hash.go
@@ -0,0 +1,61 @@
+package content
+
+import (
+	"context"
+	"crypto/sha256"
+	"encoding/hex"
+	"fmt"
+	"io"
+)
+
+// SHA-256 hex helpers. These are content-addressing primitives only: they
+// compute a stable, lowercase-hex SHA-256 digest of some bytes. They carry no
+// opinion about what the digest identifies — raw source identity (Source.Hash),
+// an extracted artifact's identity, a chunk's identity, or a cache key are all
+// application decisions. The library never computes or mutates Source.Hash on
+// the app's behalf; these helpers just remove the boilerplate when the app does.
+
+// SHA256HexBytes hashes b with SHA-256 and returns the digest as lowercase
+// hexadecimal text. A nil or empty b yields the digest of the empty input.
+func SHA256HexBytes(b []byte) string {
+	digest := sha256.Sum256(b)
+	encoded := make([]byte, hex.EncodedLen(len(digest)))
+	hex.Encode(encoded, digest[:])
+	return string(encoded)
+}
+
+// SHA256HexString is the string counterpart of SHA256HexBytes: it hashes the
+// bytes of s and returns the lowercase-hex SHA-256 digest.
+func SHA256HexString(s string) string {
+	raw := []byte(s)
+	return SHA256HexBytes(raw)
+}
+
+// SHA256HexReader hashes everything read from r with SHA-256 in a streaming
+// fashion (via io.Copy, so the input is never held in memory all at once) and
+// returns the lowercase-hex digest together with the number of bytes consumed.
+//
+// ctx is checked once up front and again before every Read; a Read that is
+// already blocked is not interrupted. On failure the digest is empty, n reports
+// the bytes hashed before the failure, and the error wraps the cause (so
+// errors.Is against context.Canceled works). Input size is unbounded; wrap r
+// (e.g. with io.LimitReader) to cap it.
+func SHA256HexReader(ctx context.Context, r io.Reader) (hexDigest string, n int64, err error) {
+	if err = ctx.Err(); err != nil {
+		return "", 0, fmt.Errorf("content: hash aborted: %w", err)
+	}
+	hasher := sha256.New()
+	src := &ctxReader{ctx: ctx, r: r}
+	if n, err = io.Copy(hasher, src); err != nil {
+		return "", n, fmt.Errorf("content: hash read: %w", err)
+	}
+	return hex.EncodeToString(hasher.Sum(nil)), n, nil
+}
+
+// ctxReader adapts an io.Reader so that reading stops once ctx is done. The
+// check is cooperative: it happens at the start of each Read, which is the usual
+// way to bridge a context into the context-free io.Reader interface.
+//
+//nolint:containedctx // intentional ctx bridge for io.Reader, which has no ctx param
+type ctxReader struct {
+	ctx context.Context
+	r   io.Reader
+}
+
+// Read returns ctx's error, without touching the underlying reader, when ctx is
+// already done; otherwise it forwards the call and its result unchanged.
+func (cr *ctxReader) Read(p []byte) (int, error) {
+	select {
+	case <-cr.ctx.Done():
+		return 0, cr.ctx.Err() //nolint:wrapcheck // sentinel ctx error; SHA256HexReader adds context
+	default:
+		return cr.r.Read(p) //nolint:wrapcheck // pass-through of underlying reader, incl io.EOF
+	}
+}
diff --git a/content/hash_test.go b/content/hash_test.go
@@ -0,0 +1,65 @@
+package content_test
+
+import (
+	"context"
+	"errors"
+	"strings"
+	"testing"
+
+	"github.com/SnapdragonPartners/maestro-cms/content"
+)
+
+// Known SHA-256 vectors, written as lowercase hex. They are the expected values
+// for the tests in this file; the trailing comment on each names the input.
+const (
+	hashABC   = "ba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad" // "abc"
+	hashEmpty = "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855" // ""
+)
+
+// TestSHA256HexBytesAndString checks both in-memory helpers against the known
+// vectors, including a nil slice, which must hash like empty input.
+func TestSHA256HexBytesAndString(t *testing.T) {
+	fromBytes := content.SHA256HexBytes([]byte("abc"))
+	if fromBytes != hashABC {
+		t.Fatalf("SHA256HexBytes = %q, want %q", fromBytes, hashABC)
+	}
+
+	fromString := content.SHA256HexString("abc")
+	if fromString != hashABC {
+		t.Fatalf("SHA256HexString = %q, want %q", fromString, hashABC)
+	}
+
+	fromNil := content.SHA256HexBytes(nil)
+	if fromNil != hashEmpty {
+		t.Fatalf("SHA256HexBytes(nil) = %q, want empty-hash %q", fromNil, hashEmpty)
+	}
+}
+
+// TestSHA256HexReader streams a known vector and checks, in order, that no
+// error occurred, that three bytes were consumed, and that the digest matches.
+func TestSHA256HexReader(t *testing.T) {
+	src := strings.NewReader("abc")
+	digest, count, err := content.SHA256HexReader(context.Background(), src)
+	switch {
+	case err != nil:
+		t.Fatalf("err: %v", err)
+	case count != 3:
+		t.Fatalf("n = %d, want 3", count)
+	case digest != hashABC:
+		t.Fatalf("hash = %q, want %q", digest, hashABC)
+	}
+}
+
+// TestSHA256HexReaderAgreesWithBytes verifies that streaming a multi-line input
+// produces the same digest as hashing it in memory, and that the reported byte
+// count equals the input length.
+func TestSHA256HexReaderAgreesWithBytes(t *testing.T) {
+	const s = "the quick brown fox\n\nmultiple paragraphs"
+	inMemory := content.SHA256HexString(s)
+	streamed, count, err := content.SHA256HexReader(context.Background(), strings.NewReader(s))
+	switch {
+	case err != nil:
+		t.Fatalf("err: %v", err)
+	case streamed != inMemory:
+		t.Fatalf("reader hash %q != string hash %q", streamed, inMemory)
+	case count != int64(len(s)):
+		t.Fatalf("n = %d, want %d", count, len(s))
+	}
+}
+
+// cancelAfterFirstRead hands out one byte per Read and cancels its context as
+// soon as the first Read has been served, so the next Read attempted through
+// SHA256HexReader sees a context that is already done.
+type cancelAfterFirstRead struct {
+	src    *strings.Reader
+	cancel context.CancelFunc
+}
+
+func (c *cancelAfterFirstRead) Read(p []byte) (int, error) {
+	n, err := c.src.Read(p[:1])
+	c.cancel()
+	return n, err
+}
+
+// TestSHA256HexReaderCanceled covers both cancellation paths: a context that is
+// done before hashing starts, and one that is canceled while the stream is being
+// read. Without the second case the per-Read context check is never exercised,
+// because a pre-canceled context is rejected before any Read happens.
+func TestSHA256HexReaderCanceled(t *testing.T) {
+	t.Run("before first read", func(t *testing.T) {
+		ctx, cancel := context.WithCancel(context.Background())
+		cancel()
+		_, _, err := content.SHA256HexReader(ctx, strings.NewReader("abc"))
+		if !errors.Is(err, context.Canceled) {
+			t.Fatalf("err = %v, want context.Canceled", err)
+		}
+	})
+
+	t.Run("mid-stream", func(t *testing.T) {
+		ctx, cancel := context.WithCancel(context.Background())
+		defer cancel()
+		r := &cancelAfterFirstRead{src: strings.NewReader("abc"), cancel: cancel}
+		digest, n, err := content.SHA256HexReader(ctx, r)
+		if !errors.Is(err, context.Canceled) {
+			t.Fatalf("err = %v, want context.Canceled", err)
+		}
+		// Exactly one byte was served before the cancellation was observed.
+		if digest != "" || n != 1 {
+			t.Fatalf("digest = %q, n = %d; want empty digest and n = 1", digest, n)
+		}
+	})
+}
diff --git a/extract/helpers.go b/extract/helpers.go
@@ -0,0 +1,76 @@
+package extract
+
+import (
+	"errors"
+	"slices"
+
+	"github.com/SnapdragonPartners/maestro-cms/content"
+)
+
+// Sentinel errors returned by SingleTextArtifact. Compare with errors.Is.
+var (
+	// ErrNoTextArtifact is returned when the input contains no inline-text
+	// artifact (none with a nil Blob and non-empty Text).
+	ErrNoTextArtifact = errors.New("extract: no text artifact")
+	// ErrMultipleTextArtifacts is returned when the input contains more than one
+	// inline-text artifact; choosing among them is an application policy
+	// decision, not the library's.
+	ErrMultipleTextArtifacts = errors.New("extract: multiple text artifacts")
+)
+
+// Supports reports whether the registry has an extractor for mediaType. The
+// lookup goes through Get, so any normalization Get applies to the media type
+// (casing, parameters) applies here as well.
+func (r *Registry) Supports(mediaType content.MediaType) bool {
+	_, registered := r.Get(mediaType)
+	return registered
+}
+
+// SupportedMediaTypes lists every media type that has an extractor in this
+// registry, in ascending order so the result is deterministic despite map
+// iteration order. The values are the registry's own map keys. The result is
+// never nil; an empty registry yields an empty slice.
+func (r *Registry) SupportedMediaTypes() []content.MediaType {
+	// NOTE(review): reads r.extractors directly rather than via Get; if the
+	// registry guards the map with a lock elsewhere, confirm one is not needed here.
+	types := make([]content.MediaType, 0, len(r.extractors))
+	for key := range r.extractors {
+		types = append(types, key)
+	}
+	slices.Sort(types)
+	return types
+}
+
+// TextArtifacts filters artifacts down to those carrying an inline text payload:
+// a nil Blob handle together with non-empty Text. Input order is preserved. The
+// check looks only at payload shape and never at MediaType, so any inline text
+// (OCR output, transcripts, captions, ...) qualifies; callers that care about
+// media type filter the result further.
+//
+// Each returned element is a plain struct copy of its input, so reference-typed
+// fields such as the Metadata map are shared with the input rather than cloned.
+// Use content.Artifact.Clone for an independent copy. The result is never nil.
+func TextArtifacts(artifacts []content.Artifact) []content.Artifact {
+	inline := make([]content.Artifact, 0, len(artifacts))
+	for idx := range artifacts {
+		if artifacts[idx].Blob != nil || artifacts[idx].Text == "" {
+			continue
+		}
+		inline = append(inline, artifacts[idx])
+	}
+	return inline
+}
+
+// SingleTextArtifact is the shortcut for sources expected to yield exactly one
+// inline-text payload. It applies the TextArtifacts filter and then:
+//
+//   - returns ErrNoTextArtifact when nothing passes the filter;
+//   - returns ErrMultipleTextArtifacts when more than one artifact passes,
+//     because choosing among them is the application's policy, not the library's;
+//   - otherwise returns the single match.
+//
+// On error the returned artifact is the zero value. On success it is a shallow
+// copy that shares its Metadata map with the input; use content.Artifact.Clone
+// for an independent copy.
+func SingleTextArtifact(artifacts []content.Artifact) (content.Artifact, error) {
+	candidates := TextArtifacts(artifacts)
+	if len(candidates) == 0 {
+		return content.Artifact{}, ErrNoTextArtifact
+	}
+	if len(candidates) > 1 {
+		return content.Artifact{}, ErrMultipleTextArtifacts
+	}
+	return candidates[0], nil
+}
diff --git a/extract/helpers_test.go b/extract/helpers_test.go
@@ -0,0 +1,68 @@
+package extract_test
+
+import (
+	"errors"
+	"slices"
+	"testing"
+
+	"github.com/SnapdragonPartners/maestro-cms/content"
+	"github.com/SnapdragonPartners/maestro-cms/extract"
+)
+
+// TestRegistrySupports checks that Supports reports a registered media type,
+// tolerates differences in casing and parameters, and rejects a media type that
+// was never registered.
+func TestRegistrySupports(t *testing.T) {
+	r := extract.NewRegistry()
+	r.Register("text/plain", extract.NewTextExtractor())
+
+	if !r.Supports("text/plain") {
+		t.Fatal("Supports(text/plain) = false")
+	}
+	// Casing and parameters must not affect the lookup.
+	if !r.Supports("Text/Plain; charset=utf-8") {
+		t.Fatal("Supports with casing/params = false")
+	}
+	// Only text/plain is registered, so application/pdf must not be reported.
+	if r.Supports("application/pdf") {
+		t.Fatal("Supports(application/pdf) = true for an unregistered media type")
+	}
+}
+
+// TestRegistrySupportedMediaTypesSorted registers two media types out of order
+// and expects SupportedMediaTypes to return them sorted.
+func TestRegistrySupportedMediaTypesSorted(t *testing.T) {
+	reg := extract.NewRegistry()
+	for _, mt := range []content.MediaType{"text/plain", "application/json"} {
+		reg.Register(mt, extract.NewTextExtractor())
+	}
+
+	want := []content.MediaType{"application/json", "text/plain"}
+	if got := reg.SupportedMediaTypes(); !slices.Equal(got, want) {
+		t.Fatalf("SupportedMediaTypes = %v, want %v (sorted)", got, want)
+	}
+}
+
+// TestTextArtifacts feeds a mix of payload shapes through TextArtifacts and
+// expects only the two non-empty inline-text artifacts back, in input order.
+func TestTextArtifacts(t *testing.T) {
+	emptyText := content.Artifact{}                                             // empty inline text: excluded
+	blobOnly := content.Artifact{Blob: &content.StoreHandle{Backend: "gcs"}} // blob payload: excluded
+	in := []content.Artifact{{Text: "first"}, emptyText, blobOnly, {Text: "second"}}
+
+	got := extract.TextArtifacts(in)
+	texts := make([]string, 0, len(got))
+	for i := range got {
+		texts = append(texts, got[i].Text)
+	}
+	if !slices.Equal(texts, []string{"first", "second"}) {
+		t.Fatalf("TextArtifacts = %+v, want the two non-empty inline-text artifacts", got)
+	}
+}
+
+// TestSingleTextArtifact walks the three outcomes of SingleTextArtifact: exactly
+// one inline-text artifact, none, and more than one.
+func TestSingleTextArtifact(t *testing.T) {
+	exactlyOne := []content.Artifact{{Text: "only"}}
+	one, err := extract.SingleTextArtifact(exactlyOne)
+	if err != nil || one.Text != "only" {
+		t.Fatalf("single = (%+v, %v), want the one artifact", one, err)
+	}
+
+	blobOnly := []content.Artifact{{Blob: &content.StoreHandle{}}}
+	if _, err = extract.SingleTextArtifact(blobOnly); !errors.Is(err, extract.ErrNoTextArtifact) {
+		t.Fatalf("no-text err = %v, want ErrNoTextArtifact", err)
+	}
+
+	twoTexts := []content.Artifact{{Text: "a"}, {Text: "b"}}
+	if _, err = extract.SingleTextArtifact(twoTexts); !errors.Is(err, extract.ErrMultipleTextArtifacts) {
+		t.Fatalf("multi err = %v, want ErrMultipleTextArtifacts", err)
+	}
+}
diff --git a/extract/preset/preset.go b/extract/preset/preset.go
@@ -0,0 +1,56 @@
+// Package preset wires the common document extractors into an extract.Registry
+// in one call. It is a convenience bundle, not policy: it only registers
+// format extractors (text, HTML, PDF, DOCX, Markdown) and chooses no app
+// behavior.
+//
+// Importing this package pulls in every bundled format's dependencies (e.g.
+// golang.org/x/net for HTML, dslipak/pdf for PDF) — that is the trade for the
+// one-call convenience. Apps that want a leaner import tree register only the
+// formats they use against extract.NewRegistry directly. A depguard rule keeps
+// core packages from importing this bundle (see
+// docs/adr/0006-optional-adapters-as-subpackages.md).
+//
+//	reg := preset.NewDocumentRegistry(extract.WithMaxBytes(maxUploadBytes))
+package preset
+
+import (
+	"github.com/SnapdragonPartners/maestro-cms/content"
+	"github.com/SnapdragonPartners/maestro-cms/extract"
+	"github.com/SnapdragonPartners/maestro-cms/extract/docx"
+	"github.com/SnapdragonPartners/maestro-cms/extract/html"
+	"github.com/SnapdragonPartners/maestro-cms/extract/markdown"
+	"github.com/SnapdragonPartners/maestro-cms/extract/pdf"
+)
+
+// mediaTypeHTML is the media type under which RegisterDocuments registers the
+// HTML extractor. The html subpackage exposes no constant for it, so it is
+// named here.
+const mediaTypeHTML content.MediaType = "text/html"
+
+// RegisterDocuments adds the bundled document extractors to r, in this order:
+// plain text, HTML, PDF, DOCX, then Markdown under both of its media types
+// (markdown.MediaType and markdown.MediaTypeX), each with its own extractor
+// instance. Registration goes through extract.Registry.Register and therefore
+// inherits its semantics, including panicking when a media type is already
+// registered on r.
+func RegisterDocuments(r *extract.Registry) {
+	register := r.Register
+
+	register(extract.MediaTypeText, extract.NewTextExtractor())
+	register(mediaTypeHTML, html.New())
+	register(pdf.MediaType, pdf.New())
+	register(docx.MediaType, docx.New())
+
+	// Markdown is reachable under two media types; each gets a fresh extractor.
+	register(markdown.MediaType, markdown.New())
+	register(markdown.MediaTypeX, markdown.New())
+}
+
+// NewDocumentRegistry builds an extract.Registry via extract.NewRegistry,
+// forwarding opts unchanged (e.g. extract.WithMaxBytes), and then populates it
+// with the bundled document extractors through RegisterDocuments.
+func NewDocumentRegistry(opts ...extract.Option) *extract.Registry {
+	reg := extract.NewRegistry(opts...)
+	RegisterDocuments(reg)
+	return reg
+}
+
+// SupportedDocumentMediaTypes reports, sorted, the media types that the bundle
+// registers. It builds a throwaway registry with NewDocumentRegistry on every
+// call and asks it for its media types, so the answer always reflects what
+// RegisterDocuments actually wires up and cannot drift from it.
+func SupportedDocumentMediaTypes() []content.MediaType {
+	bundled := NewDocumentRegistry()
+	return bundled.SupportedMediaTypes()
+}