Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .golangci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,8 @@ linters-settings:
desc: "core must not import format-adapter subpackages (ADR 0006); applications wire adapters in"
- pkg: "github.com/SnapdragonPartners/maestro-cms/extract/markdown"
desc: "core must not import format-adapter subpackages (ADR 0006); applications wire adapters in"
- pkg: "github.com/SnapdragonPartners/maestro-cms/extract/preset"
desc: "core must not import the format-bundle subpackage (ADR 0006); applications wire it in"
- pkg: "github.com/SnapdragonPartners/maestro-cms/store/gcs"
desc: "core must not import storage-adapter subpackages (ADR 0006); applications wire adapters in"
- pkg: "github.com/SnapdragonPartners/maestro-cms/index"
Expand Down
61 changes: 61 additions & 0 deletions content/hash.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
package content

import (
"context"
"crypto/sha256"
"encoding/hex"
"fmt"
"io"
)

// SHA-256 hex helpers. These are content-addressing primitives only: they
// compute a stable, lowercase-hex SHA-256 digest of some bytes. They carry no
// opinion about what the digest identifies — raw source identity (Source.Hash),
// an extracted artifact's identity, a chunk's identity, or a cache key are all
// application decisions. The library never computes or mutates Source.Hash on
// the app's behalf; these helpers just remove the boilerplate when the app does.

// SHA256HexBytes hashes b with SHA-256 and renders the digest as a lowercase
// hexadecimal string. A nil slice is hashed as empty input.
func SHA256HexBytes(b []byte) string {
	digest := sha256.Sum256(b)
	// Encode into a buffer sized for the hex form of the digest.
	encoded := make([]byte, hex.EncodedLen(len(digest)))
	hex.Encode(encoded, digest[:])
	return string(encoded)
}

// SHA256HexString hashes the bytes of s with SHA-256 and returns the digest as
// a lowercase hexadecimal string. It is the string-input counterpart of
// SHA256HexBytes and yields the same digest for the same bytes.
func SHA256HexString(s string) string {
	raw := []byte(s)
	return SHA256HexBytes(raw)
}

// SHA256HexReader hashes everything read from r with SHA-256, streaming the
// data instead of holding it all in memory. It returns the lowercase-hex digest
// together with n, the number of bytes read.
//
// Cancellation of ctx is observed before the first read and again before every
// subsequent read; a Read call that is already blocked is not interrupted. No
// limit is placed on the input size, so a caller that needs a bound should wrap
// r first (e.g. with io.LimitReader).
//
// On failure the digest is empty, n reports the bytes read so far, and the
// returned error wraps the underlying cause.
func SHA256HexReader(ctx context.Context, r io.Reader) (hexDigest string, n int64, err error) {
	// Fail fast when the context is already done; nothing has been read yet.
	if abortErr := ctx.Err(); abortErr != nil {
		return "", 0, fmt.Errorf("content: hash aborted: %w", abortErr)
	}

	digest := sha256.New()
	source := &ctxReader{ctx: ctx, r: r}
	copied, copyErr := io.Copy(digest, source)
	if copyErr != nil {
		return "", copied, fmt.Errorf("content: hash read: %w", copyErr)
	}
	return hex.EncodeToString(digest.Sum(nil)), copied, nil
}

// ctxReader adapts an io.Reader so that a streaming read can be cancelled via a
// context. io.Reader has no context parameter, so the context is carried in the
// struct and consulted at the start of every Read. Cancellation is therefore
// cooperative: it takes effect between reads, not during one.
//
//nolint:containedctx // intentional ctx bridge for io.Reader, which has no ctx param
type ctxReader struct {
	ctx context.Context
	r   io.Reader
}

// Read returns the context's error without touching the wrapped reader once the
// context is done; otherwise it forwards the call unchanged, io.EOF included.
func (c *ctxReader) Read(p []byte) (int, error) {
	ctxErr := c.ctx.Err()
	if ctxErr == nil {
		return c.r.Read(p) //nolint:wrapcheck // pass-through of underlying reader, incl io.EOF
	}
	return 0, ctxErr //nolint:wrapcheck // sentinel ctx error; SHA256HexReader adds context
}
65 changes: 65 additions & 0 deletions content/hash_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
package content_test

import (
"context"
"errors"
"strings"
"testing"

"github.com/SnapdragonPartners/maestro-cms/content"
)

// Known-answer SHA-256 digests in lowercase hex, used as the expected values
// by the tests in this file.
const (
hashABC = "ba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad" // digest of the three bytes "abc"
hashEmpty = "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855" // digest of empty input ("")
)

// TestSHA256HexBytesAndString checks the byte and string helpers against the
// known "abc" vector and confirms that a nil slice hashes as empty input.
func TestSHA256HexBytesAndString(t *testing.T) {
	fromBytes := content.SHA256HexBytes([]byte("abc"))
	if fromBytes != hashABC {
		t.Fatalf("SHA256HexBytes = %q, want %q", fromBytes, hashABC)
	}

	fromString := content.SHA256HexString("abc")
	if fromString != hashABC {
		t.Fatalf("SHA256HexString = %q, want %q", fromString, hashABC)
	}

	fromNil := content.SHA256HexBytes(nil)
	if fromNil != hashEmpty {
		t.Fatalf("SHA256HexBytes(nil) = %q, want empty-hash %q", fromNil, hashEmpty)
	}
}

// TestSHA256HexReader hashes "abc" through the streaming helper and checks the
// error, the byte count, and the digest, in that order.
func TestSHA256HexReader(t *testing.T) {
	digest, count, err := content.SHA256HexReader(context.Background(), strings.NewReader("abc"))
	switch {
	case err != nil:
		t.Fatalf("err: %v", err)
	case count != 3:
		t.Fatalf("n = %d, want 3", count)
	case digest != hashABC:
		t.Fatalf("hash = %q, want %q", digest, hashABC)
	}
}

// TestSHA256HexReaderAgreesWithBytes verifies that the streaming helper and the
// in-memory string helper produce the same digest for the same input, and that
// the reported byte count equals the input length.
func TestSHA256HexReaderAgreesWithBytes(t *testing.T) {
	const s = "the quick brown fox\n\nmultiple paragraphs"
	expected := content.SHA256HexString(s)

	streamed, count, err := content.SHA256HexReader(context.Background(), strings.NewReader(s))
	switch {
	case err != nil:
		t.Fatalf("err: %v", err)
	case streamed != expected:
		t.Fatalf("reader hash %q != string hash %q", streamed, expected)
	case count != int64(len(s)):
		t.Fatalf("n = %d, want %d", count, len(s))
	}
}

// TestSHA256HexReaderCanceled confirms that hashing with an already-canceled
// context fails with an error that matches context.Canceled.
func TestSHA256HexReaderCanceled(t *testing.T) {
	canceledCtx, cancel := context.WithCancel(context.Background())
	cancel() // cancel up front so the helper sees a done context

	_, _, hashErr := content.SHA256HexReader(canceledCtx, strings.NewReader("abc"))
	if matched := errors.Is(hashErr, context.Canceled); !matched {
		t.Fatalf("err = %v, want context.Canceled", hashErr)
	}
}
76 changes: 76 additions & 0 deletions extract/helpers.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
package extract

import (
"errors"
"slices"

"github.com/SnapdragonPartners/maestro-cms/content"
)

// Sentinel errors returned by SingleTextArtifact. They are returned unwrapped,
// so callers can match them with errors.Is.
var (
// ErrNoTextArtifact indicates no text artifact was present.
ErrNoTextArtifact = errors.New("extract: no text artifact")
// ErrMultipleTextArtifacts indicates more than one text artifact was present,
// so picking one is an application policy decision, not the library's.
ErrMultipleTextArtifacts = errors.New("extract: multiple text artifacts")
)

// Supports reports whether r has an extractor registered for mediaType. The
// lookup goes through Get, so it uses the same canonicalization as Extract
// (lowercased type, parameters dropped): "text/plain", "Text/Plain", and
// "text/plain; charset=utf-8" are all treated alike.
func (r *Registry) Supports(mediaType content.MediaType) bool {
	_, registered := r.Get(mediaType)
	return registered
}

// SupportedMediaTypes lists the media types this registry can extract, in
// sorted order so the result is deterministic. The values are the canonical
// forms stored by Register, so each one round-trips through Get and Supports.
// The result is a new, non-nil slice even when nothing is registered.
func (r *Registry) SupportedMediaTypes() []content.MediaType {
	types := make([]content.MediaType, len(r.extractors))
	next := 0
	for mediaType := range r.extractors {
		types[next] = mediaType
		next++
	}
	// Map iteration order is random; sort for a stable result.
	slices.Sort(types)
	return types
}

// TextArtifacts filters artifacts down to those carrying an inline text
// payload: a nil Blob handle and a non-empty Text. Input order is preserved.
//
// The check is purely structural. MediaType is never consulted, so inline OCR,
// transcript, or caption text qualifies as well; callers that care about media
// type must filter the result further.
//
// Each returned artifact is a shallow copy and, as with content.Artifact in
// general, shares its Metadata map with the corresponding input element. Use
// content.Artifact.Clone when an independent copy is needed.
func TextArtifacts(artifacts []content.Artifact) []content.Artifact {
	texts := make([]content.Artifact, 0, len(artifacts))
	for idx := range artifacts {
		// Skip blob-backed payloads and artifacts with no inline text.
		if artifacts[idx].Blob != nil || artifacts[idx].Text == "" {
			continue
		}
		texts = append(texts, artifacts[idx])
	}
	return texts
}

// SingleTextArtifact returns the sole inline-text artifact found in artifacts,
// covering the common case where a source yields exactly one text payload.
//
// It returns ErrNoTextArtifact when there is no inline-text artifact, and
// ErrMultipleTextArtifacts when there are several: choosing among them is
// application policy, which the library deliberately does not decide. In both
// error cases the returned artifact is the zero value.
//
// Like TextArtifacts, the result is a shallow copy that shares its Metadata map
// with the input; use content.Artifact.Clone for an independent copy.
func SingleTextArtifact(artifacts []content.Artifact) (content.Artifact, error) {
	candidates := TextArtifacts(artifacts)
	if len(candidates) == 0 {
		return content.Artifact{}, ErrNoTextArtifact
	}
	if len(candidates) > 1 {
		return content.Artifact{}, ErrMultipleTextArtifacts
	}
	return candidates[0], nil
}
68 changes: 68 additions & 0 deletions extract/helpers_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
package extract_test

import (
"errors"
"slices"
"testing"

"github.com/SnapdragonPartners/maestro-cms/content"
"github.com/SnapdragonPartners/maestro-cms/extract"
)

// TestRegistrySupports checks Supports for an exact match, for a spelling that
// differs only in casing and parameters, and for an unregistered media type.
func TestRegistrySupports(t *testing.T) {
	reg := extract.NewRegistry()
	reg.Register("text/plain", extract.NewTextExtractor())

	if exact := reg.Supports("text/plain"); !exact {
		t.Fatal("Supports(text/plain) = false")
	}

	// Same canonicalization as Extract: casing + parameters are normalized.
	if normalized := reg.Supports("Text/Plain; charset=utf-8"); !normalized {
		t.Fatal("Supports with casing/params = false")
	}

	if unknown := reg.Supports("application/pdf"); unknown {
		t.Fatal("Supports(application/pdf) = true for empty registry entry")
	}
}

// TestRegistrySupportedMediaTypesSorted registers two media types out of
// alphabetical order and expects SupportedMediaTypes to return them sorted.
func TestRegistrySupportedMediaTypesSorted(t *testing.T) {
	reg := extract.NewRegistry()
	reg.Register("text/plain", extract.NewTextExtractor())
	reg.Register("application/json", extract.NewTextExtractor())

	expected := []content.MediaType{"application/json", "text/plain"}
	if actual := reg.SupportedMediaTypes(); !slices.Equal(actual, expected) {
		t.Fatalf("SupportedMediaTypes = %v, want %v (sorted)", actual, expected)
	}
}

// TestTextArtifacts feeds a mix of inline-text, empty, and blob-backed
// artifacts through TextArtifacts and expects only the two non-empty
// inline-text ones back, in their original order.
func TestTextArtifacts(t *testing.T) {
	input := []content.Artifact{
		{Text: "first"},
		{Text: ""}, // empty inline text: excluded
		{Blob: &content.StoreHandle{Backend: "gcs"}}, // blob payload: excluded
		{Text: "second", Blob: nil},
	}

	got := extract.TextArtifacts(input)

	// Compare only the Text payloads, which identify the kept artifacts.
	texts := make([]string, 0, len(got))
	for i := range got {
		texts = append(texts, got[i].Text)
	}
	if !slices.Equal(texts, []string{"first", "second"}) {
		t.Fatalf("TextArtifacts = %+v, want the two non-empty inline-text artifacts", got)
	}
}

func TestSingleTextArtifact(t *testing.T) {
one, err := extract.SingleTextArtifact([]content.Artifact{{Text: "only"}})
if err != nil || one.Text != "only" {
t.Fatalf("single = (%+v, %v), want the one artifact", one, err)
}

_, err = extract.SingleTextArtifact([]content.Artifact{{Blob: &content.StoreHandle{}}})
if !errors.Is(err, extract.ErrNoTextArtifact) {
t.Fatalf("no-text err = %v, want ErrNoTextArtifact", err)
}

_, err = extract.SingleTextArtifact([]content.Artifact{{Text: "a"}, {Text: "b"}})
if !errors.Is(err, extract.ErrMultipleTextArtifacts) {
t.Fatalf("multi err = %v, want ErrMultipleTextArtifacts", err)
}
}
56 changes: 56 additions & 0 deletions extract/preset/preset.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
// Package preset wires the common document extractors into an extract.Registry
// in one call. It is a convenience bundle, not policy: it only registers
// format extractors (text, HTML, PDF, DOCX, Markdown) and chooses no app
// behavior.
//
// Importing this package pulls in every bundled format's dependencies (e.g.
// golang.org/x/net for HTML, dslipak/pdf for PDF) — that is the trade for the
// one-call convenience. Apps that want a leaner import tree register only the
// formats they use against extract.NewRegistry directly. A depguard rule keeps
// core packages from importing this bundle (see
// docs/adr/0006-optional-adapters-as-subpackages.md).
//
// reg := preset.NewDocumentRegistry(extract.WithMaxBytes(maxUploadBytes))
package preset

import (
"github.com/SnapdragonPartners/maestro-cms/content"
"github.com/SnapdragonPartners/maestro-cms/extract"
"github.com/SnapdragonPartners/maestro-cms/extract/docx"
"github.com/SnapdragonPartners/maestro-cms/extract/html"
"github.com/SnapdragonPartners/maestro-cms/extract/markdown"
"github.com/SnapdragonPartners/maestro-cms/extract/pdf"
)

// mediaTypeHTML is the media type under which RegisterDocuments registers the
// HTML extractor. The html subpackage exposes no media-type constant, so the
// value is named here.
const mediaTypeHTML content.MediaType = "text/html"

// RegisterDocuments registers the common document extractors on r: plain text,
// HTML, PDF, DOCX, and Markdown (both text/markdown and text/x-markdown). It
// follows extract.Registry.Register's semantics and so panics if a media type is
// already registered on r.
//
// Use it to add the bundle to a registry you built yourself; use
// NewDocumentRegistry to get a fresh registry with the bundle already wired in.
func RegisterDocuments(r *extract.Registry) {
r.Register(extract.MediaTypeText, extract.NewTextExtractor())
r.Register(mediaTypeHTML, html.New())
r.Register(pdf.MediaType, pdf.New())
r.Register(docx.MediaType, docx.New())
// Markdown is registered under both of its media types, each with its own
// extractor value from markdown.New.
r.Register(markdown.MediaType, markdown.New())
r.Register(markdown.MediaTypeX, markdown.New())
}

// NewDocumentRegistry builds a fresh extract.Registry and registers the common
// document extractors on it via RegisterDocuments. Any opts (for example
// extract.WithMaxBytes) are forwarded unchanged to extract.NewRegistry.
func NewDocumentRegistry(opts ...extract.Option) *extract.Registry {
	reg := extract.NewRegistry(opts...)
	RegisterDocuments(reg)
	return reg
}

// SupportedDocumentMediaTypes reports, in sorted order, the canonical media
// types that the bundle registers. The list is read back from a freshly built
// registry rather than maintained by hand, so it cannot drift from what
// RegisterDocuments actually wires up.
func SupportedDocumentMediaTypes() []content.MediaType {
	bundle := NewDocumentRegistry()
	return bundle.SupportedMediaTypes()
}
Loading
Loading