Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 12 additions & 7 deletions pkg/ingest/ingest.go
Original file line number Diff line number Diff line change
Expand Up @@ -359,11 +359,11 @@ func (p *Pipeline) summaryFor(ctx context.Context, s db.Section, childLines []st
resp, err := p.LLM.Complete(ctx, llmgate.Request{
Model: p.SummaryModel,
Temperature: 0.0,
MaxTokens: 200,
MaxTokens: 260,
Messages: []llmgate.Message{
{Role: llmgate.RoleSystem, Content: summarySystemPrompt(profile)},
{Role: llmgate.RoleUser, Content: fmt.Sprintf(
"Summarize this section titled %q in a single sentence (max 40 words):\n\n%s",
"Section titled %q.\n\n%s\n\nReturn a single sentence (≤ 60 words) that names this section's concrete topics, entities, identifiers, and key items so a retrieval engine can match it to user questions.",
cleanForLLM(s.Title), body)},
},
})
Expand Down Expand Up @@ -484,16 +484,21 @@ func isLikelyMojibakeTitle(s string) bool {
}

// summarySystemPrompt returns a domain-aware system prompt for the
// summarization LLM based on the document's store profile.
//
// Summaries are optimized for RETRIEVAL: a downstream retrieval engine, given
// only the summary, should be able to tell whether the section answers a
// specific question. So we ask the model to name the concrete topics, entities,
// identifiers, and key items the section covers — not just describe it
// generically.
//
// profile is matched case-insensitively after trimming surrounding whitespace.
// "research" and "medical" select domain-specific framing; every other value,
// including the empty string, gets the business/legal/financial framing.
func summarySystemPrompt(profile string) string {
	// Appended to every profile's framing so the retrieval-oriented
	// instructions cannot drift apart between document classes.
	const retrievalRule = "Write so a downstream retrieval engine, reading only your summary, can tell whether this section answers a specific user question. Name the section's concrete topics — entities, identifiers, table contents, named items, key numbers — not just a generic description. One factual sentence, ≤ 60 words, no preamble, no quotes."
	switch strings.ToLower(strings.TrimSpace(profile)) {
	case "research":
		return "You summarize sections of academic research papers. Capture the key claim, method, dataset, or result. " + retrievalRule
	case "medical":
		return "You summarize sections of clinical and medical documents. Capture the key finding, recommendation, dosage, drug name, definition, or guideline. " + retrievalRule
	default:
		return "You summarize sections of business, legal, and financial documents (filings, reports, contracts). " + retrievalRule
	}
}

Expand Down
92 changes: 92 additions & 0 deletions pkg/parser/chunk_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
package parser

import (
"strings"
"testing"
)

// TestChunkOversizedLeavesSplits feeds a single oversized leaf through
// chunkOversizedLeaves and checks that it becomes a content-less parent whose
// children carry the text, with the first child titled after the leading
// colon-terminated header.
func TestChunkOversizedLeavesSplits(t *testing.T) {
	const (
		sectionTitle = "3M COMPANY"
		colonHeader  = "Securities registered pursuant to Section 12(b) of the Act: "
		filler       = "alpha beta gamma delta epsilon zeta eta theta iota kappa lambda mu "
	)

	// The filler phrase is 67 bytes, so 60 repetitions give roughly 4,000
	// characters of body text after the header; the guard below confirms that
	// this is actually past the splitting threshold.
	content := colonHeader + strings.Repeat(filler, 60)
	if size := len(content); size <= leafChunkThreshold {
		t.Fatalf("test setup: content must exceed threshold; got %d", size)
	}

	out := chunkOversizedLeaves([]Section{{Level: 1, Title: sectionTitle, Content: content}})
	if len(out) != 1 {
		t.Fatalf("expected 1 top-level section, got %d", len(out))
	}

	parent := out[0]
	if parent.Title != sectionTitle {
		t.Errorf("parent title should be preserved, got %q", parent.Title)
	}
	if parent.Content != "" {
		t.Errorf("parent content should be cleared after splitting, got %d chars", len(parent.Content))
	}

	chunks := parent.Children
	if len(chunks) < 2 {
		t.Fatalf("expected multiple chunks, got %d", len(chunks))
	}

	// The first chunk should take its title from the colon-terminated header.
	if firstTitle := chunks[0].Title; !strings.HasPrefix(firstTitle, "Securities registered pursuant to Section 12(b)") {
		t.Errorf("first chunk title should come from the colon header, got %q", firstTitle)
	}

	// No chunk may be empty, and none may exceed twice the target size.
	for i := range chunks {
		body := chunks[i].Content
		if body == "" {
			t.Errorf("chunk %d has empty content", i)
		}
		if len(body) > leafChunkTarget*2 {
			t.Errorf("chunk %d larger than expected: %d chars", i, len(body))
		}
	}
}

// TestChunkOversizedLeavesLeavesSmallSectionsAlone checks that sections with
// modest content come back from chunkOversizedLeaves without any children.
func TestChunkOversizedLeavesLeavesSmallSectionsAlone(t *testing.T) {
	intro := Section{
		Level:   1,
		Title:   "Intro",
		Content: strings.Repeat("a b c d e f ", 50), // 600 chars
	}
	methods := Section{
		Level:   1,
		Title:   "Methods",
		Content: strings.Repeat("x y z ", 200), // 1200 chars
	}

	out := chunkOversizedLeaves([]Section{intro, methods})
	if len(out) != 2 {
		t.Fatalf("expected 2 sections preserved, got %d", len(out))
	}

	for i := range out {
		if n := len(out[i].Children); n != 0 {
			t.Errorf("section %d was unexpectedly split into %d children", i, n)
		}
	}
}

// TestChunkOversizedLeavesRecursesIntoInternals checks that chunking descends
// through an internal node: the parent is kept, its leaf keeps its title, and
// that leaf gains chunk children of its own.
func TestChunkOversizedLeavesRecursesIntoInternals(t *testing.T) {
	// The nested leaf holds 100 repetitions of a 44-byte phrase (4,400 chars),
	// which the final assertion expects to be large enough to trigger a split.
	tree := []Section{{
		Level: 1,
		Title: "Parent",
		Children: []Section{{
			Level:   2,
			Title:   "Detail",
			Content: strings.Repeat("the quick brown fox jumps over the lazy dog ", 100),
		}},
	}}

	out := chunkOversizedLeaves(tree)
	if !(len(out) == 1 && len(out[0].Children) != 0) {
		t.Fatalf("parent should be retained with chunked children, got %+v", out)
	}

	detail := out[0].Children[0]
	if detail.Title != "Detail" {
		t.Errorf("inner leaf title should be preserved, got %q", detail.Title)
	}
	if n := len(detail.Children); n < 2 {
		t.Errorf("inner leaf should have been chunked, has %d children", n)
	}
}

// TestDeriveChunkTitleColonHeader checks that a chunk opening with a
// colon-terminated header is titled with the text before the colon rather
// than with the supplied fallback.
func TestDeriveChunkTitleColonHeader(t *testing.T) {
	const (
		chunk = "Securities registered pursuant to Section 12(b) of the Act: Title of each class ..."
		want  = "Securities registered pursuant to Section 12(b) of the Act"
	)
	if got := deriveChunkTitle(chunk, "fallback"); got != want {
		t.Errorf("colon-header title: got %q want %q", got, want)
	}
}

// TestDeriveChunkTitleFallback checks that an empty chunk yields the fallback
// title unchanged.
func TestDeriveChunkTitleFallback(t *testing.T) {
	const fallback = "fb"
	got := deriveChunkTitle("", fallback)
	if got != fallback {
		t.Errorf("empty chunk should fall back, got %q", got)
	}
}
Loading
Loading