Skip to content

Commit e4f9781

Browse files
committed
feat: add phase 2 notebook search/chunk routes and oss roadmap docs
1 parent 6f734aa commit e4f9781

6 files changed

Lines changed: 283 additions & 0 deletions

File tree

.github/workflows/ci.yml

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
name: CI

# Runs type-check and tests on every pull request and on pushes to main.
# When deployment secrets are configured, also runs a live smoke test
# against the deployed worker.
on:
  pull_request:
  push:
    branches: [main]

jobs:
  validate:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup Node
        uses: actions/setup-node@v4
        with:
          node-version: '20'
          cache: 'npm'

      - name: Install
        run: npm ci

      - name: Typecheck
        run: npm run type-check

      - name: Test
        run: npm test

  smoke-v2-live:
    runs-on: ubuntu-latest
    needs: validate
    # The `secrets` context is not available in a job-level `if:`, so the
    # secrets are mapped into job-level env and every step is gated on them.
    # Without the secrets (e.g. pull requests from forks) the job still starts,
    # but all of its steps are skipped.
    env:
      MINDSPRING_BASE_URL: ${{ secrets.MINDSPRING_BASE_URL }}
      MINDSPRING_API_KEY: ${{ secrets.MINDSPRING_API_KEY }}
    steps:
      - name: Checkout
        if: ${{ env.MINDSPRING_BASE_URL != '' && env.MINDSPRING_API_KEY != '' }}
        uses: actions/checkout@v4

      - name: Setup Node
        if: ${{ env.MINDSPRING_BASE_URL != '' && env.MINDSPRING_API_KEY != '' }}
        uses: actions/setup-node@v4
        with:
          node-version: '20'
          cache: 'npm'

      - name: Install
        if: ${{ env.MINDSPRING_BASE_URL != '' && env.MINDSPRING_API_KEY != '' }}
        run: npm ci

      - name: Run v2 smoke against deployed worker
        if: ${{ env.MINDSPRING_BASE_URL != '' && env.MINDSPRING_API_KEY != '' }}
        run: npm run smoke:v2

ARCHITECTURE.md

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
# MindSpring Architecture
2+
3+
## Runtime Model
4+
5+
- Cloudflare Worker (Hono) serves API + static SPA
6+
- Queue consumer handles asynchronous ingestion
7+
- Data planes:
8+
- Vectorize: embeddings + lightweight metadata
9+
- KV: fast hydration + request telemetry + v1 progress states
10+
- D1: v2 relational notebook/source/chunk/job state
11+
- R2: raw uploaded source files
12+
13+
## v1 vs v2 Boundaries
14+
15+
### v1 (`/api/*`)
16+
17+
- Primary primitive: conversation archive
18+
- Storage pattern:
19+
- full conversation text in KV (`conv:*`)
20+
- vector metadata in Vectorize
21+
- Ingestion supports JSON array/object conversation exports
22+
23+
### v2 (`/api/v2/workspaces/:workspaceId/notebooks/*`)
24+
25+
- Primary primitive: Knowledge Notebook (workspace-scoped)
26+
- Storage pattern:
27+
- notebooks/sources/chunks/jobs in D1
28+
- vectors in Vectorize with notebook/source/chunk metadata pointers
29+
- raw files in R2
30+
- Ingestion supports parser-typed jobs from source registration
31+
- Current parser coverage: `markdown`, `txt` (+ NDJSON thread-compatible ingest path)
32+
33+
## Query Flow
34+
35+
1. request enters notebook-scoped route
36+
2. embedding generated via Workers AI
37+
3. vector search in Vectorize
38+
4. notebook scope enforced via metadata filter + app-level guard
39+
5. fallback to D1 chunk retrieval when vector/generation path degrades
40+
41+
## Deletion Model
42+
43+
- Notebook delete is soft-delete (`deleted_at`) in D1
44+
- Sources under notebook are soft-deleted together
45+
- This avoids irreversible data loss and enables async cleanup workflows
46+
47+
## Design Constraints
48+
49+
- Module size cap: < 400 lines per source module
50+
- Keep runtime dependency surface minimal (Hono only)
51+
- Prefer deterministic, source-grounded outputs with explicit citations
52+
- Keep OSS-safe boundaries: no secrets/PII artifacts in tracked files

ROADMAP.md

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
# MindSpring Roadmap
2+
3+
## Now
4+
5+
- v1 production flows stable:
6+
- conversation upload (`/api/uploads/*`)
7+
- semantic search (`/api/search`)
8+
- RAG chat (`/api/chat`)
9+
- v2 backend foundation live:
10+
- notebook CRUD/read lifecycle (workspace-scoped)
11+
- source registration + ingestion jobs
12+
- job status polling
13+
- notebook chat with citation fallback
14+
- notebook search + chunk diagnostics
15+
- v2 parsers currently implemented:
16+
- `markdown`
17+
- `txt`
18+
- NDJSON thread ingestion path for AEGIS-compatible writes
19+
20+
## Next
21+
22+
- Implement v2 artifact persistence (`POST /artifacts`) with `snapshot_hashes`
23+
- Implement `chat_export` parser under v2 pipeline
24+
- Add source-level invalidation/staleness markers for artifacts
25+
- Add migration bridge strategy for optional v1 -> v2 notebook wrapping
26+
27+
## Later
28+
29+
- Implement `url` parser
30+
- Implement `pdf` parser
31+
- Add notebook-first frontend routes once backend is fully stabilized
32+
- Add release automation and tagged version docs for v2 milestones

openapi.yaml

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -847,6 +847,61 @@ paths:
847847
'404':
848848
$ref: '#/components/responses/NotFound'
849849

850+
/api/v2/workspaces/{workspaceId}/notebooks/{notebookId}/search:
  post:
    summary: Notebook-scoped semantic search
    operationId: notebookSearchV2
    tags: [notebooks-v2]
    x-auth-scope: read
    parameters:
      - $ref: '#/components/parameters/WorkspaceId'
      - $ref: '#/components/parameters/NotebookId'
    requestBody:
      required: true
      content:
        application/json:
          schema:
            type: object
            required: [query]
            properties:
              query:
                type: string
                minLength: 1
                description: Search text; must contain at least one non-whitespace character.
              limit:
                type: integer
                minimum: 1
                maximum: 50
                default: 10
                description: Maximum number of results; out-of-range values are clamped by the server.
              threshold:
                type: number
                minimum: 0
                maximum: 1
                default: 0
                description: Minimum similarity score a result must reach.
    responses:
      '200':
        description: Notebook-scoped search results
      '400':
        description: The `query` field is missing or blank
      '404':
        $ref: '#/components/responses/NotFound'

/api/v2/workspaces/{workspaceId}/notebooks/{notebookId}/chunks:
  get:
    summary: Notebook chunk diagnostics
    operationId: notebookChunksV2
    tags: [notebooks-v2]
    x-auth-scope: read
    parameters:
      - $ref: '#/components/parameters/WorkspaceId'
      - $ref: '#/components/parameters/NotebookId'
      - name: limit
        in: query
        required: false
        description: Maximum number of chunks to return, newest first.
        schema:
          type: integer
          minimum: 1
          maximum: 200
          default: 50
    responses:
      '200':
        description: Raw chunk diagnostics for notebook
      '404':
        $ref: '#/components/responses/NotFound'
904+
850905
/api/v2/workspaces/{workspaceId}/notebooks/{notebookId}/artifacts:
851906
post:
852907
summary: Build studio artifact

src/lib/v2-store.ts

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -291,3 +291,40 @@ export async function getSource(
291291

292292
return row ?? null
293293
}
294+
295+
/**
 * Row shape returned by `listNotebookChunks`: the selected `chunks` columns
 * plus the title of the owning source.
 */
export interface NotebookChunkRecord {
  id: string
  notebook_id: string
  source_id: string
  content: string
  chunk_hash: string
  // NOTE(review): presumably character offsets of the chunk within its source text — confirm against the chunker.
  char_start: number
  char_end: number
  created_at: string
  /** Joined from `sources.title`; not a column of `chunks`. */
  source_title: string
}

/**
 * Lists chunks of a notebook, newest first, each with its source title.
 *
 * Rows are returned only when the notebook belongs to `workspaceId` and is
 * not soft-deleted.
 *
 * @param env - Worker bindings; passed to `requireDb` to obtain the D1 handle.
 * @param workspaceId - Workspace the notebook must belong to.
 * @param notebookId - Notebook whose chunks are listed.
 * @param limit - Maximum rows to return (default 50). Non-finite values fall
 *   back to 50; values below 1 are raised to 1; fractions are truncated.
 * @returns Matching chunk rows ordered by `created_at` descending, then `id`.
 */
export async function listNotebookChunks(
  env: Env,
  workspaceId: string,
  notebookId: string,
  limit: number = 50
): Promise<NotebookChunkRecord[]> {
  const db = requireDb(env)

  // SQLite treats a negative LIMIT as "no limit", so an unchecked value could
  // return every chunk of the notebook. Force a positive integer.
  const safeLimit = Number.isFinite(limit) ? Math.max(1, Math.trunc(limit)) : 50

  // NOTE(review): soft-deleted sources are not filtered here. If `sources` has
  // its own `deleted_at` column, consider adding `AND s.deleted_at IS NULL`.
  const rows = await db
    .prepare(
      `SELECT
        c.id, c.notebook_id, c.source_id, c.content, c.chunk_hash, c.char_start, c.char_end, c.created_at,
        s.title AS source_title
      FROM chunks c
      JOIN sources s ON s.id = c.source_id
      JOIN notebooks n ON n.id = c.notebook_id
      WHERE n.workspace_id = ? AND c.notebook_id = ? AND n.deleted_at IS NULL
      ORDER BY c.created_at DESC, c.id ASC
      LIMIT ?`
    )
    .bind(workspaceId, notebookId, safeLimit)
    .all<NotebookChunkRecord>()

  return rows.results
}

src/routes/notebooks-v2-manage.ts

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,14 @@ import type { Env } from '../lib/types'
33
import {
44
getNotebook,
55
getSource,
6+
listNotebookChunks,
67
listNotebooks,
78
listSources,
89
patchNotebook,
910
softDeleteNotebook,
1011
} from '../lib/v2-store'
12+
import { generateQueryEmbedding } from '../lib/embeddings'
13+
import { VectorStore } from '../lib/vectorize'
1114

1215
const notebooksV2Manage = new Hono<{ Bindings: Env }>()
1316

@@ -113,4 +116,57 @@ notebooksV2Manage.get('/:notebookId/sources/:sourceId', async (c) => {
113116
return c.json(source)
114117
})
115118

119+
/**
 * POST /:notebookId/search — notebook-scoped semantic search.
 *
 * Body: `{ query: string, limit?: number, threshold?: number }`.
 * - 400 when the body is not valid JSON or `query` is missing, not a string, or blank.
 * - 404 when the notebook is not found for the workspace.
 * - 200 with `{ query, notebookId, count, results }` otherwise.
 *
 * `limit` defaults to 10 and is clamped to 1..50; `threshold` defaults to 0 and
 * is clamped to 0..1. Non-numeric or non-finite values fall back to the defaults.
 */
notebooksV2Manage.post('/:notebookId/search', async (c) => {
  const workspaceId = requireParam(c.req.param('workspaceId'), 'workspaceId')
  const notebookId = requireParam(c.req.param('notebookId'), 'notebookId')

  // c.req.json() rejects on an empty or malformed body; report that as a
  // client error instead of letting it surface as a 500.
  let payload: { query?: unknown; limit?: unknown; threshold?: unknown }
  try {
    // A literal JSON `null` body is treated like an empty object.
    payload = (await c.req.json()) ?? {}
  } catch {
    return c.json({ error: 'request body must be valid JSON' }, 400)
  }

  // Only strings can be searched; anything else counts as a missing query.
  const query = typeof payload.query === 'string' ? payload.query : ''
  if (!query.trim()) {
    return c.json({ error: 'query is required' }, 400)
  }

  const notebook = await getNotebook(c.env, workspaceId, notebookId)
  if (!notebook) return c.json({ error: 'notebook not found' }, 404)

  const rawLimit = payload.limit
  const requestedLimit =
    typeof rawLimit === 'number' && Number.isFinite(rawLimit) ? Math.trunc(rawLimit) : 10
  const limit = Math.min(Math.max(requestedLimit, 1), 50)

  const rawThreshold = payload.threshold
  const requestedThreshold =
    typeof rawThreshold === 'number' && Number.isFinite(rawThreshold) ? rawThreshold : 0
  const threshold = Math.min(Math.max(requestedThreshold, 0), 1)

  const queryVector = await generateQueryEmbedding(query, c.env)
  const store = new VectorStore(c.env)

  const results = await store.search(queryVector, limit, threshold, {
    hydrateFullText: true,
    notebookId,
  })

  return c.json({
    query,
    notebookId,
    count: results.length,
    results,
  })
})
152+
153+
/**
 * GET /:notebookId/chunks — chunk diagnostics for a notebook.
 *
 * The `limit` query parameter defaults to 50 when absent, unparsable, or 0,
 * and is clamped to 1..200. Responds 404 when the notebook is not found for
 * the workspace, otherwise `{ notebookId, count, chunks }`.
 */
notebooksV2Manage.get('/:notebookId/chunks', async (c) => {
  const workspaceId = requireParam(c.req.param('workspaceId'), 'workspaceId')
  const notebookId = requireParam(c.req.param('notebookId'), 'notebookId')

  // NaN and 0 are both falsy, so either falls back to the default of 50.
  const limitParam = c.req.query('limit') ?? '50'
  const parsedLimit = Number.parseInt(limitParam, 10) || 50
  const limit = Math.min(Math.max(parsedLimit, 1), 200)

  const notebook = await getNotebook(c.env, workspaceId, notebookId)
  if (!notebook) {
    return c.json({ error: 'notebook not found' }, 404)
  }

  const chunks = await listNotebookChunks(c.env, workspaceId, notebookId, limit)
  const payload = { notebookId, count: chunks.length, chunks }
  return c.json(payload)
})
171+
116172
export { notebooksV2Manage }

0 commit comments

Comments
 (0)