keyword.go

package rag

import (
	"iter"
	"math"
	"slices"

	"dappco.re/go"
)

// KeywordResult represents a keyword-search hit with TF-IDF score and
// backing chunk metadata. Emitted by KeywordIndex.Search.
//
//	result := KeywordResult{Source: "docs/guide.md", Score: 0.42, Text: "..."}
type KeywordResult struct {
	// Text is the chunk text that matched the query terms.
	Text string
	// Source is the source document path when available.
	Source string
	// Section is the Markdown section attached to the chunk.
	Section string
	// ChunkIndex is the chunk's zero-based source position.
	ChunkIndex int
	// Score is the TF-IDF relevance score.
	Score float32
}

// GetText returns the result text (satisfies textResult / rankedResult).
func (r KeywordResult) GetText() string { return r.Text }

// GetScore returns the TF-IDF relevance score.
func (r KeywordResult) GetScore() float32 { return r.Score }

// GetSource returns the source path.
func (r KeywordResult) GetSource() string { return r.Source }

// HasChunkIndex reports whether this result carries explicit chunk metadata.
func (r KeywordResult) HasChunkIndex() bool { return true }

// GetChunkIndex returns the source chunk index.
func (r KeywordResult) GetChunkIndex() int {
	return r.ChunkIndex
}

// KeywordIndex is a lightweight TF-IDF keyword search index over a fixed set
// of chunks. It is the fallback search path when Qdrant or Ollama are
// unavailable: no network, no embeddings, just bag-of-words scoring.
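// Each chunk is scored as the sum, over the distinct query terms it contains,
// of tf(term, chunk) * idf(term), using the smoothed IDF described in Search.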
//
//	idx := NewKeywordIndex(chunks)
//	results := idx.Search("authentication setup", 5)
type KeywordIndex struct {
	chunks []Chunk
	// termFreq[i] is a map of term -> normalised frequency within chunk i.
	termFreq []map[string]float32
	// docFreq is a map of term -> number of chunks containing the term.
	docFreq map[string]int
	// docCount is the total number of indexed chunks.
	docCount int
}

// SearchKeywords builds a temporary TF-IDF index over the provided chunks and
// returns the top-K keyword matches for the query.
//
//	results := SearchKeywords(chunks, "authentication setup", 5)
func SearchKeywords(chunks []Chunk, query string, topK int) []KeywordResult {
	return NewKeywordIndex(chunks).Search(query, topK)
}

// SearchKeywordsSeq is the iterator form of SearchKeywords.
//
//	for result := range SearchKeywordsSeq(chunks, "authentication setup", 5) { _ = result }
func SearchKeywordsSeq(chunks []Chunk, query string, topK int) iter.Seq[KeywordResult] {
	return func(yield func(KeywordResult) bool) {
		for _, result := range SearchKeywords(chunks, query, topK) {
			if !yield(result) {
				return
			}
		}
	}
}

// NewKeywordIndex builds a TF-IDF index from the given chunks.
// Tokens shorter than 3 characters are ignored.
//
//	idx := NewKeywordIndex(chunks)
func NewKeywordIndex(chunks []Chunk) *KeywordIndex {
	idx := &KeywordIndex{
		chunks:   make([]Chunk, len(chunks)),
		termFreq: make([]map[string]float32, len(chunks)),
		docFreq:  make(map[string]int, 32),
		docCount: len(chunks),
	}
	copy(idx.chunks, chunks)
	for i, chunk := range chunks {
		tokens := tokenise(chunk.Text)
		if len(tokens) == 0 {
			idx.termFreq[i] = map[string]float32{}
			continue
		}
		counts := make(map[string]int, len(tokens))
		for _, token := range tokens {
			counts[token]++
		}
		total := float32(len(tokens))
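		// Term frequency is the raw count normalised by chunk length: a
		// 100-token chunk containing "auth" three times gets tf["auth"] = 0.03.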
		tf := make(map[string]float32, len(counts))
		for term, count := range counts {
			tf[term] = float32(count) / total
			idx.docFreq[term]++
		}
		idx.termFreq[i] = tf
	}
	return idx
}

// Len returns the number of indexed chunks.
//
//	n := idx.Len()
func (idx *KeywordIndex) Len() int {
	if idx == nil {
		return 0
	}
	return idx.docCount
}

// Search returns the top-K chunks matching the query, ranked by TF-IDF score.
// Query tokens shorter than 3 characters are discarded. Chunks with zero
// matches are excluded from the result.
//
//	results := idx.Search("authentication setup", 5)
func (idx *KeywordIndex) Search(query string, topK int) []KeywordResult {
	if idx == nil || idx.docCount == 0 {
		return nil
	}
	queryTerms := tokenise(query)
	if len(queryTerms) == 0 {
		return nil
	}
	// Deduplicate query terms; repeated terms shouldn't inflate scores.
	seen := make(map[string]struct{}, len(queryTerms))
	uniqueTerms := make([]string, 0, len(queryTerms))
	for _, term := range queryTerms {
		if _, ok := seen[term]; ok {
			continue
		}
		seen[term] = struct{}{}
		uniqueTerms = append(uniqueTerms, term)
	}
	docCountF := float64(idx.docCount)
	scores := make([]float32, idx.docCount)
	// Smoothed IDF so that a matching term always contributes a positive weight
	// even when df == docCount (tiny corpora): idf = log((N+1)/(df+1)) + 1.
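	// For example, with N = 4 chunks a term appearing in 2 of them gets
	// idf = ln(5/3) + 1 ≈ 1.51, while a term appearing in all 4 still gets
	// idf = ln(5/5) + 1 = 1 rather than dropping to zero.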
	for _, term := range uniqueTerms {
		df := idx.docFreq[term]
		if df == 0 {
			continue
		}
		idf := float32(math.Log((docCountF+1.0)/float64(df+1)) + 1.0)
		for i, tf := range idx.termFreq {
			weight, ok := tf[term]
			if !ok {
				continue
			}
			scores[i] += weight * idf
		}
	}
	results := make([]KeywordResult, 0, idx.docCount)
	for i, score := range scores {
		if score <= 0 {
			continue
		}
		chunk := idx.chunks[i]
		results = append(results, KeywordResult{
			Text:       chunk.Text,
			Section:    chunk.Section,
			ChunkIndex: chunk.Index,
			Score:      score,
		})
	}
	slices.SortFunc(results, func(a, b KeywordResult) int {
		switch {
		case a.Score > b.Score:
			return -1
		case a.Score < b.Score:
			return 1
		case a.ChunkIndex < b.ChunkIndex:
			return -1
		case a.ChunkIndex > b.ChunkIndex:
			return 1
		}
		return 0
	})
	if topK > 0 && len(results) > topK {
		results = results[:topK]
	}
	return results
}

// tokenise splits text into lowercase tokens of length >= 3.
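// Whitespace and punctuation delimit tokens, so
// tokenise("Hello, world! Go") yields ["hello", "world"].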
func tokenise(text string) []string {
	var tokens []string
	current := core.NewBuilder()
	flush := func() {
		if current.Len() == 0 {
			return
		}
		token := core.Lower(current.String())
		current.Reset()
		if len([]rune(token)) >= 3 {
			tokens = append(tokens, token)
		}
	}
	for _, r := range text {
		switch {
		case r == ' ' || r == '\t' || r == '\n' || r == '\r':
			flush()
		case r == '.' || r == ',' || r == ';' || r == ':' || r == '!' || r == '?' ||
			r == '"' || r == '\'' || r == '(' || r == ')' || r == '[' || r == ']' ||
			r == '{' || r == '}' || r == '<' || r == '>':
			flush()
		default:
			current.WriteRune(r)
		}
	}
	flush()
	return tokens
}

// KeywordFilter re-ranks query results by boosting scores for results whose
// text contains one or more of the given keywords. Matching is
// case-insensitive using core.Contains. Each keyword match adds a 10%
// boost to the original score: score *= 1.0 + 0.1 * matchCount.
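// For example, a result whose text contains two of the keywords has its
// score multiplied by 1.2; results with no matches keep their original score.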
// Results are re-sorted by boosted score descending.
//
//	KeywordFilter(results, []string{"kubernetes", "containers"})
func KeywordFilter(results []QueryResult, keywords []string) []QueryResult {
	if len(keywords) == 0 || len(results) == 0 {
		return results
	}
	// Normalise keywords to lowercase once and deduplicate them so repeated
	// query terms do not inflate the boost.
	lowerKeywords := make([]string, 0, len(keywords))
	seen := make(map[string]struct{}, len(keywords))
	for _, kw := range keywords {
		kw = core.Lower(kw)
		if kw == "" {
			continue
		}
		if _, ok := seen[kw]; ok {
			continue
		}
		seen[kw] = struct{}{}
		lowerKeywords = append(lowerKeywords, kw)
	}
	// Apply the boost to a copy so the caller's slice is left untouched.
	boosted := make([]QueryResult, len(results))
	copy(boosted, results)
	for i := range boosted {
		lowerText := core.Lower(boosted[i].Text)
		matchCount := 0
		for _, kw := range lowerKeywords {
			if core.Contains(lowerText, kw) {
				matchCount++
			}
		}
		if matchCount > 0 {
			boosted[i].Score *= 1.0 + 0.1*float32(matchCount)
		}
	}
	// Re-sort by boosted score descending, breaking ties deterministically by
	// source, chunk index, then text.
	slices.SortFunc(boosted, func(a, b QueryResult) int {
		switch {
		case a.Score > b.Score:
			return -1
		case a.Score < b.Score:
			return 1
		case a.Source < b.Source:
			return -1
		case a.Source > b.Source:
			return 1
		case a.ChunkIndex < b.ChunkIndex:
			return -1
		case a.ChunkIndex > b.ChunkIndex:
			return 1
		case a.Text < b.Text:
			return -1
		case a.Text > b.Text:
			return 1
		}
		return 0
	})
	return boosted
}

// KeywordFilterSeq is the iterator form of KeywordFilter.
//
//	for result := range KeywordFilterSeq(results, []string{"kubernetes"}) { _ = result }
func KeywordFilterSeq(results []QueryResult, keywords []string) iter.Seq[QueryResult] {
	return func(yield func(QueryResult) bool) {
		for _, r := range KeywordFilter(results, keywords) {
			if !yield(r) {
				return
			}
		}
	}
}

// extractKeywords splits query text into individual keywords for filtering.
// Words shorter than 3 characters are discarded as they tend to be noise.
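// For example, extractKeywords("set up the auth flow") returns
// ["set", "the", "auth", "flow"]; "up" is dropped.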
func extractKeywords(query string) []string {
	return slices.Collect(extractKeywordsSeq(query))
}

// extractKeywordsSeq returns an iterator that yields keywords from a query.
func extractKeywordsSeq(query string) iter.Seq[string] {
	return func(yield func(string) bool) {
		for _, w := range tokenise(query) {
			if !yield(w) {
				return
			}
		}
	}
}

// fields splits text into words on whitespace (space, tab, newline, carriage
// return), preserving case and keeping words of any length, unlike tokenise.
//
//	fields("a\tb  c") -> ["a", "b", "c"]
func fields(text string) []string {
	var words []string
	current := core.NewBuilder()
	flush := func() {
		if current.Len() == 0 {
			return
		}
		words = append(words, current.String())
		current.Reset()
	}
	for _, r := range text {
		switch r {
		case ' ', '\t', '\n', '\r':
			flush()
		default:
			current.WriteRune(r)
		}
	}
	flush()
	return words
}