-
-
Notifications
You must be signed in to change notification settings - Fork 23
Expand file tree
/
Copy pathekg-embedding.el
More file actions
526 lines (469 loc) · 23 KB
/
ekg-embedding.el
File metadata and controls
526 lines (469 loc) · 23 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
;;; ekg-embedding.el --- Create and use embeddings for ekg -*- lexical-binding: t -*-
;; Copyright (c) 2023-2025 Andrew Hyatt <ahyatt@gmail.com>
;; Author: Andrew Hyatt <ahyatt@gmail.com>
;; Homepage: https://github.com/ahyatt/ekg
;; Keywords: outlines, hypermedia
;; SPDX-License-Identifier: GPL-3.0-or-later
;;
;; This program is free software; you can redistribute it and/or
;; modify it under the terms of the GNU General Public License as
;; published by the Free Software Foundation; either version 3 of the
;; License, or (at your option) any later version.
;;
;; This program is distributed in the hope that it will be useful, but
;; WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;; General Public License for more details.
;;
;; You should have received a copy of the GNU General Public License
;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
;;; Commentary:
;; This is a module for creating, storing, and using embeddings in ekg. The
;; embeddings provide the capability of understanding note and tag similarity,
;; as well as searching via embedding.
;;
;; It is highly recommended that you byte-compile this file or, better yet,
;; native-compile it, because of the number of calculations involved.
(require 'ekg)
(require 'llm)
(require 'vecdb nil t)
(declare-function vecdb-exists "vecdb")
(declare-function vecdb-create "vecdb")
(declare-function vecdb-upsert-items "vecdb")
(declare-function vecdb-delete-items "vecdb")
(declare-function vecdb-get-item "vecdb")
(declare-function vecdb-search-by-vector "vecdb")
(declare-function vecdb-item-payload "vecdb")
(declare-function vecdb-item-vector "vecdb")
(declare-function make-vecdb-item "vecdb")
;;; Code:
;; Customization group for the user options defined in this file; it is
;; nested under the `ekg' group.
(defgroup ekg-embedding nil
"Embedding-based functionality for ekg."
:group 'ekg)
;; `ekg-embedding-generate-all' creates this buffer on demand and appends
;; its progress and error lines to it.
(defcustom ekg-generate-all-buffer "*ekg embedding generation*"
"Buffer name used for messages related to generating embeddings."
:type 'string
:group 'ekg-embedding)
;; The selector is invoked with `funcall' on a single string (the plain
;; text of a note, or the contents of a buffer) just before that text is
;; handed to the embedding provider.
(defcustom ekg-embedding-text-selector #'ekg-embedding-text-selector-initial
"Function to select the text of the embedding.
This is necessary because there are usually token limits in the
API calls. The function will be passed the full text and will
return the text to pass to the embedding API."
:type '(function)
:group 'ekg-embedding)
;; `ekg-embedding-generate-all' partitions the notes to embed into groups
;; of this size and issues one batch request per group.
(defcustom ekg-embedding-batch-size 100
"The number of embeddings to generate in a single batch."
:type 'integer
:group 'ekg-embedding)
;; Passed to `ekg-display-note-text' as its word limit whenever a note is
;; rendered to plain text for embedding.
(defconst ekg-embedding-max-words 800 "The maximum number of words to use for generating embeddings.")
;; Handed as the first argument to `llm-embedding', `llm-embedding-async'
;; and `llm-batch-embeddings-async'.  It defaults to nil, so it must be set
;; before any embedding can be requested.
(defvar ekg-embedding-provider nil
"The provider of the embedding.
This is a struct representing a provider in the `llm' package.")
;; Code in this file takes the `car' of this value as the vecdb provider and
;; the `cdr' as the collection, and passes both to the vecdb functions.
;; NOTE(review): the docstring calls the collection an `embed-db-collection';
;; confirm whether that is the current name of the vecdb collection type.
(defvar ekg-vecdb-provider nil
"The vecdb provider for the `ekg-embedding' module.
This is a CONS of an `vecdb-provider' and a `embed-db-collection'.
If nil, then we fallback to the default `ekg-db'.")
(defun ekg-embedding-connect ()
  "Connect to the ekg database and prepare embedding storage.
When `ekg-vecdb-provider' is set and its collection does not exist
yet, create the collection.  In every other case call
`ekg-embedding-add-schema'."
  (ekg-connect)
  (let ((provider (car ekg-vecdb-provider))
        (collection (cdr ekg-vecdb-provider)))
    (cond
     ;; No vecdb configured: embeddings live in the triples database.
     ((null ekg-vecdb-provider)
      (ekg-embedding-add-schema))
     ;; The collection is already there.
     ((vecdb-exists provider collection)
      (ekg-embedding-add-schema))
     (t
      (vecdb-create provider collection)))))
(defun ekg-embedding-add-schema ()
  "Register the `embedding' triples schema in `ekg-db'.
The schema has a single unique `embedding' property of type vector.
Nothing is done when `ekg-vecdb-provider' is set, since embeddings are
then not kept in the sqlite database."
  (when (null ekg-vecdb-provider)
    (triples-add-schema
     ekg-db 'embedding
     '(embedding :base/unique t :base/type vector))))
;; Register the embedding property in `ekg-header-hidden-properties' at load
;; time.  Presumably this keeps the raw vector out of note headers; confirm
;; against the definition of that variable in ekg.el.
(add-to-list 'ekg-header-hidden-properties :embedding/embedding)
(defun ekg-remove-sqlite-embeddings ()
  "Delete every embedding stored in the SQLite database.
Use this after deciding to stop using embeddings, or after moving
from the native sqlite storage to a vecdb provider."
  (interactive)
  (ekg-embedding-connect)
  (triples-remove-schema-type ekg-db 'embedding)
  ;; With the embeddings gone the database can be compacted.  This is
  ;; only attempted with the builtin sqlite interface.
  (let ((builtin-sqlite (and (eq triples-sqlite-interface 'builtin)
                             (fboundp 'sqlite-execute))))
    (if builtin-sqlite
        (sqlite-execute ekg-db "VACUUM")
      nil)))
(defun ekg-embedding-average (embeddings)
  "Compute the element-wise average of EMBEDDINGS, a list of vectors.
Return a new vector with the same length as the first embedding.
Embeddings whose length differs from the first one are ignored, and
the average is taken over the embeddings that were actually summed.
The division is done in floating point, so integer inputs are not
truncated.  There should be at least one embedding passed in; with
an empty list the result is an empty vector."
  (let* ((dims (length (car embeddings)))
         ;; Only vectors of matching size can be summed element-wise.
         (usable (seq-filter (lambda (e) (= (length e) dims)) embeddings))
         ;; Divide by the number of vectors that contributed to the sum,
         ;; not by the number passed in.
         (divisor (float (length usable)))
         (avg (make-vector dims 0)))
    (dolist (e usable)
      (dotimes (i dims)
        (aset avg i (+ (aref avg i) (aref e i)))))
    (dotimes (i dims)
      (aset avg i (/ (aref avg i) divisor)))
    avg))
(defun ekg-embedding-id-to-embed-id (input)
  "Derive an unsigned 64-bit integer id from INPUT.
INPUT is printed with the `%S' format, the result is hashed with
SHA-256, and the first eight bytes of the digest are combined,
most significant byte first, into a single integer."
  (let* ((printed (format "%S" input))
         ;; The final t asks for the raw digest bytes rather than hex text.
         (digest (secure-hash 'sha256 printed nil nil t))
         (leading-bytes (substring digest 0 8)))
    (seq-reduce (lambda (acc byte) (+ (* acc 256) byte))
                leading-bytes
                0)))
(defun ekg-embedding--note-to-embed-item (note embedding)
  "Build a vecdb item holding EMBEDDING for NOTE.
The item id is the hash of the note id computed by
`ekg-embedding-id-to-embed-id'.  The note id itself is kept in the
payload under :ekg-id, printed with `%S'."
  (let ((note-id (ekg-note-id note)))
    (make-vecdb-item
     :id (ekg-embedding-id-to-embed-id note-id)
     :payload (list :ekg-id (format "%S" note-id))
     :vector embedding)))
(defun ekg-embedding-generate-for-note-async (note &optional success-callback error-callback)
  "Request an embedding for NOTE asynchronously and store it.
The text sent to the provider is the plain text of NOTE, limited to
`ekg-embedding-max-words' words and passed through
`ekg-embedding-text-selector'.

Once the embedding arrives it is stored, and SUCCESS-CALLBACK, if
non-nil, is called with NOTE.  On failure ERROR-CALLBACK is used if
non-nil; otherwise the error is reported with `message'."
  (ekg-embedding-connect)
  (let* ((plain (substring-no-properties
                 (ekg-display-note-text note ekg-embedding-max-words 'plaintext)))
         (text (funcall ekg-embedding-text-selector plain))
         (on-success
          (lambda (embedding)
            (ekg-connect)
            (ekg-embedding-batch-store
             (list (ekg-embedding--note-to-embed-item note embedding)))
            (when success-callback
              (funcall success-callback note))))
         (on-error
          (or error-callback
              (lambda (error-type msg)
                (message "ekg-embedding: error %s: %s" error-type msg)))))
    (llm-embedding-async ekg-embedding-provider text on-success on-error)))
(defun ekg-embedding-generate-for-note-sync (note)
  "Compute the embedding for NOTE and store it, blocking until done.
A result that fails `ekg-embedding-valid-p' is not stored; a warning
is logged instead."
  (ekg-embedding-connect)
  (let* ((plain (substring-no-properties
                 (ekg-display-note-text note ekg-embedding-max-words 'plaintext)))
         (text (funcall ekg-embedding-text-selector plain))
         (embedding (llm-embedding ekg-embedding-provider text)))
    (cond
     ((not (ekg-embedding-valid-p embedding))
      (lwarn '(ekg embedding-generation) :error "Invalid and unusable embedding generated from llm-embedding of note %s: %S"
             (ekg-note-id note) embedding))
     ;; A vecdb collection is configured: store the item there.
     (ekg-vecdb-provider
      (ekg-embedding-batch-store
       (list (ekg-embedding--note-to-embed-item note embedding))))
     ;; Otherwise store the vector as a triple keyed by the note id.
     (t
      (triples-set-type ekg-db (ekg-note-id note) 'embedding
                        :embedding embedding)))))
(defun ekg-embedding-batch-store (items)
  "Store a batch of ITEMS in the embedding database.
ITEMS is a list of vecdb items as built by
`ekg-embedding--note-to-embed-item'.

When `ekg-vecdb-provider' is set, the items are upserted into its
collection as they are.  Otherwise each vector is stored in `ekg-db'
as an `embedding' triple, keyed by the ekg id recorded in the item's
payload, which is the key `ekg-embedding-get' looks up."
  (if ekg-vecdb-provider
      (vecdb-upsert-items (car ekg-vecdb-provider)
                          (cdr ekg-vecdb-provider)
                          items)
    (dolist (item items)
      ;; The item id is a hash of the ekg id, so it cannot serve as the
      ;; triples subject.  The original id is in the payload, printed with
      ;; `%S', and is read back here.
      (let ((ekg-id (read (plist-get (vecdb-item-payload item) :ekg-id))))
        (triples-set-type ekg-db ekg-id 'embedding
                          :embedding (vecdb-item-vector item))))))
(defun ekg-embedding-generate-batch-async (notes success-callback error-callback)
  "Request embeddings for all of NOTES in one asynchronous batch.
When the embeddings arrive they are paired with NOTES in order and
stored.  SUCCESS-CALLBACK, if non-nil, is then called with the number
of notes in the batch.
ERROR-CALLBACK is handed to the provider and receives the error type
and message on failure."
  (cl-flet ((embedding-text (note)
              (funcall ekg-embedding-text-selector
                       (substring-no-properties
                        (ekg-display-note-text note ekg-embedding-max-words 'plaintext)))))
    (let ((texts (mapcar #'embedding-text notes)))
      (llm-batch-embeddings-async
       ekg-embedding-provider
       texts
       (lambda (embeddings)
         (ekg-connect)
         (ekg-embedding-batch-store
          (cl-loop for note in notes
                   for embedding in embeddings
                   collect (ekg-embedding--note-to-embed-item note embedding)))
         (when success-callback
           (funcall success-callback (length notes)))
         nil)
       error-callback))))
(defun ekg-embedding-generate-for-note-tags-delayed (note)
  "Schedule `ekg-embedding-generate-for-note-tags' for NOTE.
The call happens once Emacs has been idle for five minutes.  The
delay exists because a note that was just saved may not have its
embedding yet."
  (let ((idle-seconds (* 5 60)))
    (run-with-idle-timer idle-seconds nil
                         #'ekg-embedding-generate-for-note-tags note)))
(defun ekg-embedding-generate-for-note-tags (note)
  "Refresh the embedding of every tag on NOTE."
  (ekg-embedding-connect)
  (dolist (tag (ekg-note-tags note))
    (ekg-embedding-refresh-tag-embedding tag)))
(defun ekg-embedding-note-get (note)
  "Return the embedding already stored for NOTE, or nil if there is none."
  (let ((note-id (ekg-note-id note)))
    (ekg-embedding-get note-id)))
(defun ekg-embedding-valid-p (embedding)
  "Return t if EMBEDDING is a usable embedding, nil otherwise.
A usable embedding is a non-empty vector that contains no 0."
  ;; A 0 in any dimension is treated as a sign that something went wrong
  ;; while obtaining the embedding, rather than as a real value.  Note that
  ;; `seq-contains-p' compares with `equal' by default, so only the integer
  ;; 0 is caught here; a float 0.0 is not.
  (not (or (not (vectorp embedding))
           (zerop (length embedding))
           (seq-contains-p embedding 0))))
(defun ekg-embedding-refresh-tag-embedding (tag)
"Refresh the embedding for TAG.
The embedding for TAG is recomputed by averaging all the
embeddings of notes with the given tag."
;; Any error raised below is caught by the handler on the last line and
;; reported with `message' instead of being propagated to the caller.
(condition-case err
(let ((embeddings
;; Collect one entry per existing note carrying TAG.  An entry may still
;; be an invalid embedding; those are filtered out before averaging.
(cl-loop for tagged in
(plist-get (triples-get-type ekg-db tag 'tag) :tagged)
for note = (ekg-get-note-with-id tagged)
;; Skip deleted notes that are still in the tag list.
when note
collect
(let ((embedding (ekg-embedding-get tagged)))
;; Repair path: when the stored embedding is invalid, try once to
;; regenerate it synchronously.
(unless (ekg-embedding-valid-p embedding)
(message "ekg-embedding: invalid embedding for note %s, attempting to fix" tagged)
;; Errors during regeneration are swallowed here; the check after this
;; form emits the warning if the embedding is still invalid.
(condition-case nil
(progn
(ekg-embedding-generate-for-note-sync note)
(let ((new-embedding (ekg-embedding-note-get note)))
;; Only a valid regenerated embedding replaces the old value; the note
;; is saved in that case.
(when (ekg-embedding-valid-p new-embedding)
(ekg-save-note note)
(setf embedding new-embedding))))
(error nil))
(unless (ekg-embedding-valid-p embedding)
(warn "ekg-embedding: could not fix invalid embedding for note %s, skipping" tagged)))
embedding))))
;; Average only the valid embeddings.
(let ((avg (ekg-embedding-average
(seq-filter #'ekg-embedding-valid-p embeddings))))
(if (ekg-embedding-valid-p avg)
(if ekg-vecdb-provider
;; NOTE(review): this branch looks TAG up with `ekg-get-note-with-id' to
;; build the vecdb item; confirm that a tag id resolves to a note object
;; here, since the item is keyed on `ekg-note-id' of the result.
(ekg-embedding-batch-store
(list (ekg-embedding--note-to-embed-item
(ekg-get-note-with-id tag) avg)))
;; Without a vecdb, the average is stored as a triple keyed by TAG.
(triples-set-type ekg-db tag 'embedding :embedding avg))
(message "ekg-embedding: could not compute average embedding for tag %s" tag))))
(error (message "ekg-embedding: error when trying to refresh tag %s: %S" tag err))))
(defun ekg-embedding-generate-all (&optional arg)
  "Generate and store embeddings for every entity that needs one.
It is not necessary for the entity to contain a note.  Tags will
be calculated from the average of all tagged entities.
Embeddings will not be calculated for objects with no text,
except for tags.  If called with prefix ARG, embeddings will be
generated even if embeddings already exist.  This is a fairly
slow function, and may take minutes or hours depending on how
much data there is.

Note embeddings are requested asynchronously in batches of
`ekg-embedding-batch-size'.  Progress and errors are appended to
the buffer named by `ekg-generate-all-buffer'.  Tag embeddings are
refreshed once Emacs has been idle for five minutes."
  (interactive "P")
  (ekg-embedding-connect)
  (let* ((count 0)
         (to-generate
          (seq-filter (if arg
                          #'identity
                        (lambda (id)
                          (not (ekg-embedding-valid-p
                                (ekg-embedding-get id)))))
                      (ekg-active-note-ids)))
         (total (length to-generate))
         (notes-to-generate
          (seq-filter (lambda (note)
                        (and note (> (length (ekg-note-text note)) 0)))
                      (mapcar #'ekg-get-note-with-id to-generate))))
    (cl-labels ((log-line (text)
                  ;; Callbacks and timers run later, when some other
                  ;; buffer may be current, so always select the log
                  ;; buffer explicitly and append at its end.
                  (with-current-buffer (get-buffer-create ekg-generate-all-buffer)
                    (goto-char (point-max))
                    (insert text)))
                (complete-id (num)
                  (cl-incf count num)
                  (log-line
                   (format "Generated %d/%d (%.0f%% done)\n"
                           count total
                           ;; With nothing to generate the ratio would
                           ;; be NaN; report the job as complete.
                           (if (zerop total)
                               100.0
                             (/ (* count 100.0) total))))))
      (with-current-buffer (get-buffer-create ekg-generate-all-buffer)
        (log-line (format "\nGenerating %d embeddings\n" total))
        (display-buffer (current-buffer))
        ;; Process notes in batches.
        (dolist (batch (seq-partition notes-to-generate ekg-embedding-batch-size))
          (ekg-embedding-generate-batch-async
           batch
           #'complete-id
           (lambda (error-type msg)
             (log-line (format "Could not generate embedding batch: %s %s\n"
                               error-type msg))))
          ;; Add a small delay between batches.
          (sit-for 1))
        ;; Ids with no note or no text get no embedding; count them as
        ;; done right away so the progress total can be reached.
        (complete-id
         (cl-loop for id in to-generate
                  for note = (ekg-get-note-with-id id)
                  count (or (null note)
                            (= (length (ekg-note-text note)) 0))))
        ;; At this point, async things are happening; wait for idle
        ;; before recomputing the tag averages.
        (run-with-idle-timer
         (* 60 5) nil
         (lambda ()
           (let ((tags (ekg-tags)))
             (dolist (tag tags)
               (ekg-embedding-refresh-tag-embedding tag))
             (log-line (format "Refreshed %d tags\n" (length tags))))))))))
(defun ekg-embedding-cosine-similarity (v1 v2)
  "Calculate the cosine similarity of vectors V1 and V2.
The result is a number between -1 and 1; the closer it is to 1, the
more similar the vectors are.  If either vector has zero magnitude
the similarity is undefined, and 0 is returned."
  (let ((dot-product (ekg-embedding-dot-product v1 v2))
        (v1-magnitude (ekg-embedding-magnitude v1))
        (v2-magnitude (ekg-embedding-magnitude v2)))
    ;; A magnitude is always a number and therefore always non-nil, so it
    ;; has to be tested against zero explicitly.  Dividing by a zero
    ;; magnitude would yield NaN, which cannot be ordered when ranking.
    (if (or (zerop v1-magnitude) (zerop v2-magnitude))
        0
      (/ dot-product (* v1-magnitude v2-magnitude)))))
(defun ekg-embedding-dot-product (v1 v2)
  "Calculate the dot product of vectors V1 and V2.
The sum runs over the indices of V1, so V2 must be at least as long
as V1."
  (cl-loop for i below (length v1)
           sum (* (aref v1 i) (aref v2 i))))
(defun ekg-embedding-magnitude (v)
  "Calculate the magnitude of vector V.
This is the square root of the sum of the squared elements."
  (sqrt (cl-loop for x across v
                 sum (* x x))))
(defun ekg-embedding-text-selector-initial (text)
  "Return the leading part of TEXT to use for generating embeddings.
TEXT is shortened conservatively to stay within token limits.  How it
is cut depends on `ekg-truncation-method', defined in `ekg.el': the
value `character' cuts by characters, and any other value cuts by
words."
  (with-temp-buffer
    (insert text)
    (goto-char (point-min))
    (if (eq ekg-truncation-method 'character)
        ;; Character-based limit, assuming a conservative one character
        ;; per token (for example for CJK text).  This heuristic may need
        ;; refinement or its own option later.
        (let ((char-limit (floor (/ 8191 1.0))))
          (buffer-substring-no-properties
           (point-min)
           (min (point-max) (+ (point-min) char-limit))))
      ;; Word-based limit; this also covers an unset or unrecognized
      ;; method.  The target is 8191 (the OpenAI limit is 8192) divided by
      ;; 1.5, because a single word can be several tokens.
      (let ((word-limit (floor (/ 8191 1.5))))
        (cl-loop repeat word-limit
                 until (eobp)
                 do (forward-word 1))
        (buffer-substring-no-properties (point-min) (point))))))
(defun ekg-embedding-delete (id)
  "Delete the embedding stored for the entity with ID.
With `ekg-vecdb-provider' set, the item whose id is the hash of ID is
removed from the vecdb collection.  Otherwise the `embedding' type is
removed from ID in `ekg-db'."
  (ekg-embedding-connect)
  (if (null ekg-vecdb-provider)
      (triples-remove-type ekg-db id 'embedding)
    (let ((provider (car ekg-vecdb-provider))
          (collection (cdr ekg-vecdb-provider))
          (embed-ids (list (ekg-embedding-id-to-embed-id id))))
      (vecdb-delete-items provider collection embed-ids))))
(defun ekg-embedding-get (id)
  "Return the embedding of the entity with ID, or nil if it has none.
The embedding is read from the vecdb collection when
`ekg-vecdb-provider' is set, and from `ekg-db' otherwise."
  (if (null ekg-vecdb-provider)
      (plist-get (triples-get-type ekg-db id 'embedding) :embedding)
    (let* ((provider (car ekg-vecdb-provider))
           (collection (cdr ekg-vecdb-provider))
           (item (vecdb-get-item provider collection
                                 (ekg-embedding-id-to-embed-id id))))
      (and item (vecdb-item-vector item)))))
(defun ekg-embedding-get-all-notes ()
  "Return an alist mapping active note ids to their embeddings.
Ids without an embedding are left out of the result."
  (cl-loop for id in (ekg-active-note-ids)
           for embedding = (ekg-embedding-get id)
           when embedding
           collect (cons id embedding)))
(defun ekg-embedding-n-most-similar-to-id (id n)
  "Return a list of the N ids most similar to the entity with ID.
The list is ordered from most to least similar.  Signal an error if
ID has no embedding."
  (ekg-embedding-n-most-similar-notes
   (or (ekg-embedding-get id)
       (error "Unable to find embedding of note %S" id))
   n))
(defun ekg-embedding-n-most-similar-notes (e n)
  "Return a list of the N note ids most similar to embedding E.
The list is ordered from most to least similar."
  (if ekg-vecdb-provider
      ;; Let the vector database do the search, then read the ekg ids
      ;; back out of the item payloads.
      (let* ((provider (car ekg-vecdb-provider))
             (collection (cdr ekg-vecdb-provider))
             (hits (vecdb-search-by-vector provider collection e n)))
        (mapcar (lambda (item)
                  (read (plist-get (vecdb-item-payload item) :ekg-id)))
                hits))
    ;; No vecdb provider: score every stored note embedding against E.
    ;; This is less efficient, but works for smaller datasets.
    (let* ((scored
            (mapcar (lambda (id-embedding)
                      (cons (car id-embedding)
                            (ekg-embedding-cosine-similarity
                             e (cdr id-embedding))))
                    (ekg-embedding-get-all-notes)))
           (ranked (sort scored
                         (lambda (a b) (> (cdr a) (cdr b)))))
           (keep (min n (length ranked))))
      (mapcar #'car (cl-subseq ranked 0 keep)))))
(defun ekg-embedding-show-similar ()
  "Open a notes buffer listing the notes most similar to the current note."
  (interactive nil ekg-notes-mode)
  (ekg-embedding-connect)
  (let* ((note (ekg-current-note-or-error))
         (title (format "similar to note \"%s\"" (ekg-note-snippet note)))
         (fetch-notes
          (lambda ()
            ;; Drop the first match: the current note is always the one
            ;; most similar to itself.
            (let ((ids (cdr (ekg-embedding-n-most-similar-to-id
                             (ekg-note-id note) ekg-notes-size))))
              (delq nil (mapcar #'ekg-get-note-with-id ids))))))
    (ekg-setup-notes-buffer title fetch-notes (ekg-note-tags note))))
(defun ekg-embedding-search (&optional text)
  "Open a notes buffer listing the notes most similar to TEXT.
Interactively, TEXT is read from the minibuffer."
  (interactive "MSearch: ")
  (ekg-embedding-connect)
  (let ((title (format "similar to \"%s\"" text))
        (fetch-notes
         (lambda ()
           (let* ((query (llm-embedding ekg-embedding-provider text))
                  (ids (ekg-embedding-n-most-similar-notes
                        query ekg-notes-size)))
             (delq nil (mapcar #'ekg-get-note-with-id ids))))))
    (ekg-setup-notes-buffer title fetch-notes nil)))
(defun ekg-embedding-show-similar-to-current-buffer ()
  "Open a notes buffer listing the notes most similar to the buffer text."
  (interactive)
  (ekg-embedding-connect)
  (let ((title (format "similar to buffer \"%s\""
                       (buffer-name (current-buffer))))
        (fetch-notes
         (lambda ()
           ;; NOTE(review): `buffer-string' is evaluated when this function
           ;; is called, not when the command starts.  Confirm which buffer
           ;; is current when `ekg-setup-notes-buffer' invokes it.
           (let* ((text (funcall ekg-embedding-text-selector
                                 (substring-no-properties (buffer-string))))
                  (query (llm-embedding ekg-embedding-provider text))
                  (ids (ekg-embedding-n-most-similar-notes
                        query ekg-notes-size)))
             (delq nil (mapcar #'ekg-get-note-with-id ids))))))
    (ekg-setup-notes-buffer title fetch-notes nil)))
(defun ekg-embedding-generate-on-save ()
  "Turn on embedding generation whenever a note is saved.
Notes created before this was enabled have no embeddings; run
`ekg-embedding-generate-all' to generate them."
  ;; The note's own embedding is requested before the save.
  (add-hook 'ekg-note-pre-save-hook
            #'ekg-embedding-generate-for-note-async)
  ;; Tag embeddings are averaged from embeddings that are already saved,
  ;; so that work has to be hooked in after the save.
  (add-hook 'ekg-note-save-hook
            #'ekg-embedding-generate-for-note-tags-delayed))
(defun ekg-embedding-disable-generate-on-save ()
  "Stop generating embeddings when notes are saved.
This removes the two hooks added by `ekg-embedding-generate-on-save'."
  (remove-hook 'ekg-note-pre-save-hook
               #'ekg-embedding-generate-for-note-async)
  (remove-hook 'ekg-note-save-hook
               #'ekg-embedding-generate-for-note-tags-delayed))
;; Regardless of whether embeddings are generated on save, when notes are
;; deleted we need to clean up their embeddings.  This hook is installed
;; unconditionally when the file is loaded.
(add-hook 'ekg-note-delete-hook #'ekg-embedding-delete)
;; Announce the feature so that (require 'ekg-embedding) succeeds.
(provide 'ekg-embedding)
;;; ekg-embedding.el ends here