-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path001_init.sql
More file actions
139 lines (129 loc) · 4.6 KB
/
001_init.sql
File metadata and controls
139 lines (129 loc) · 4.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
-- ============================================================
-- Greek Legislation RAG — Supabase bootstrap (run once)
-- Run in the Supabase SQL editor on a fresh project.
-- ============================================================
-- ------------------------------------------------------------
-- Extensions
-- ------------------------------------------------------------
create extension if not exists vector;
create extension if not exists unaccent;
-- public.unaccent() is STABLE, which prevents its use inside generated
-- columns or expression indexes. Wrap it as IMMUTABLE so the FTS
-- column and indexes can be built on top of it.
create or replace function f_unaccent(text)
returns text
language sql
immutable
parallel safe
strict
as $$
select public.unaccent('public.unaccent', $1)
$$;
-- ------------------------------------------------------------
-- Chunks table — content + 1536-d embedding + accent-folded FTS
-- ------------------------------------------------------------
create table if not exists documents (
id bigserial primary key,
content text not null,
metadata jsonb not null default '{}'::jsonb,
embedding vector(1536),
fts tsvector generated always as (
to_tsvector('simple', f_unaccent(content))
) stored,
created_at timestamptz not null default now()
);
-- ------------------------------------------------------------
-- Indexes
-- ------------------------------------------------------------
create index if not exists documents_embedding_idx
on documents using hnsw (embedding vector_cosine_ops);
create index if not exists documents_fts_idx
on documents using gin (fts);
create index if not exists documents_metadata_idx
on documents using gin (metadata);
create index if not exists documents_source_idx
on documents ((metadata->>'source'));
-- ------------------------------------------------------------
-- Pure semantic match (cosine)
-- ------------------------------------------------------------
create or replace function match_documents (
query_embedding vector(1536),
match_count int default 10,
filter jsonb default '{}'::jsonb
)
returns table (
id bigint,
content text,
metadata jsonb,
similarity float
)
language sql stable
as $$
select id, content, metadata,
1 - (embedding <=> query_embedding) as similarity
from documents
where metadata @> filter
order by embedding <=> query_embedding
limit match_count;
$$;
-- ------------------------------------------------------------
-- Hybrid match (RRF: semantic + full-text), accent-folded on both sides
-- so inflected Greek forms hit the full-text leg.
-- ------------------------------------------------------------
create or replace function match_documents_hybrid (
query_text text,
query_embedding vector(1536),
match_count int default 10,
full_text_weight float default 1.0,
semantic_weight float default 1.0,
rrf_k int default 50,
filter jsonb default '{}'::jsonb
)
returns table (
id bigint,
content text,
metadata jsonb,
similarity float,
rank float
)
language sql stable
as $$
with full_text as (
select id,
row_number() over (
order by ts_rank_cd(
fts,
websearch_to_tsquery('simple', f_unaccent(query_text))
) desc
) as rank_ix
from documents
where fts @@ websearch_to_tsquery('simple', f_unaccent(query_text))
and metadata @> filter
limit least(match_count * 2, 50)
),
semantic as (
select id,
row_number() over (order by embedding <=> query_embedding) as rank_ix
from documents
where metadata @> filter
limit least(match_count * 2, 50)
)
select d.id,
d.content,
d.metadata,
1 - (d.embedding <=> query_embedding) as similarity,
coalesce(1.0 / (rrf_k + ft.rank_ix), 0.0) * full_text_weight
+ coalesce(1.0 / (rrf_k + s.rank_ix), 0.0) * semantic_weight as rank
from full_text ft
full outer join semantic s using (id)
join documents d on d.id = coalesce(ft.id, s.id)
order by rank desc
limit match_count;
$$;
-- ------------------------------------------------------------
-- Role config
-- Required on Supabase free tier: the default 8s per-statement
-- timeout cancels bulk inserts of OCR'd PDFs (180+ chunks of
-- vector(1536)) while the HNSW index updates.
-- ------------------------------------------------------------
alter role service_role set statement_timeout = '60s';