Skip to content

Commit 6b7f730

Browse files
hyperpolymath and claude committed
fix(document): reload stored documents from Tantivy on persistent restart
TantivyDocumentStore::persistent() initialized the in-memory document HashMap as empty, even though the Tantivy index on disk contained all previously indexed documents. This caused has_document to report false after restart despite the data being searchable.

Now scans all segments on open, reading stored fields (id, title, body) into the HashMap. Same write-through + in-memory cache pattern used by all other persistent modality stores.

Verified: all 6 octads now show has_document=true, has_semantic=true, has_provenance=true, version_count=1 after stop/restart cycle.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent f844f1c commit 6b7f730

1 file changed

Lines changed: 44 additions & 2 deletions

File tree

  • rust-core/verisim-document/src

rust-core/verisim-document/src/lib.rs

Lines changed: 44 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -176,7 +176,11 @@ impl TantivyDocumentStore {
176176
})
177177
}
178178

179-
/// Create a persistent store
179+
/// Create a persistent store.
180+
///
181+
/// On open, all existing documents are scanned from the Tantivy index into
182+
/// the in-memory HashMap so that `get()` returns documents that were indexed
183+
/// in previous sessions.
180184
pub fn persistent(path: impl AsRef<Path>) -> Result<Self, DocumentError> {
181185
let schema = DocumentSchema::new();
182186
std::fs::create_dir_all(path.as_ref())?;
@@ -188,12 +192,50 @@ impl TantivyDocumentStore {
188192
.reload_policy(ReloadPolicy::OnCommitWithDelay)
189193
.try_into()?;
190194

195+
// Reload all stored documents from the Tantivy index into the in-memory
196+
// cache so that get() works after restart.
197+
let mut documents = HashMap::new();
198+
let searcher = reader.searcher();
199+
for segment_reader in searcher.segment_readers() {
200+
let store_reader = segment_reader.get_store_reader(1)
201+
.map_err(|e| DocumentError::IndexError(format!("store reader: {e}")))?;
202+
for doc_id in 0..segment_reader.max_doc() {
203+
if segment_reader.is_deleted(doc_id) {
204+
continue;
205+
}
206+
if let Ok(tantivy_doc) = store_reader.get::<TantivyDocument>(doc_id) {
207+
let extract = |field: Field| -> String {
208+
tantivy_doc
209+
.get_first(field)
210+
.and_then(|v| v.as_str())
211+
.unwrap_or_default()
212+
.to_string()
213+
};
214+
let id_val = extract(schema.id);
215+
let title_val = extract(schema.title);
216+
let body_val = extract(schema.body);
217+
218+
if !id_val.is_empty() {
219+
documents.insert(
220+
id_val.clone(),
221+
Document::new(id_val, title_val, body_val),
222+
);
223+
}
224+
}
225+
}
226+
}
227+
228+
tracing::info!(
229+
count = documents.len(),
230+
"Loaded document store from Tantivy index"
231+
);
232+
191233
Ok(Self {
192234
schema,
193235
index,
194236
writer: Arc::new(RwLock::new(writer)),
195237
reader,
196-
documents: Arc::new(RwLock::new(HashMap::new())),
238+
documents: Arc::new(RwLock::new(documents)),
197239
})
198240
}
199241
}

0 commit comments

Comments
 (0)