diff --git a/Cargo.lock b/Cargo.lock index bc4e97b81c..8ebae849ad 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1492,6 +1492,19 @@ dependencies = [ "syn", ] +[[package]] +name = "dashmap" +version = "5.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "978747c1d849a7d2ee5e8adc0159961c48fb7e5db2f06af6723b80123bb53856" +dependencies = [ + "cfg-if", + "hashbrown 0.14.5", + "lock_api", + "once_cell", + "parking_lot_core", +] + [[package]] name = "deflate64" version = "0.1.11" @@ -1788,6 +1801,12 @@ version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a" +[[package]] +name = "faster-hex" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2a2b11eda1d40935b26cf18f6833c526845ae8c41e58d09af6adeb6f0269183" + [[package]] name = "fastrand" version = "2.3.0" @@ -2053,6 +2072,102 @@ dependencies = [ "weezl", ] +[[package]] +name = "gix-actor" +version = "0.31.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a0e454357e34b833cc3a00b6efbbd3dd4d18b24b9fb0c023876ec2645e8aa3f2" +dependencies = [ + "bstr", + "gix-date", + "gix-utils", + "itoa", + "thiserror 1.0.69", + "winnow 0.6.26", +] + +[[package]] +name = "gix-date" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9eed6931f21491ee0aeb922751bd7ec97b4b2fe8fbfedcb678e2a2dce5f3b8c0" +dependencies = [ + "bstr", + "itoa", + "thiserror 1.0.69", + "time", +] + +[[package]] +name = "gix-features" +version = "0.38.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac7045ac9fe5f9c727f38799d002a7ed3583cd777e3322a7c4b43e3cf437dc69" +dependencies = [ + "flate2", + "gix-hash", + "gix-trace", + "libc", + "prodash", + "sha1_smol", + "thiserror 1.0.69", +] + +[[package]] +name = "gix-hash" +version = "0.14.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "f93d7df7366121b5018f947a04d37f034717e113dcf9ccd85c34b58e57a74d5e" +dependencies = [ + "faster-hex", + "thiserror 1.0.69", +] + +[[package]] +name = "gix-object" +version = "0.42.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25da2f46b4e7c2fa7b413ce4dffb87f69eaf89c2057e386491f4c55cadbfe386" +dependencies = [ + "bstr", + "gix-actor", + "gix-date", + "gix-features", + "gix-hash", + "gix-utils", + "gix-validate", + "itoa", + "smallvec", + "thiserror 1.0.69", + "winnow 0.6.26", +] + +[[package]] +name = "gix-trace" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44dc45eae785c0eb14173e0f152e6e224dcf4d45b6a6999a3aed22af541ad678" + +[[package]] +name = "gix-utils" +version = "0.1.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff08f24e03ac8916c478c8419d7d3c33393da9bb41fa4c24455d5406aeefd35f" +dependencies = [ + "fastrand", + "unicode-normalization", +] + +[[package]] +name = "gix-validate" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "82c27dd34a49b1addf193c92070bcbf3beaf6e10f16a78544de6372e146a0acf" +dependencies = [ + "bstr", + "thiserror 1.0.69", +] + [[package]] name = "globset" version = "0.4.18" @@ -3230,6 +3345,15 @@ dependencies = [ "libm", ] +[[package]] +name = "num_threads" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c7398b9c8b70908f6371f47ed36737907c87c52af34c268fed0bf0ceb92ead9" +dependencies = [ + "libc", +] + [[package]] name = "once_cell" version = "1.21.3" @@ -3548,6 +3672,12 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "prodash" +version = "28.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "744a264d26b88a6a7e37cbad97953fa233b94d585236310bcbc88474b4092d79" + [[package]] name = "profiling" version = "1.0.18" @@ 
-3748,7 +3878,14 @@ dependencies = [ "bytes", "chrono", "criterion", + "dashmap", + "flate2", "futures", + "gix-actor", + "gix-date", + "gix-features", + "gix-hash", + "gix-object", "grep", "grep-matcher", "grep-regex", @@ -3769,6 +3906,7 @@ dependencies = [ "tempfile", "thiserror 1.0.69", "tokio", + "toml", "tracing", "uuid", ] @@ -3790,11 +3928,15 @@ dependencies = [ name = "ragfs-python" version = "0.1.0" dependencies = [ + "bytes", + "gix-hash", "pyo3", "ragfs", "ragfs-cache-redis", + "serde", "serde_json", "tokio", + "toml", ] [[package]] @@ -4408,6 +4550,15 @@ dependencies = [ "zmij", ] +[[package]] +name = "serde_spanned" +version = "0.6.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf41e0cfaf7226dca15e8197172c295a782857fcb97fad1808a166870dee75a3" +dependencies = [ + "serde", +] + [[package]] name = "serde_urlencoded" version = "0.7.1" @@ -4984,7 +5135,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "743bd48c283afc0388f9b8827b976905fb217ad9e647fae3a379a9283c4def2c" dependencies = [ "deranged", + "itoa", + "libc", "num-conv", + "num_threads", "powerfmt", "serde_core", "time-core", @@ -5114,6 +5268,47 @@ dependencies = [ "tokio", ] +[[package]] +name = "toml" +version = "0.8.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc1beb996b9d83529a9e75c17a1686767d148d70663143c7854d8b4a09ced362" +dependencies = [ + "serde", + "serde_spanned", + "toml_datetime", + "toml_edit", +] + +[[package]] +name = "toml_datetime" +version = "0.6.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22cddaf88f4fbc13c51aebbf5f8eceb5c7c5a9da2ac40a13519eb5b0a0e8f11c" +dependencies = [ + "serde", +] + +[[package]] +name = "toml_edit" +version = "0.22.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41fe8c660ae4257887cf66394862d21dbca4a6ddd26f04a3560410406a2f819a" +dependencies = [ + "indexmap", + "serde", + 
"serde_spanned", + "toml_datetime", + "toml_write", + "winnow 0.7.15", +] + +[[package]] +name = "toml_write" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d99f8c9a7727884afe522e9bd5edbfc91a3312b36a77b5fb8926e4c31a41801" + [[package]] name = "tower" version = "0.5.3" @@ -5916,6 +6111,24 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650" +[[package]] +name = "winnow" +version = "0.6.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e90edd2ac1aa278a5c4599b1d89cf03074b610800f866d4026dc199d7929a28" +dependencies = [ + "memchr", +] + +[[package]] +name = "winnow" +version = "0.7.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df79d97927682d2fd8adb29682d1140b343be4ac0f08fd68b7765d9c059d3945" +dependencies = [ + "memchr", +] + [[package]] name = "wit-bindgen" version = "0.51.0" diff --git a/crates/ov_cli/src/client.rs b/crates/ov_cli/src/client.rs index 63b87e47f5..90e7d99ca1 100644 --- a/crates/ov_cli/src/client.rs +++ b/crates/ov_cli/src/client.rs @@ -33,6 +33,38 @@ fn compact_request_body(body: &mut Value) { }); } +#[derive(serde::Serialize)] +pub struct SnapshotCommitReq { + pub message: String, + #[serde(skip_serializing_if = "Option::is_none")] + pub paths: Option>, + pub branch: String, + #[serde(skip_serializing_if = "Option::is_none")] + pub author_name: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub author_email: Option, +} + +#[derive(serde::Serialize)] +pub struct SnapshotRestoreReq { + #[serde(skip_serializing_if = "Option::is_none")] + pub project_dir: Option, + pub source_commit: String, + pub branch: String, + pub dry_run: bool, + #[serde(skip_serializing_if = "Option::is_none")] + pub message: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub author_name: Option, + 
#[serde(skip_serializing_if = "Option::is_none")] + pub author_email: Option, +} + +pub enum SnapshotShowResult { + Metadata(Value), + Blob { oid: String, size: u64, bytes: Vec }, +} + // ============ HttpClient ============ /// High-level HTTP client for OpenViking API @@ -1448,6 +1480,108 @@ impl HttpClient { self.post_with_query("/api/v1/watches/trigger", &empty, ¶ms) .await } + + // ============= Snapshot ============= + + pub async fn snapshot_commit(&self, req: &SnapshotCommitReq) -> Result { + self.post("/api/v1/snapshot/commit", req).await + } + + pub async fn snapshot_restore(&self, req: &SnapshotRestoreReq) -> Result { + self.post("/api/v1/snapshot/restore", req).await + } + + pub async fn snapshot_log(&self, branch: &str, limit: u32) -> Result { + let params = vec![ + ("branch".to_string(), branch.to_string()), + ("limit".to_string(), limit.to_string()), + ]; + self.get("/api/v1/snapshot/log", ¶ms).await + } + + pub async fn snapshot_show( + &self, + target_ref: &str, + path: Option<&str>, + ) -> Result { + let url = format!("{}/api/v1/snapshot/show", self.base.base_url); + let mut query: Vec<(String, String)> = vec![("target_ref".to_string(), target_ref.to_string())]; + if let Some(p) = path { + query.push(("path".to_string(), p.to_string())); + } + + let response = self + .base + .http + .get(&url) + .headers(self.base.build_headers()) + .query(&query) + .send() + .await + .map_err(|e| Error::Network(format!("HTTP request failed: {}", e)))?; + + let status = response.status(); + let content_type = response + .headers() + .get(reqwest::header::CONTENT_TYPE) + .and_then(|v| v.to_str().ok()) + .unwrap_or("") + .to_string(); + + if path.is_some() && status.is_success() && content_type.starts_with("application/octet-stream") { + let oid = response + .headers() + .get("x-snapshot-oid") + .and_then(|v| v.to_str().ok()) + .unwrap_or("") + .to_string(); + let size: u64 = response + .headers() + .get("x-snapshot-size") + .and_then(|v| v.to_str().ok()) + 
.and_then(|s| s.parse().ok()) + .unwrap_or(0); + let bytes = response + .bytes() + .await + .map_err(|e| Error::Network(format!("Failed to read blob bytes: {}", e)))? + .to_vec(); + return Ok(SnapshotShowResult::Blob { oid, size, bytes }); + } + + let bytes = response + .bytes() + .await + .map_err(|e| Error::Network(format!("Failed to read response body: {}", e)))?; + let json: Value = match serde_json::from_slice(&bytes) { + Ok(v) => v, + Err(e) => { + let body_str = String::from_utf8_lossy(&bytes); + return Err(Error::Network(format!( + "Failed to parse JSON response: {}\n\nRaw response body:\n{}", + e, body_str + ))); + } + }; + + if !status.is_success() { + return Err(Error::api_with_status( + crate::base_client::api_error_from_envelope(&json, status), + status.as_u16(), + )); + } + if let Some(error) = json.get("error") { + if !error.is_null() { + return Err(Error::api_with_status( + crate::base_client::api_error_from_envelope(&json, status), + status.as_u16(), + )); + } + } + + let result = json.get("result").cloned().unwrap_or(Value::Null); + Ok(SnapshotShowResult::Metadata(result)) + } } #[cfg(test)] diff --git a/crates/ov_cli/src/commands/mod.rs b/crates/ov_cli/src/commands/mod.rs index 3673431b6c..fa7866f93b 100644 --- a/crates/ov_cli/src/commands/mod.rs +++ b/crates/ov_cli/src/commands/mod.rs @@ -13,5 +13,6 @@ pub mod search; pub mod session; pub mod skills; pub mod system; +pub mod snapshot; pub mod task; pub mod watch; diff --git a/crates/ov_cli/src/commands/snapshot.rs b/crates/ov_cli/src/commands/snapshot.rs new file mode 100644 index 0000000000..15297032b9 --- /dev/null +++ b/crates/ov_cli/src/commands/snapshot.rs @@ -0,0 +1,201 @@ +use std::io::Write; +use std::path::PathBuf; + +use serde_json::{json, Value}; + +use crate::client::{HttpClient, SnapshotCommitReq, SnapshotRestoreReq, SnapshotShowResult}; +use crate::error::Result; +use crate::output::{OutputFormat, output_success}; +use crate::SnapshotCmd; + +pub async fn dispatch( + client: 
&HttpClient, + cmd: SnapshotCmd, + output_format: OutputFormat, + compact: bool, +) -> Result<()> { + match cmd { + SnapshotCmd::Commit { + message, + paths, + branch, + author_name, + author_email, + } => { + let req = SnapshotCommitReq { + message, + paths, + branch, + author_name, + author_email, + }; + let value = client.snapshot_commit(&req).await?; + print_commit(&value, output_format, compact); + Ok(()) + } + SnapshotCmd::Restore { + project_dir, + source_commit, + branch, + dry_run, + message, + author_name, + author_email, + } => { + let req = SnapshotRestoreReq { + project_dir, + source_commit, + branch, + dry_run, + message, + author_name, + author_email, + }; + let value = client.snapshot_restore(&req).await?; + print_restore(&value, output_format, compact); + Ok(()) + } + SnapshotCmd::Show { + target_ref, + path, + out_path, + } => { + let result = client.snapshot_show(&target_ref, path.as_deref()).await?; + handle_show(result, out_path, output_format, compact) + } + SnapshotCmd::Log { branch, limit } => { + let value = client.snapshot_log(&branch, limit).await?; + print_log(&value, output_format, compact); + Ok(()) + } + } +} + +fn print_commit(value: &Value, output_format: OutputFormat, compact: bool) { + if matches!(output_format, OutputFormat::Json) { + output_success(value, output_format, compact); + return; + } + // The server returns the inner result dict (BaseClient unwraps the envelope). + // It is already a flat object, so hand it to the shared table renderer. + output_success(value, OutputFormat::Table, compact); +} + +fn print_restore(value: &Value, output_format: OutputFormat, compact: bool) { + if matches!(output_format, OutputFormat::Json) { + output_success(value, output_format, compact); + return; + } + // Dry-run shape nests counts under {diff: {to_write, to_delete, unchanged}}, + // which the generic renderer cannot flatten. Reshape it into a flat dict; + // the applied/noop shapes are already flat and render directly. 
+ let display = if let Some(diff) = value.get("diff") { + let count = |key: &str| { + diff.get(key) + .and_then(|v| v.as_array()) + .map(|a| a.len()) + .unwrap_or(0) + }; + json!({ + "result": "dry-run", + "to_write": count("to_write"), + "to_delete": count("to_delete"), + "unchanged": count("unchanged"), + }) + } else { + value.clone() + }; + output_success(&display, OutputFormat::Table, compact); +} + +fn handle_show( + result: SnapshotShowResult, + out_path: Option, + output_format: OutputFormat, + compact: bool, +) -> Result<()> { + match result { + SnapshotShowResult::Metadata(meta) => { + if matches!(output_format, OutputFormat::Json) { + output_success(&meta, output_format, compact); + return Ok(()); + } + // Flatten the `parents` array into a scalar so the whole object + // renders as a single key/value table instead of the renderer + // collapsing onto just the list field. + let mut display = serde_json::Map::new(); + for key in ["oid", "tree", "author", "committer"] { + if let Some(v) = meta.get(key) { + display.insert(key.to_string(), v.clone()); + } + } + if let Some(parents) = meta.get("parents").and_then(|v| v.as_array()) { + let names: Vec = parents + .iter() + .filter_map(|v| v.as_str().map(String::from)) + .collect(); + display.insert("parents".to_string(), Value::String(names.join(", "))); + } + if let Some(msg) = meta.get("message").and_then(|v| v.as_str()) { + display.insert("message".to_string(), Value::String(msg.to_string())); + } + output_success(&Value::Object(display), OutputFormat::Table, compact); + Ok(()) + } + SnapshotShowResult::Blob { oid, bytes, size } => { + if matches!(output_format, OutputFormat::Json) { + let envelope = serde_json::json!({"oid": oid, "size": size}); + output_success(&envelope, output_format, compact); + if let Some(path) = out_path { + let mut f = std::fs::File::create(&path)?; + f.write_all(&bytes)?; + } + return Ok(()); + } + match out_path { + Some(path) => { + let mut f = std::fs::File::create(&path)?; + 
f.write_all(&bytes)?; + eprintln!("Wrote {} bytes from {} to {}", size, &oid[..12.min(oid.len())], path.display()); + } + None => { + let mut out = std::io::stdout().lock(); + out.write_all(&bytes)?; + eprintln!("Read {} bytes from {}", size, &oid[..12.min(oid.len())]); + } + } + Ok(()) + } + } +} + +fn print_log(value: &Value, output_format: OutputFormat, compact: bool) { + if matches!(output_format, OutputFormat::Json) { + output_success(value, output_format, compact); + return; + } + // value is the unwrapped "result" — a JSON array of commit entries with + // nested authors and multi-line messages. Flatten each entry into a row so + // the shared renderer produces an aligned table like the other commands. + let entries = value.as_array().cloned().unwrap_or_default(); + let rows: Vec = entries + .iter() + .map(|entry| { + let oid = entry.get("oid").and_then(|v| v.as_str()).unwrap_or(""); + let short = oid.get(..12).unwrap_or(oid); + let msg_full = entry.get("message").and_then(|v| v.as_str()).unwrap_or(""); + let subject = msg_full.lines().next().unwrap_or(""); + let author = entry + .get("author") + .and_then(|a| a.get("name").or_else(|| a.as_str().map(|_| a))) + .and_then(|v| v.as_str()) + .unwrap_or(""); + json!({ + "oid": short, + "author": author, + "subject": subject, + }) + }) + .collect(); + output_success(&rows, OutputFormat::Table, compact); +} diff --git a/crates/ov_cli/src/help_ui.rs b/crates/ov_cli/src/help_ui.rs index 96a5fc9553..48eb8666d9 100644 --- a/crates/ov_cli/src/help_ui.rs +++ b/crates/ov_cli/src/help_ui.rs @@ -81,7 +81,7 @@ const CONFIG_STATUS: &[HelpCommand] = help_commands![ ]; const IMPORT_EXPORT_SESSIONS: &[HelpCommand] = help_commands![ - "import", "export", "backup", "restore", "session", "privacy" + "import", "export", "backup", "restore", "snapshot", "session", "privacy" ]; const INTERACTIVE_ADMIN: &[HelpCommand] = help_commands![ @@ -473,6 +473,106 @@ const COMMAND_HELP_SPECS: &[CommandHelpSpec] = &[ description: "Show exact 
arguments for a session operation.", }], }, + CommandHelpSpec { + path: &["snapshot"], + purpose: "Manage workspace snapshots: commit, restore, show, and walk history.", + examples: &[ + HelpItem { + label: "ov snapshot commit -m \"checkpoint before refactor\"", + description: "Commit the current workspace state.", + }, + HelpItem { + label: "ov snapshot log --branch main", + description: "Walk commit history, newest first.", + }, + HelpItem { + label: "ov snapshot restore viking://projects/acme --dry-run", + description: "Preview restoring a directory to a past snapshot.", + }, + ], + next_steps: &[HelpItem { + label: "ov snapshot --help", + description: "Show exact arguments for a snapshot operation.", + }], + }, + CommandHelpSpec { + path: &["snapshot", "commit"], + purpose: "Commit the current workspace state as a new snapshot.", + examples: &[ + HelpItem { + label: "ov snapshot commit -m \"checkpoint before refactor\"", + description: "Commit the full workspace on the main branch.", + }, + HelpItem { + label: "ov snapshot commit -m \"docs only\" --paths viking://docs", + description: "Commit only the given viking:// URIs. 
Directories are expanded recursively (with snapshot pruning rules applied).", + }, + ], + next_steps: &[ + HelpItem { + label: "ov snapshot log --branch main", + description: "Confirm the new commit in history.", + }, + HelpItem { + label: "ov snapshot show ", + description: "Inspect the commit metadata.", + }, + ], + }, + CommandHelpSpec { + path: &["snapshot", "restore"], + purpose: "Restore a project directory to a past snapshot via a forward commit.", + examples: &[ + HelpItem { + label: "ov snapshot restore viking://projects/acme --dry-run", + description: "Preview which files would change.", + }, + HelpItem { + label: "ov snapshot restore viking://projects/acme -m \"rollback\"", + description: "Apply the restore as a new commit.", + }, + ], + next_steps: &[HelpItem { + label: "ov snapshot log --branch main", + description: "Verify the restore commit landed.", + }], + }, + CommandHelpSpec { + path: &["snapshot", "show"], + purpose: "Show a commit's metadata, or a single blob at a path.", + examples: &[ + HelpItem { + label: "ov snapshot show ", + description: "Print commit metadata.", + }, + HelpItem { + label: "ov snapshot show --path viking://docs/spec.md --out-file spec.md", + description: "Write a file blob from the snapshot to disk.", + }, + ], + next_steps: &[HelpItem { + label: "ov snapshot log --branch main", + description: "Find another commit to inspect.", + }], + }, + CommandHelpSpec { + path: &["snapshot", "log"], + purpose: "Walk commit history for a branch, newest first.", + examples: &[ + HelpItem { + label: "ov snapshot log --branch main", + description: "Show the latest commits on main.", + }, + HelpItem { + label: "ov snapshot log --branch main --limit 50", + description: "Show more history entries.", + }, + ], + next_steps: &[HelpItem { + label: "ov snapshot show ", + description: "Inspect a commit from the log.", + }], + }, CommandHelpSpec { path: &["add-memory"], purpose: "Add a memory directly from text or JSON messages.", @@ -1651,6 
+1751,11 @@ fn localized_command_purpose(spec: &CommandHelpSpec, language: Language) -> &str ["health"] => "快速检查服务器是否可连接。", ["status"] => "查看 OpenViking 服务器诊断状态。", ["language"] => "选择 OpenViking CLI 显示语言。", + ["snapshot"] => "管理工作区快照:提交、恢复、查看,以及遍历历史。", + ["snapshot", "commit"] => "将当前工作区状态提交为新的快照。", + ["snapshot", "restore"] => "通过一次前向提交,将项目目录恢复到历史快照。", + ["snapshot", "show"] => "查看某次提交的元数据,或指定路径下的单个文件内容。", + ["snapshot", "log"] => "按分支遍历提交历史,最新的在前。", _ => spec.purpose, } } @@ -2113,6 +2218,7 @@ fn localized_command_description<'a>( "export" => "导出为 .ovpack", "backup" => "创建仅恢复备份", "restore" => "恢复备份", + "snapshot" => "管理工作区快照", "tui" => "打开交互式浏览器", "chat" => "与 VikingBot 对话", "admin" => "管理账户、用户和 API Key", @@ -2329,7 +2435,7 @@ fn version() -> String { fn is_bare_group_help_command(command: &str) -> bool { matches!( command, - "task" | "skills" | "session" | "privacy" | "admin" | "system" | "observer" + "task" | "skills" | "session" | "snapshot" | "privacy" | "admin" | "system" | "observer" ) } diff --git a/crates/ov_cli/src/main.rs b/crates/ov_cli/src/main.rs index 0a453c384f..164857cc67 100644 --- a/crates/ov_cli/src/main.rs +++ b/crates/ov_cli/src/main.rs @@ -899,6 +899,11 @@ enum Commands { #[command(subcommand)] action: TaskCommands, }, + /// [Version] Manage workspace snapshots (commit, restore, show, log) + Snapshot { + #[command(subcommand)] + cmd: SnapshotCmd, + }, /// [Status] All OpenViking Server components status Status { /// Show full component tables @@ -1018,6 +1023,58 @@ enum TaskCommands { }, } +#[derive(Subcommand)] +pub(crate) enum SnapshotCmd { + /// Create a snapshot of current workspace state + Commit { + #[arg(short = 'm', long)] + message: String, + /// Limit to specific viking:// URIs (comma-separated); accepts files and directories. Directories are expanded recursively with the snapshot pruning rules. Omit to snapshot the full account tree. 
+ #[arg(long, value_delimiter = ',')] + paths: Option>, + #[arg(long, default_value = "main")] + branch: String, + #[arg(long)] + author_name: Option, + #[arg(long)] + author_email: Option, + }, + /// Restore a project directory or the full account tree to a past snapshot (forward commit) + Restore { + /// Commit oid, branch, or tag + source_commit: String, + /// Optional viking:// directory or relative tree path; omit for full-tree restore + project_dir: Option, + #[arg(long, default_value = "main")] + branch: String, + #[arg(long)] + dry_run: bool, + #[arg(short = 'm', long)] + message: Option, + #[arg(long)] + author_name: Option, + #[arg(long)] + author_email: Option, + }, + /// Show a commit's metadata, or a single blob at a path + Show { + target_ref: String, + /// viking:// URI of a file; omit to show commit metadata + #[arg(long)] + path: Option, + /// Write blob bytes to this file (default: stdout) + #[arg(long = "out-file")] + out_path: Option, + }, + /// Walk commit history (newest first) + Log { + #[arg(long, default_value = "main")] + branch: String, + #[arg(long, default_value_t = 20)] + limit: u32, + }, +} + #[derive(Subcommand)] enum SystemCommands { /// Wait for queued async processing to complete @@ -2769,6 +2826,10 @@ async fn main() { } } }, + Commands::Snapshot { cmd } => { + let client = ctx.get_client(); + commands::snapshot::dispatch(&client, cmd, ctx.output_format, ctx.compact).await + } Commands::Status { verbose } => { let client = ctx.get_client(); commands::system::diagnostic_status( diff --git a/crates/ragfs-python/Cargo.toml b/crates/ragfs-python/Cargo.toml index acfe14d236..65b0c801a2 100644 --- a/crates/ragfs-python/Cargo.toml +++ b/crates/ragfs-python/Cargo.toml @@ -8,7 +8,9 @@ publish = false [lib] name = "ragfs_python" -crate-type = ["cdylib"] +# rlib is needed so `cargo test` can build a test binary that links against +# this crate's symbols. cdylib is what maturin packages as the Python .so. 
+crate-type = ["cdylib", "rlib"] [features] default = ["extension-module", "s3", "cache-redis"] @@ -22,6 +24,10 @@ ragfs-cache-redis = { path = "../ragfs-cache-redis", optional = true } pyo3 = { version = "0.27", features = ["abi3", "abi3-py310"] } tokio = { version = "1", features = ["full"] } serde_json = "1.0" +serde = { version = "1", features = ["derive"] } +toml = "0.8" +gix-hash = "0.14" +bytes = "1.5" [lints.rust] unexpected_cfgs = { level = "warn", check-cfg = ['cfg(feature, values("yuanrong-native", "mooncake-native"))'] } diff --git a/crates/ragfs-python/src/git.rs b/crates/ragfs-python/src/git.rs new file mode 100644 index 0000000000..5c1ae32bd5 --- /dev/null +++ b/crates/ragfs-python/src/git.rs @@ -0,0 +1,872 @@ +//! PyO3 binding helpers for the Git version-control service. +//! +//! This module owns: +//! - Backend construction (`build_git_service`) from a `GitConfig` +//! - Request parsers: `parse_commit_request`, `parse_restore_request`, `parse_show_request` +//! (added in later tasks) +//! - Response converters: `commit_response_to_pydict`, `restore_response_to_pydict`, +//! `show_response_to_pydict` (added in later tasks) +//! - Error mapping `map_git_error` (added in later tasks) +//! +//! The free functions are invoked from thin `#[pymethods]` wrappers in `lib.rs`. + +use std::sync::Arc; + +use pyo3::exceptions::{PyRuntimeError, PyValueError}; +use pyo3::prelude::*; + +use ragfs::core::FileSystem; +use ragfs::git::{ + GitConfig, GitService, IndexStore, LocalIndexStore, LocalObjectStore, LocalRefStore, + ObjectStore, RefStore, +}; + +#[cfg(feature = "s3")] +use ragfs::git::{CasMode, S3Config, S3IndexStore, S3ObjectStore, S3RefStore}; + +/// Build a `GitService` from a `GitConfig` and the binding's MountableFS. +/// +/// Returns `Ok(None)` when `enabled = false`; `Err(PyErr)` if the config is +/// invalid (missing required section, unknown backend, etc.). +/// +/// Backend-specific notes: +/// - `local`: requires `[git.local]` with `base_dir`. 
Builds `LocalObjectStore` +/// and `LocalRefStore`, both rooted at `base_dir`. +/// - `s3` (feature-gated): requires `[git.s3]` with `bucket`, `region`. +/// `access_key` and `secret_key` are read directly from the config; when +/// omitted, the AWS SDK default credentials chain is used. +pub fn build_git_service( + cfg: &GitConfig, + vfs: Arc, +) -> PyResult>> { + if !cfg.enabled { + return Ok(None); + } + + let (object_store, ref_store, index_store): ( + Arc, + Arc, + Option>, + ) = match cfg.backend.as_str() { + "local" => { + let lc = cfg + .local + .as_ref() + .ok_or_else(|| PyValueError::new_err("[git.local] missing"))?; + let os = Arc::new(LocalObjectStore::new(lc.base_dir.clone())); + let rs = Arc::new(LocalRefStore::new(lc.base_dir.clone())); + let is: Option> = if cfg.tuning.commit_index_enabled { + Some(Arc::new(LocalIndexStore::new(lc.base_dir.clone()))) + } else { + None + }; + (os, rs, is) + } + #[cfg(feature = "s3")] + "s3" => build_s3_service(cfg)?, + #[cfg(not(feature = "s3"))] + "s3" => { + return Err(PyRuntimeError::new_err( + "git backend 's3' requested but ragfs-python built without `s3` feature", + )); + } + other => { + return Err(PyValueError::new_err(format!( + "unsupported git backend: {}", + other + ))); + } + }; + + Ok(Some(Arc::new( + GitService::with_index(vfs, object_store, ref_store, index_store) + .with_blob_exists_precheck(cfg.tuning.blob_exists_precheck_enabled), + ))) +} + +#[cfg(feature = "s3")] +fn build_s3_service( + cfg: &GitConfig, +) -> PyResult<( + Arc, + Arc, + Option>, +)> { + let sc = cfg + .s3 + .as_ref() + .ok_or_else(|| PyValueError::new_err("[git.s3] missing"))?; + + let access_key_id = match sc.access_key.as_deref() { + Some(v) if !v.is_empty() => Some(v.to_string()), + _ => None, + }; + let secret_access_key = match sc.secret_key.as_deref() { + Some(v) if !v.is_empty() => Some(v.to_string()), + _ => None, + }; + + let cas_mode = match sc.cas_mode.as_str() { + "native" => CasMode::Native, + "redis_lock" => 
CasMode::RedisLock, + other => { + return Err(PyValueError::new_err(format!( + "unsupported cas_mode: {}", + other + ))); + } + }; + + let s3_config = S3Config { + bucket: sc.bucket.clone(), + prefix: sc.prefix.clone(), + region: sc.region.clone(), + endpoint: if sc.endpoint.is_empty() { + None + } else { + Some(sc.endpoint.clone()) + }, + access_key_id, + secret_access_key, + use_path_style: sc.use_path_style, + cas_mode, + }; + + let rt = tokio::runtime::Handle::try_current() + .map_err(|_| PyRuntimeError::new_err("build_s3_service must run inside a tokio runtime"))?; + let os_cfg = s3_config.clone(); + let object_store = Arc::new( + rt.block_on(async move { S3ObjectStore::from_config(os_cfg).await }) + .map_err(|e| PyRuntimeError::new_err(format!("S3ObjectStore: {}", e)))?, + ) as Arc; + + let rs_cfg = s3_config.clone(); + let ref_store = Arc::new( + rt.block_on(async move { S3RefStore::from_config(rs_cfg).await }) + .map_err(|e| PyRuntimeError::new_err(format!("S3RefStore: {}", e)))?, + ) as Arc; + + let index_store: Option> = if cfg.tuning.commit_index_enabled { + let is_cfg = s3_config; + Some(Arc::new( + rt.block_on(async move { S3IndexStore::from_config(is_cfg).await }) + .map_err(|e| PyRuntimeError::new_err(format!("S3IndexStore: {}", e)))?, + ) as Arc) + } else { + None + }; + + Ok((object_store, ref_store, index_store)) +} + +/// Map a `GitError` to the appropriate Python exception. +/// +/// Loads exception classes from the `openviking.pyagfs` module. When the +/// module is not importable (e.g. during unit tests), falls back to +/// `PyRuntimeError` with the same message. +pub fn map_git_error(py: Python<'_>, e: ragfs::git::GitError) -> PyErr { + use ragfs::git::{GitError, ObjectStoreError, RefStoreError}; + let msg = e.to_string(); + match e { + GitError::FeatureDisabled => new_py_err_pub(py, "AGFSNotSupportedError", msg), + GitError::ConcurrentCommit { .. 
} => new_py_err_pub(py, "GitConcurrentCommitError", msg), + GitError::PathNotFound(_) => new_py_err_pub(py, "AGFSNotFoundError", msg), + GitError::PathIsDirectory(_) => new_py_err_pub(py, "AGFSInvalidOperationError", msg), + GitError::SubtreeNotFoundInCommit { .. } => new_py_err_pub(py, "AGFSNotFoundError", msg), + GitError::InvalidAccountId(_) => new_py_err_pub(py, "AGFSInvalidPathError", msg), + GitError::InvalidProjectDir(_) => new_py_err_pub(py, "AGFSInvalidPathError", msg), + GitError::InvalidPath(_) => new_py_err_pub(py, "AGFSInvalidPathError", msg), + GitError::BlobTooLarge { .. } => new_py_err_pub(py, "AGFSInvalidOperationError", msg), + GitError::TooManyFiles { .. } => new_py_err_pub(py, "AGFSInvalidOperationError", msg), + GitError::CorruptedObject(_) => new_py_err_pub(py, "AGFSInternalError", msg), + GitError::RefStore(RefStoreError::NotFound(_)) => { + new_py_err_pub(py, "AGFSNotFoundError", msg) + } + GitError::RefStore(RefStoreError::Conflict { .. }) => { + new_py_err_pub(py, "GitConcurrentCommitError", msg) + } + GitError::OidPrefixNotFound { .. } => new_py_err_pub(py, "AGFSNotFoundError", msg), + GitError::AmbiguousOid { .. } => new_py_err_pub(py, "AGFSInvalidOperationError", msg), + GitError::ObjectStore(ObjectStoreError::NotFound(_)) => { + new_py_err_pub(py, "AGFSNotFoundError", msg) + } + GitError::RestoreWritebackPartial(p) => writeback_partial_to_pyerr(py, *p, msg), + GitError::ObjectStore(_) + | GitError::RefStore(_) + | GitError::Vfs(_) + | GitError::Other(_) => PyRuntimeError::new_err(msg), + } +} + +/// Build a Python exception carrying the structured `RestoreWritebackPartial` +/// payload. Falls back to `PyRuntimeError` when `openviking.pyagfs` is not +/// importable (e.g. cargo-test environment) — in that case the structured +/// data is lost, but the error message still survives. 
+fn writeback_partial_to_pyerr(py: Python<'_>, p: RestoreWritebackPartial, msg: String) -> PyErr { + let exc_class = match PyModule::import(py, "openviking.pyagfs") + .and_then(|m| m.getattr("GitRestoreWritebackPartialError")) + { + Ok(c) => c, + Err(_) => { + return PyRuntimeError::new_err(format!( + "{msg} (structured payload dropped: pyagfs unavailable)" + )); + } + }; + + let payload = PyDict::new(py); + // Strings + ints marshal trivially; (String, String) tuples become + // Python tuples via pyo3's IntoPy impl. + let set_oid = + |k: &str, oid: &gix_hash::ObjectId| -> PyResult<()> { payload.set_item(k, oid_hex(oid)) }; + + let build = || -> PyResult<()> { + set_oid("new_commit_oid", &p.new_commit_oid)?; + set_oid("source_commit", &p.source_commit)?; + set_oid("parent_commit", &p.parent_commit)?; + payload.set_item("written", p.written)?; + payload.set_item("deleted", p.deleted)?; + payload.set_item("unchanged", p.unchanged)?; + payload.set_item("written_paths", p.written_paths.clone())?; + payload.set_item("deleted_paths", p.deleted_paths.clone())?; + payload.set_item("failed_writes", p.failed_writes.clone())?; + payload.set_item("failed_deletes", p.failed_deletes.clone())?; + Ok(()) + }; + if let Err(e) = build() { + return e; + } + + match exc_class.call1((msg.clone(), &payload)) { + Ok(instance) => PyErr::from_value(instance), + Err(_) => PyRuntimeError::new_err(format!( + "{msg} (structured payload dropped: failed to instantiate \ + GitRestoreWritebackPartialError)" + )), + } +} + +/// Local copy of the new_py_err pattern used in lib.rs. We duplicate it here +/// to keep git.rs self-contained — lib.rs's helper is private. If lib.rs's +/// helper is later made `pub(crate)`, this can be deleted in favor of that. 
+pub fn new_py_err_pub(py: Python<'_>, name: &str, msg: String) -> PyErr { + let exc = PyModule::import(py, "openviking.pyagfs") + .and_then(|m| m.getattr(name)) + .and_then(|exc| Ok(exc.cast_into::()?)); + match exc { + Ok(exc) => PyErr::from_type(exc, msg), + Err(_) => PyRuntimeError::new_err(msg), + } +} + +use pyo3::types::{PyBytes, PyDict, PyList}; +use ragfs::git::{ + Actor, CommitRequest, CommitResponse, RestoreDiff, RestoreRequest, RestoreResponse, + RestoreWritebackPartial, ShowRequest, ShowResponse, +}; + +// ---------- request parsers ---------- + +fn require_str(kwargs: &Bound, key: &str) -> PyResult { + let val = kwargs + .get_item(key)? + .ok_or_else(|| PyValueError::new_err(format!("missing required kwarg: {}", key)))?; + val.extract::() + .map_err(|_| PyValueError::new_err(format!("kwarg {} must be a string", key))) +} + +fn optional_str(kwargs: &Bound, key: &str) -> PyResult> { + match kwargs.get_item(key)? { + Some(v) if !v.is_none() => v + .extract::() + .map(Some) + .map_err(|_| PyValueError::new_err(format!("kwarg {} must be a string", key))), + _ => Ok(None), + } +} + +fn optional_bool(kwargs: &Bound, key: &str, default: bool) -> PyResult { + match kwargs.get_item(key)? { + Some(v) if !v.is_none() => v + .extract::() + .map_err(|_| PyValueError::new_err(format!("kwarg {} must be a bool", key))), + _ => Ok(default), + } +} + +fn optional_string_list(kwargs: &Bound, key: &str) -> PyResult>> { + match kwargs.get_item(key)? 
{ + Some(v) if !v.is_none() => v + .extract::>() + .map(Some) + .map_err(|_| PyValueError::new_err(format!("kwarg {} must be a list of strings", key))), + _ => Ok(None), + } +} + +pub fn parse_commit_request(kwargs: &Bound) -> PyResult { + Ok(CommitRequest { + account: require_str(kwargs, "account")?, + branch: require_str(kwargs, "branch")?, + message: require_str(kwargs, "message")?, + paths: optional_string_list(kwargs, "paths")?, + author_name: require_str(kwargs, "author_name")?, + author_email: require_str(kwargs, "author_email")?, + }) +} + +pub fn parse_restore_request(kwargs: &Bound) -> PyResult { + Ok(RestoreRequest { + account: require_str(kwargs, "account")?, + branch: require_str(kwargs, "branch")?, + project_dir: optional_str(kwargs, "project_dir")?, + source_commit: require_str(kwargs, "source_commit")?, + dry_run: optional_bool(kwargs, "dry_run", false)?, + message: optional_str(kwargs, "message")?, + author_name: require_str(kwargs, "author_name")?, + author_email: require_str(kwargs, "author_email")?, + }) +} + +pub fn parse_show_request(kwargs: &Bound) -> PyResult { + Ok(ShowRequest { + account: require_str(kwargs, "account")?, + target_ref: require_str(kwargs, "target_ref")?, + path: optional_str(kwargs, "path")?, + }) +} + +// ---------- response converters ---------- + +fn oid_hex(oid: &gix_hash::ObjectId) -> String { + oid.to_hex().to_string() +} + +fn actor_to_dict(py: Python<'_>, a: &Actor) -> PyResult> { + let d = PyDict::new(py); + d.set_item("name", &a.name)?; + d.set_item("email", &a.email)?; + d.set_item("time_seconds", a.time_seconds)?; + d.set_item("tz_offset_seconds", a.tz_offset_seconds)?; + Ok(d.into()) +} + +pub fn commit_response_to_pydict(py: Python<'_>, resp: CommitResponse) -> PyResult> { + let d = PyDict::new(py); + match resp { + CommitResponse::Created { + commit_oid, + changed, + } => { + d.set_item("result", "created")?; + d.set_item("commit_oid", oid_hex(&commit_oid))?; + d.set_item("changed", changed)?; + } + 
CommitResponse::Noop { commit_oid } => { + d.set_item("result", "noop")?; + d.set_item("commit_oid", oid_hex(&commit_oid))?; + } + } + Ok(d.into_any().unbind()) +} + +fn diff_to_dict(py: Python<'_>, diff: &RestoreDiff) -> PyResult> { + let d = PyDict::new(py); + let to_write = PyList::empty(py); + for (path, oid) in &diff.to_write { + let pair = PyDict::new(py); + pair.set_item("path", path)?; + pair.set_item("oid", oid_hex(oid))?; + to_write.append(pair)?; + } + d.set_item("to_write", to_write)?; + d.set_item("to_delete", diff.to_delete.clone())?; + d.set_item("unchanged", diff.unchanged.clone())?; + Ok(d.into()) +} + +pub fn restore_response_to_pydict(py: Python<'_>, resp: RestoreResponse) -> PyResult> { + let d = PyDict::new(py); + match resp { + RestoreResponse::Applied { + new_commit_oid, + source_commit, + parent_commit, + written, + deleted, + unchanged, + written_paths, + deleted_paths, + } => { + d.set_item("result", "applied")?; + d.set_item("new_commit_oid", oid_hex(&new_commit_oid))?; + d.set_item("source_commit", oid_hex(&source_commit))?; + d.set_item("parent_commit", oid_hex(&parent_commit))?; + d.set_item("written", written)?; + d.set_item("deleted", deleted)?; + d.set_item("unchanged", unchanged)?; + d.set_item("written_paths", written_paths)?; + d.set_item("deleted_paths", deleted_paths)?; + } + RestoreResponse::Noop { head, source } => { + d.set_item("result", "noop")?; + d.set_item("head", oid_hex(&head))?; + d.set_item("source", oid_hex(&source))?; + } + RestoreResponse::DryRun { diff, head, source } => { + d.set_item("result", "dry_run")?; + d.set_item("head", oid_hex(&head))?; + d.set_item("source", oid_hex(&source))?; + d.set_item("diff", diff_to_dict(py, &diff)?)?; + } + } + Ok(d.into_any().unbind()) +} + +pub fn show_response_to_pydict(py: Python<'_>, resp: ShowResponse) -> PyResult> { + let d = PyDict::new(py); + match resp { + ShowResponse::Commit { + oid, + tree, + parents, + author, + committer, + message, + } => { + d.set_item("oid", 
oid_hex(&oid))?; + d.set_item("tree", oid_hex(&tree))?; + let plist = PyList::empty(py); + for p in &parents { + plist.append(oid_hex(p))?; + } + d.set_item("parents", plist)?; + d.set_item("author", actor_to_dict(py, &author)?)?; + d.set_item("committer", actor_to_dict(py, &committer)?)?; + d.set_item("message", message)?; + } + ShowResponse::Blob { oid, size, bytes } => { + d.set_item("oid", oid_hex(&oid))?; + d.set_item("size", size)?; + d.set_item("bytes", PyBytes::new(py, &bytes))?; + } + } + Ok(d.into_any().unbind()) +} + +#[cfg(test)] +mod tests { + use super::*; + use ragfs::core::MountableFS; + use ragfs::git::GitError; + use std::sync::Arc; + + fn local_cfg(base_dir: &str) -> ragfs::git::GitConfig { + ragfs::git::GitConfig { + enabled: true, + backend: "local".into(), + default_branch: "main".into(), + author_name: "test".into(), + author_email: "t@e".into(), + local: Some(ragfs::git::GitLocalConfig { + base_dir: base_dir.into(), + }), + s3: None, + tuning: Default::default(), + } + } + + #[tokio::test] + async fn build_git_service_disabled_returns_none() { + let fs = Arc::new(MountableFS::new()) as Arc; + let mut cfg = local_cfg("/tmp/ov-git-test-disabled"); + cfg.enabled = false; + let svc = build_git_service(&cfg, fs).expect("build ok"); + assert!(svc.is_none()); + } + + #[tokio::test] + async fn build_git_service_local_returns_some() { + let fs = Arc::new(MountableFS::new()) as Arc; + let cfg = local_cfg("/tmp/ov-git-test-local"); + let svc = build_git_service(&cfg, fs).expect("build ok"); + assert!(svc.is_some()); + } + + #[tokio::test] + async fn build_git_service_unknown_backend_errors() { + // Building a PyErr requires the Python interpreter to be initialized; + // the `extension-module` feature disables auto-initialize. 
+ Python::initialize(); + let fs = Arc::new(MountableFS::new()) as Arc; + let mut cfg = local_cfg("/tmp/ov-git-test-bad"); + cfg.backend = "bogus".into(); + // `GitService` is not `Debug`, so we can't use `unwrap_err()`; match instead. + let err = match build_git_service(&cfg, fs) { + Ok(_) => panic!("expected error for bogus backend"), + Err(e) => e, + }; + assert!(err.to_string().contains("unsupported git backend")); + } + + #[tokio::test] + async fn build_git_service_local_without_section_errors() { + Python::initialize(); + let fs = Arc::new(MountableFS::new()) as Arc; + let mut cfg = local_cfg("/tmp/ov-git-test-nolocal"); + cfg.local = None; + let err = match build_git_service(&cfg, fs) { + Ok(_) => panic!("expected error when [git.local] missing"), + Err(e) => e, + }; + assert!(err.to_string().contains("[git.local] missing")); + } + + #[test] + fn map_git_error_feature_disabled() { + pyo3::prepare_freethreaded_python(); + Python::attach(|py| { + let err = map_git_error(py, GitError::FeatureDisabled); + // We don't require the openviking.pyagfs module to be importable + // in this Rust-only test, so the fallback PyRuntimeError is fine. + // We just assert that mapping does not panic and yields a PyErr. + assert!(err.to_string().to_lowercase().contains("git")); + }); + } + + #[test] + fn map_git_error_concurrent_commit() { + pyo3::prepare_freethreaded_python(); + Python::attach(|py| { + let err = map_git_error( + py, + GitError::ConcurrentCommit { + ref_name: "refs/heads/main".into(), + expected: None, + actual: None, + }, + ); + assert!(err.to_string().to_lowercase().contains("concurrent")); + }); + } + + /// `RestoreWritebackPartial` must round-trip through `map_git_error` + /// without panicking. When `openviking.pyagfs` is not importable (the + /// usual case in pure cargo-test environments) the helper falls back to + /// `PyRuntimeError` and tags the message so the operator notices the + /// dropped payload; we only assert on the message preamble here. 
+ #[test] + fn map_git_error_writeback_partial() { + use ragfs::git::RestoreWritebackPartial; + pyo3::prepare_freethreaded_python(); + Python::attach(|py| { + let payload = RestoreWritebackPartial { + new_commit_oid: gix_hash::ObjectId::null(gix_hash::Kind::Sha1), + source_commit: gix_hash::ObjectId::null(gix_hash::Kind::Sha1), + parent_commit: gix_hash::ObjectId::null(gix_hash::Kind::Sha1), + written: 1, + deleted: 0, + unchanged: 0, + written_paths: vec!["resources/proj_a/b.md".to_string()], + deleted_paths: vec![], + failed_writes: vec![( + "resources/proj_a/a.md".to_string(), + "forced write failure".to_string(), + )], + failed_deletes: vec![], + }; + let err = map_git_error(py, GitError::RestoreWritebackPartial(Box::new(payload))); + let s = err.to_string().to_lowercase(); + assert!( + s.contains("restore writeback partial"), + "expected partial message, got {s:?}" + ); + // The Display includes counts derived from the payload. + assert!( + s.contains("1 write"), + "expected write count in message: {s:?}" + ); + }); + } + + #[test] + fn map_git_error_path_not_found() { + pyo3::prepare_freethreaded_python(); + Python::attach(|py| { + let err = map_git_error(py, GitError::PathNotFound("foo/bar".into())); + assert!(err.to_string().contains("foo/bar")); + }); + } + + #[test] + fn map_git_error_invalid_account() { + pyo3::prepare_freethreaded_python(); + Python::attach(|py| { + let err = map_git_error(py, GitError::InvalidAccountId("../bad".into())); + assert!(err.to_string().contains("bad")); + }); + } + + #[test] + fn map_git_error_blob_too_large() { + pyo3::prepare_freethreaded_python(); + Python::attach(|py| { + let err = map_git_error( + py, + GitError::BlobTooLarge { + size: 200, + limit: 100, + }, + ); + assert!(err.to_string().contains("200")); + }); + } + + use pyo3::types::PyDict; + + #[test] + fn parse_commit_request_required_fields() { + pyo3::prepare_freethreaded_python(); + Python::attach(|py| { + let kwargs = PyDict::new(py); + 
kwargs.set_item("account", "acct1").unwrap(); + kwargs.set_item("branch", "main").unwrap(); + kwargs.set_item("message", "hello").unwrap(); + kwargs.set_item("author_name", "alice").unwrap(); + kwargs.set_item("author_email", "a@e.com").unwrap(); + let req = parse_commit_request(&kwargs).expect("parses"); + assert_eq!(req.account, "acct1"); + assert_eq!(req.branch, "main"); + assert_eq!(req.message, "hello"); + assert!(req.paths.is_none()); + assert_eq!(req.author_name, "alice"); + assert_eq!(req.author_email, "a@e.com"); + }); + } + + #[test] + fn parse_commit_request_with_paths_list() { + pyo3::prepare_freethreaded_python(); + Python::attach(|py| { + let kwargs = PyDict::new(py); + kwargs.set_item("account", "a").unwrap(); + kwargs.set_item("branch", "main").unwrap(); + kwargs.set_item("message", "m").unwrap(); + kwargs.set_item("author_name", "n").unwrap(); + kwargs.set_item("author_email", "e").unwrap(); + kwargs + .set_item("paths", vec!["resources/a.md", "resources/b.md"]) + .unwrap(); + let req = parse_commit_request(&kwargs).expect("parses"); + assert_eq!(req.paths.as_ref().unwrap().len(), 2); + assert_eq!(req.paths.as_ref().unwrap()[0], "resources/a.md"); + }); + } + + #[test] + fn parse_commit_request_missing_required_errors() { + pyo3::prepare_freethreaded_python(); + Python::attach(|py| { + let kwargs = PyDict::new(py); + kwargs.set_item("branch", "main").unwrap(); + // missing account, message, author_* + let err = parse_commit_request(&kwargs).unwrap_err(); + assert!(err.to_string().contains("account")); + }); + } + + #[test] + fn parse_restore_request_defaults_project_dir_none() { + pyo3::prepare_freethreaded_python(); + Python::attach(|py| { + let kwargs = PyDict::new(py); + kwargs.set_item("account", "a").unwrap(); + kwargs.set_item("branch", "main").unwrap(); + kwargs.set_item("source_commit", "deadbeef").unwrap(); + kwargs.set_item("author_name", "n").unwrap(); + kwargs.set_item("author_email", "e").unwrap(); + let req = 
parse_restore_request(&kwargs).expect("parses"); + assert!(req.project_dir.is_none()); + assert!(!req.dry_run); + assert!(req.message.is_none()); + }); + } + + #[test] + fn parse_restore_request_project_dir_some() { + pyo3::prepare_freethreaded_python(); + Python::attach(|py| { + let kwargs = PyDict::new(py); + kwargs.set_item("account", "a").unwrap(); + kwargs.set_item("branch", "main").unwrap(); + kwargs.set_item("project_dir", "resources/proj").unwrap(); + kwargs.set_item("source_commit", "deadbeef").unwrap(); + kwargs.set_item("author_name", "n").unwrap(); + kwargs.set_item("author_email", "e").unwrap(); + let req = parse_restore_request(&kwargs).expect("parses"); + assert_eq!(req.project_dir.as_deref(), Some("resources/proj")); + }); + } + + #[test] + fn parse_restore_request_dry_run_and_message() { + pyo3::prepare_freethreaded_python(); + Python::attach(|py| { + let kwargs = PyDict::new(py); + kwargs.set_item("account", "a").unwrap(); + kwargs.set_item("branch", "main").unwrap(); + kwargs.set_item("project_dir", "x").unwrap(); + kwargs.set_item("source_commit", "abc123").unwrap(); + kwargs.set_item("author_name", "n").unwrap(); + kwargs.set_item("author_email", "e").unwrap(); + kwargs.set_item("dry_run", true).unwrap(); + kwargs.set_item("message", "custom msg").unwrap(); + let req = parse_restore_request(&kwargs).expect("parses"); + assert!(req.dry_run); + assert_eq!(req.message.as_deref(), Some("custom msg")); + }); + } + + #[test] + fn parse_show_request_with_and_without_path() { + pyo3::prepare_freethreaded_python(); + Python::attach(|py| { + let kwargs = PyDict::new(py); + kwargs.set_item("account", "a").unwrap(); + kwargs.set_item("target_ref", "main").unwrap(); + let req = parse_show_request(&kwargs).expect("parses"); + assert!(req.path.is_none()); + + let kwargs2 = PyDict::new(py); + kwargs2.set_item("account", "a").unwrap(); + kwargs2.set_item("target_ref", "main").unwrap(); + kwargs2.set_item("path", "resources/a.md").unwrap(); + let req2 = 
parse_show_request(&kwargs2).expect("parses"); + assert_eq!(req2.path.as_deref(), Some("resources/a.md")); + }); + } + + #[test] + fn commit_response_created_to_dict() { + pyo3::prepare_freethreaded_python(); + Python::attach(|py| { + let oid = gix_hash::ObjectId::null(gix_hash::Kind::Sha1); + let resp = ragfs::git::CommitResponse::Created { + commit_oid: oid, + changed: 3, + }; + let obj = commit_response_to_pydict(py, resp).expect("converts"); + let d: &Bound = obj.bind(py).downcast().unwrap(); + let result: String = d.get_item("result").unwrap().unwrap().extract().unwrap(); + assert_eq!(result, "created"); + let changed: usize = d.get_item("changed").unwrap().unwrap().extract().unwrap(); + assert_eq!(changed, 3); + }); + } + + #[test] + fn commit_response_noop_to_dict() { + pyo3::prepare_freethreaded_python(); + Python::attach(|py| { + let oid = gix_hash::ObjectId::null(gix_hash::Kind::Sha1); + let resp = ragfs::git::CommitResponse::Noop { commit_oid: oid }; + let obj = commit_response_to_pydict(py, resp).expect("converts"); + let d: &Bound = obj.bind(py).downcast().unwrap(); + let result: String = d.get_item("result").unwrap().unwrap().extract().unwrap(); + assert_eq!(result, "noop"); + }); + } + + #[test] + fn show_response_blob_to_dict_carries_bytes() { + pyo3::prepare_freethreaded_python(); + Python::attach(|py| { + let oid = gix_hash::ObjectId::null(gix_hash::Kind::Sha1); + let resp = ragfs::git::ShowResponse::Blob { + oid, + size: 5, + bytes: bytes::Bytes::from_static(b"hello"), + }; + let obj = show_response_to_pydict(py, resp).expect("converts"); + let d: &Bound = obj.bind(py).downcast().unwrap(); + let b: Vec = d.get_item("bytes").unwrap().unwrap().extract().unwrap(); + assert_eq!(b, b"hello".to_vec()); + let size: u64 = d.get_item("size").unwrap().unwrap().extract().unwrap(); + assert_eq!(size, 5); + }); + } + + // ---------- direct binding: account isolation ---------- + + /// Build a `GitService` over local backends, mirroring what the binding's + 
/// `build_git_service` does for the `local` backend. The `base_dir` does + /// not need to exist: a malicious account must be rejected at the + /// `GitService` boundary *before* any path under `base_dir` is built. + fn local_git_service() -> Arc { + use ragfs::git::{LocalObjectStore, LocalRefStore}; + let base = std::env::temp_dir().join("ov-git-account-validation-test"); + let vfs = Arc::new(MountableFS::new()) as Arc; + let object_store = Arc::new(LocalObjectStore::new(&base)) as Arc; + let ref_store = Arc::new(LocalRefStore::new(&base)) as Arc; + Arc::new(GitService::new(vfs, object_store, ref_store)) + } + + #[tokio::test] + async fn binding_commit_rejects_traversal_account() { + let svc = local_git_service(); + let req = CommitRequest { + account: "../escape".into(), + branch: "main".into(), + message: "m".into(), + paths: None, + author_name: "n".into(), + author_email: "e".into(), + }; + let err = svc.commit(req).await; + assert!(matches!(err, Err(GitError::InvalidAccountId(_)))); + } + + #[tokio::test] + async fn binding_commit_rejects_slash_account() { + let svc = local_git_service(); + let req = CommitRequest { + account: "a/b".into(), + branch: "main".into(), + message: "m".into(), + paths: None, + author_name: "n".into(), + author_email: "e".into(), + }; + let err = svc.commit(req).await; + assert!(matches!(err, Err(GitError::InvalidAccountId(_)))); + } + + #[tokio::test] + async fn binding_show_rejects_backslash_account() { + let svc = local_git_service(); + let req = ShowRequest { + account: "a\\b".into(), + target_ref: "main".into(), + path: None, + }; + let err = svc.show(req).await; + assert!(matches!(err, Err(GitError::InvalidAccountId(_)))); + } + + #[tokio::test] + async fn binding_restore_rejects_empty_account() { + let svc = local_git_service(); + let req = RestoreRequest { + account: "".into(), + branch: "main".into(), + project_dir: Some("resources/x".into()), + source_commit: "deadbeef".into(), + dry_run: false, + message: None, + 
author_name: "n".into(), + author_email: "e".into(), + }; + let err = svc.restore(req).await; + assert!(matches!(err, Err(GitError::InvalidAccountId(_)))); + } +} diff --git a/crates/ragfs-python/src/lib.rs b/crates/ragfs-python/src/lib.rs index b1d4b01e81..88aabf30c4 100644 --- a/crates/ragfs-python/src/lib.rs +++ b/crates/ragfs-python/src/lib.rs @@ -496,6 +496,8 @@ fn cache_policy_from_config(config: &RagfsCacheConfig) -> CachePolicy { ) } +mod git; + fn py_detach_blocking(py: Python<'_>, f: F) -> T where T: Send, @@ -805,6 +807,40 @@ fn build_fs_context(ctx: Option>) -> FsContext { Arc::new(FsContextInner::new(account_id)) } +#[derive(serde::Deserialize)] +struct BindingConfig { + #[serde(default)] + git: Option, +} + +fn build_git_from_cfg( + git_cfg: ragfs::git::GitConfig, + fs: &Arc, + rt: &tokio::runtime::Runtime, +) -> PyResult<(Option>, Option)> { + let backend = git_cfg.backend.clone(); + let vfs = fs.clone() as Arc; + let _guard = rt.enter(); + let svc = git::build_git_service(&git_cfg, vfs)?; + Ok((svc, Some(backend))) +} + +fn load_git_from_config( + path: &str, + fs: &Arc, + rt: &tokio::runtime::Runtime, +) -> PyResult<(Option>, Option)> { + let body = std::fs::read_to_string(path).map_err(|e| { + PyRuntimeError::new_err(format!("read config_path {}: {}", path, e)) + })?; + let cfg: BindingConfig = toml::from_str(&body) + .map_err(|e| PyRuntimeError::new_err(format!("parse config_path: {}", e)))?; + match cfg.git { + Some(git_cfg) => build_git_from_cfg(git_cfg, fs, rt), + None => Ok((None, None)), + } +} + /// RAGFS Python Binding Client. /// /// Embeds the ragfs filesystem engine directly in the Python process. @@ -816,6 +852,8 @@ struct RAGFSBindingClient { /// Data entry point: `Stats(Encryption(Mountable))` when encrypted, else `Stats(Mountable)`. 
top: Arc, rt: tokio::runtime::Runtime, + git_service: Option>, + git_backend: Option, } impl RAGFSBindingClient { @@ -846,11 +884,12 @@ impl RAGFSBindingClient { /// `provider_type` (int) and causes the stack to include an `EncryptionWrappedFS` layer. /// Runtime `cache` configuration takes precedence over `config_path`. #[new] - #[pyo3(signature = (config_path=None, config=None))] + #[pyo3(signature = (config_path=None, config=None, git_config_path=None))] fn new( py: Python<'_>, config_path: Option<&str>, config: Option>>, + git_config_path: Option<&str> ) -> PyResult { let rt = tokio::runtime::Runtime::new() .map_err(|e| PyRuntimeError::new_err(format!("Failed to create runtime: {}", e)))?; @@ -858,6 +897,7 @@ impl RAGFSBindingClient { // Phase A (holding GIL): parse the sectioned config into an owned RagfsConfig. let mut ragfs_cfg = RagfsConfig::default(); let mut runtime_cache_config = None; + let mut inline_git_cfg: Option = None; if let Some(cfg) = config { if let Some(enc_obj) = cfg.get("encryption") { let enc: HashMap> = enc_obj.extract(py)?; @@ -885,6 +925,12 @@ impl RAGFSBindingClient { PyRuntimeError::new_err(format!("Invalid cache config: {error}")) })?); } + if let Some(git_obj) = cfg.get("git") { + let git_value = py_to_json_value(git_obj.bind(py))?; + let git_cfg: ragfs::git::GitConfig = serde_json::from_value(git_value) + .map_err(|e| PyRuntimeError::new_err(format!("Invalid git config: {e}")))?; + inline_git_cfg = Some(git_cfg); + } } let cache_config = match runtime_cache_config { @@ -926,10 +972,23 @@ impl RAGFSBindingClient { } else { rt.block_on(build_default_stack(ragfs_cfg)) }; + + // Build the git service from inline config when present; otherwise fall + // back to loading the [git] section from a config file path. 
+ let (git_service, git_backend) = match inline_git_cfg { + Some(git_cfg) => build_git_from_cfg(git_cfg, &stack.mountable, &rt)?, + None => match git_config_path { + Some(path) => load_git_from_config(path, &stack.mountable, &rt)?, + None => (None, None), + }, + }; + Ok(Self { mountable: stack.mountable, top: stack.top, rt, + git_service, + git_backend, }) } @@ -937,9 +996,70 @@ impl RAGFSBindingClient { fn health(&self) -> PyResult> { let mut m = HashMap::new(); m.insert("status".to_string(), "healthy".to_string()); + m.insert( + "git_enabled".to_string(), + if self.git_service.is_some() { "true".into() } else { "false".into() }, + ); + if let Some(b) = &self.git_backend { + m.insert("git_backend".to_string(), b.clone()); + } Ok(m) } + /// Commit a snapshot of the account's tree. + #[pyo3(signature = (**kwargs))] + fn git_commit( + &self, + py: Python<'_>, + kwargs: Option<&Bound<'_, PyDict>>, + ) -> PyResult> { + let svc = self.git_service.clone().ok_or_else(|| { + git::new_py_err_pub(py, "AGFSNotSupportedError", "git feature disabled".into()) + })?; + let empty = PyDict::new(py); + let kw = kwargs.unwrap_or(&empty); + let req = git::parse_commit_request(kw)?; + let resp = py_detach_blocking(py, move || self.rt.block_on(svc.commit(req))) + .map_err(|e| git::map_git_error(py, e))?; + git::commit_response_to_pydict(py, resp) + } + + /// Restore a project_dir subtree to the state at source_commit. 
+ #[pyo3(signature = (**kwargs))] + fn git_restore( + &self, + py: Python<'_>, + kwargs: Option<&Bound<'_, PyDict>>, + ) -> PyResult> { + let svc = self.git_service.clone().ok_or_else(|| { + git::new_py_err_pub(py, "AGFSNotSupportedError", "git feature disabled".into()) + })?; + let empty = PyDict::new(py); + let kw = kwargs.unwrap_or(&empty); + let req = git::parse_restore_request(kw)?; + let resp = py_detach_blocking(py, move || self.rt.block_on(svc.restore(req))) + .map_err(|e| git::map_git_error(py, e))?; + git::restore_response_to_pydict(py, resp) + } + + /// Read a commit's metadata or a blob's bytes at a path. + #[pyo3(signature = (**kwargs))] + fn git_show( + &self, + py: Python<'_>, + kwargs: Option<&Bound<'_, PyDict>>, + ) -> PyResult> { + let svc = self.git_service.clone().ok_or_else(|| { + git::new_py_err_pub(py, "AGFSNotSupportedError", "git feature disabled".into()) + })?; + let empty = PyDict::new(py); + let kw = kwargs.unwrap_or(&empty); + let req = git::parse_show_request(kw)?; + let resp = py_detach_blocking(py, move || self.rt.block_on(svc.show(req))) + .map_err(|e| git::map_git_error(py, e))?; + git::show_response_to_pydict(py, resp) + } + /// Get client capabilities. 
fn get_capabilities(&self) -> PyResult>> { Python::attach(|py| { diff --git a/crates/ragfs/Cargo.toml b/crates/ragfs/Cargo.toml index 64c1db03a5..8ea09680ca 100644 --- a/crates/ragfs/Cargo.toml +++ b/crates/ragfs/Cargo.toml @@ -35,6 +35,17 @@ serde_json = "1.0" # Logging tracing = "0.1" +# Git (gitoxide) +gix-hash = "0.14" +gix-object = "0.42" +gix-features = { version = "0.38", features = ["zlib"] } +gix-actor = "0.31" +gix-date = "0.8" + +# Git backend dependencies +dashmap = "5.0" +flate2 = "1.0" + # Path handling and filesystem path-clean = "1.0" @@ -90,6 +101,14 @@ grep-searcher = "0.1.16" [dev-dependencies] tempfile = "3.12" criterion = "0.5" +tokio = { version = "1.38", features = ["full"] } +toml = "0.8" +# Used by the S3/TOS git integration tests (crates/ragfs/tests/git_s3_integration.rs) +gix-hash = "0.14" +gix-object = "0.42" +bytes = "1.5" +serde_json = "1.0" +uuid = { version = "1.0", features = ["v4"] } [features] default = [] diff --git a/crates/ragfs/src/git/backends/local.rs b/crates/ragfs/src/git/backends/local.rs new file mode 100644 index 0000000000..3aeb7dbc06 --- /dev/null +++ b/crates/ragfs/src/git/backends/local.rs @@ -0,0 +1,537 @@ +//! Local filesystem backend for Git object and ref storage + +use std::path::{Path, PathBuf}; +use std::sync::atomic::{AtomicU64, Ordering}; +use std::sync::Arc; + +use async_trait::async_trait; +use bytes::Bytes; +use dashmap::DashMap; +use gix_hash::ObjectId; +use tokio::sync::Mutex; + +use crate::git::error::{ObjectStoreError, RefStoreError}; +use crate::git::index_store::{ + decode_index, encode_index, systemtime_to_ns, CommitIndex, IndexStore, IndexStoreError, +}; +use crate::git::object_store::ObjectStore; +use crate::git::ref_store::RefStore; +use crate::git::util::validate_ref_name; + +/// Per-process counter used to give each in-flight object write a unique temp +/// filename, preventing concurrent `put` calls from sharing one `.tmp`. 
+static TMP_SEQ: AtomicU64 = AtomicU64::new(0); + +/// Local filesystem implementation of ObjectStore. +/// +/// Stores objects in Git's standard loose object format: +/// `{base_dir}/{account}/objects/{aa}/{bb...}` +/// where `aa` is the first 2 hex chars of the oid, and `bb...` is the rest. +pub struct LocalObjectStore { + base_dir: PathBuf, +} + +impl LocalObjectStore { + /// Create a new LocalObjectStore with the given base directory. + pub fn new(base_dir: impl Into) -> Self { + Self { + base_dir: base_dir.into(), + } + } + + /// Get the filesystem path for an object. + fn object_path(&self, account: &str, oid: &ObjectId) -> PathBuf { + let hex = oid.to_hex().to_string(); + self.base_dir + .join(account) + .join("objects") + .join(&hex[..2]) + .join(&hex[2..]) + } +} + +#[async_trait] +impl ObjectStore for LocalObjectStore { + async fn put( + &self, + account: &str, + oid: &ObjectId, + zlib_body: Bytes, + ) -> Result<(), ObjectStoreError> { + let path = self.object_path(account, oid); + + // Idempotent: if object already exists, do nothing + if tokio::fs::try_exists(&path).await? { + return Ok(()); + } + + // Ensure parent directory exists + if let Some(parent) = path.parent() { + tokio::fs::create_dir_all(parent).await?; + } + + // Write to a unique temp file first, then rename for atomicity. A + // per-process counter keeps the temp name unique so concurrent `put` + // calls for the same not-yet-existing object don't share one `.tmp` + // and clobber each other mid-write. + let seq = TMP_SEQ.fetch_add(1, Ordering::Relaxed); + let tmp_path = path.with_extension(format!("tmp.{}.{}", std::process::id(), seq)); + tokio::fs::write(&tmp_path, &zlib_body).await?; + match tokio::fs::rename(&tmp_path, &path).await { + Ok(()) => Ok(()), + Err(e) => { + // A racing `put` for the same object may have already produced + // it. Idempotency holds as long as the object exists, so treat + // that as success. Clean up our orphaned temp file regardless. 
+ let _ = tokio::fs::remove_file(&tmp_path).await; + if tokio::fs::try_exists(&path).await.unwrap_or(false) { + Ok(()) + } else { + Err(e.into()) + } + } + } + } + + async fn get(&self, account: &str, oid: &ObjectId) -> Result { + let path = self.object_path(account, oid); + match tokio::fs::read(&path).await { + Ok(bytes) => Ok(Bytes::from(bytes)), + Err(e) if e.kind() == std::io::ErrorKind::NotFound => { + Err(ObjectStoreError::NotFound(*oid)) + } + Err(e) => Err(e.into()), + } + } + + async fn exists(&self, account: &str, oid: &ObjectId) -> Result { + let path = self.object_path(account, oid); + tokio::fs::try_exists(&path) + .await + .map_err(ObjectStoreError::Io) + } +} + +/// Local filesystem implementation of RefStore. +/// +/// Stores refs as plain text files with hex oid content, and uses: +/// - In-memory locks per (account, ref_name) for process-level serialization +/// - Atomic rename for filesystem-level atomicity +pub struct LocalRefStore { + base_dir: PathBuf, + locks: DashMap<(String, String), Arc>>, +} + +impl LocalRefStore { + /// Create a new LocalRefStore with the given base directory. + pub fn new(base_dir: impl Into) -> Self { + Self { + base_dir: base_dir.into(), + locks: DashMap::new(), + } + } + + /// Get the filesystem path for a ref. + fn ref_path(&self, account: &str, ref_name: &str) -> PathBuf { + self.base_dir.join(account).join(ref_name) + } + + /// Get or create a lock for the given (account, ref_name). + fn get_lock(&self, account: &str, ref_name: &str) -> Arc> { + self.locks + .entry((account.to_string(), ref_name.to_string())) + .or_insert_with(|| Arc::new(Mutex::new(()))) + .clone() + } + + /// Read a ref from disk, returns None if not found. 
+ async fn read_ref_opt(path: &Path) -> Result, RefStoreError> { + match tokio::fs::read_to_string(path).await { + Ok(content) => { + let trimmed = content.trim(); + let oid = trimmed.parse::().map_err(|_| { + RefStoreError::Backend(format!("invalid oid in ref file: {trimmed}")) + })?; + Ok(Some(oid)) + } + Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(None), + Err(e) => Err(e.into()), + } + } +} + +#[async_trait] +impl RefStore for LocalRefStore { + async fn read(&self, account: &str, ref_name: &str) -> Result { + // Validate ref name + validate_ref_name(ref_name)?; + + let path = self.ref_path(account, ref_name); + Self::read_ref_opt(&path) + .await? + .ok_or_else(|| RefStoreError::NotFound(ref_name.to_string())) + } + + async fn cas_update( + &self, + account: &str, + ref_name: &str, + expected: Option, + new: ObjectId, + ) -> Result<(), RefStoreError> { + // Validate ref name first + validate_ref_name(ref_name)?; + + // Acquire per-ref lock to serialize concurrent updates + let lock = self.get_lock(account, ref_name); + let _guard = lock.lock().await; + + let path = self.ref_path(account, ref_name); + + // Check current value matches expected + let actual = Self::read_ref_opt(&path).await?; + if actual != expected { + return Err(RefStoreError::Conflict { expected, actual }); + } + + // Ensure parent directory exists + if let Some(parent) = path.parent() { + tokio::fs::create_dir_all(parent).await?; + } + + // Write to temp file then rename for atomicity + let tmp_path = path.with_extension("tmp"); + tokio::fs::write(&tmp_path, format!("{}\n", new.to_hex())).await?; + tokio::fs::rename(&tmp_path, &path).await?; + + Ok(()) + } + + async fn list( + &self, + account: &str, + prefix: &str, + ) -> Result, RefStoreError> { + let dir_path = self.base_dir.join(account).join(prefix); + let mut result = Vec::new(); + + // Walk the directory recursively + let mut stack = vec![dir_path]; + + while let Some(current_path) = stack.pop() { + match 
tokio::fs::read_dir(¤t_path).await { + Ok(mut entries) => { + while let Some(entry) = entries.next_entry().await? { + let entry_path = entry.path(); + if entry_path.is_dir() { + stack.push(entry_path); + } else { + // It's a file - parse as ref + if let Ok(ref_name) = entry_path.strip_prefix(self.base_dir.join(account)) + { + if let Some(ref_name_str) = ref_name.to_str() { + if let Ok(Some(oid)) = Self::read_ref_opt(&entry_path).await { + result.push((ref_name_str.to_string(), oid)); + } + } + } + } + } + } + Err(e) if e.kind() == std::io::ErrorKind::NotFound => { + // Directory doesn't exist - return empty list + break; + } + Err(e) => return Err(e.into()), + } + } + + Ok(result) + } +} + +/// Local filesystem implementation of [`IndexStore`]. +/// +/// Persists each `(account, branch)` snapshot at +/// `{base_dir}/{account}/index/{branch}.json`. The branch component is +/// `validate_ref_name`-checked before any path is constructed to keep crafted +/// names from escaping the per-account directory. +/// +/// All errors degrade to `Ok(None)` on `load`: missing file, decode failure, +/// version skew. Save uses tempfile + rename for atomicity, so a crash mid- +/// write leaves the previous snapshot intact. +pub struct LocalIndexStore { + base_dir: PathBuf, +} + +impl LocalIndexStore { + /// Create a new `LocalIndexStore` rooted at `base_dir`. Per-account + /// subdirectories are created lazily on first save. 
+ pub fn new(base_dir: impl Into) -> Self { + Self { + base_dir: base_dir.into(), + } + } + + fn index_path(&self, account: &str, branch: &str) -> PathBuf { + self.base_dir + .join(account) + .join("index") + .join(format!("{branch}.json")) + } +} + +#[async_trait] +impl IndexStore for LocalIndexStore { + async fn load( + &self, + account: &str, + branch: &str, + ) -> Result, IndexStoreError> { + validate_ref_name(branch) + .map_err(|_| IndexStoreError::InvalidBranch(branch.to_string()))?; + + let path = self.index_path(account, branch); + match tokio::fs::read(&path).await { + Ok(bytes) => match decode_index(&bytes) { + Ok(Some(mut idx)) => { + // Stamp the index with its own on-disk mtime so the commit + // path can apply the racy-clean guard. Same clock + same + // granularity as the working-tree file mtimes it's compared + // against. A stat failure leaves `saved_at_ns = None` + // (conservative: every entry is then treated as racy). + idx.saved_at_ns = tokio::fs::metadata(&path) + .await + .ok() + .and_then(|m| m.modified().ok()) + .and_then(systemtime_to_ns); + Ok(Some(idx)) + } + Ok(None) => Ok(None), + Err(_) => Ok(None), + }, + Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(None), + Err(e) => Err(e.into()), + } + } + + async fn save( + &self, + account: &str, + branch: &str, + index: &CommitIndex, + ) -> Result<(), IndexStoreError> { + validate_ref_name(branch) + .map_err(|_| IndexStoreError::InvalidBranch(branch.to_string()))?; + + let bytes = encode_index(index)?; + let path = self.index_path(account, branch); + if let Some(parent) = path.parent() { + tokio::fs::create_dir_all(parent).await?; + } + let tmp_path = path.with_extension("tmp"); + tokio::fs::write(&tmp_path, &bytes).await?; + tokio::fs::rename(&tmp_path, &path).await?; + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::tempdir; + + #[tokio::test] + async fn test_local_object_store_put_get_exists() { + let dir = tempdir().unwrap(); + let store = 
LocalObjectStore::new(dir.path()); + + let account = "test-account"; + let oid = "0123456789abcdef0123456789abcdef01234567".parse::().unwrap(); + let body = Bytes::from("test content"); + + // Initially doesn't exist + assert!(!store.exists(account, &oid).await.unwrap()); + + // Put object + store.put(account, &oid, body.clone()).await.unwrap(); + + // Now exists + assert!(store.exists(account, &oid).await.unwrap()); + + // Get object and verify + let retrieved = store.get(account, &oid).await.unwrap(); + assert_eq!(retrieved, body); + + // Put again is idempotent + store.put(account, &oid, body).await.unwrap(); + } + + #[tokio::test] + async fn test_local_object_store_concurrent_put_same_oid() { + let dir = tempdir().unwrap(); + let store = Arc::new(LocalObjectStore::new(dir.path())); + + let account = "test-account"; + let oid = "0123456789abcdef0123456789abcdef01234567".parse::().unwrap(); + let body = Bytes::from("test content"); + + // Many concurrent puts for the same not-yet-existing object must all + // succeed: the unique temp name + rename-failure recheck preserves + // idempotency under the race. 
+ let mut handles = Vec::new(); + for _ in 0..32 { + let store = store.clone(); + let body = body.clone(); + handles.push(tokio::spawn(async move { + store.put(account, &oid, body).await + })); + } + for handle in handles { + handle.await.unwrap().unwrap(); + } + + assert_eq!(store.get(account, &oid).await.unwrap(), body); + } + + #[tokio::test] + async fn test_local_ref_store_read_cas_list() { + let dir = tempdir().unwrap(); + let store = LocalRefStore::new(dir.path()); + + let account = "test-account"; + let ref_name = "refs/heads/main"; + let oid1 = "0123456789abcdef0123456789abcdef01234567".parse::().unwrap(); + let oid2 = "fedcba9876543210fedcba9876543210fedcba98".parse::().unwrap(); + + // Initially not found + assert!(matches!( + store.read(account, ref_name).await, + Err(RefStoreError::NotFound(_)) + )); + + // CAS from None should work + store.cas_update(account, ref_name, None, oid1).await.unwrap(); + + // Read should return oid1 + assert_eq!(store.read(account, ref_name).await.unwrap(), oid1); + + // CAS from oid1 to oid2 should work + store.cas_update(account, ref_name, Some(oid1), oid2).await.unwrap(); + assert_eq!(store.read(account, ref_name).await.unwrap(), oid2); + + // CAS with wrong expected should fail + let result = store.cas_update(account, ref_name, Some(oid1), oid1).await; + assert!(matches!(result, Err(RefStoreError::Conflict { .. 
}))); + + // List refs + let refs = store.list(account, "refs/heads/").await.unwrap(); + assert_eq!(refs.len(), 1); + assert_eq!(refs[0].0, "refs/heads/main"); + assert_eq!(refs[0].1, oid2); + } + + #[tokio::test] + async fn test_local_ref_store_concurrent_cas() { + let dir = tempdir().unwrap(); + let store = Arc::new(LocalRefStore::new(dir.path())); + + let account = "test-account"; + let ref_name = "refs/heads/main"; + + // Spawn multiple concurrent cas_update tasks + let mut handles = Vec::new(); + for i in 0..10 { + let store = store.clone(); + let oid = format!("{:040}", i).parse::().unwrap(); + handles.push(tokio::spawn(async move { + // Each task will try to CAS in a loop until it succeeds + let mut attempts = 0; + while attempts < 100 { + let current = store.read(account, ref_name).await.ok(); + match store.cas_update(account, ref_name, current, oid).await { + Ok(_) => return true, + Err(RefStoreError::Conflict { .. }) => attempts += 1, + Err(e) => panic!("unexpected error: {e}"), + } + } + false + })); + } + + // All should succeed eventually + for handle in handles { + assert!(handle.await.unwrap()); + } + } + + fn idx_oid(b: u8) -> ObjectId { + let mut bytes = [0u8; 20]; + bytes.fill(b); + ObjectId::from_bytes_or_panic(&bytes) + } + + #[tokio::test] + async fn local_index_store_round_trip() { + let dir = tempdir().unwrap(); + let store = LocalIndexStore::new(dir.path()); + + // Missing → None + let loaded = store.load("acct", "main").await.unwrap(); + assert!(loaded.is_none()); + + let mut entries = std::collections::HashMap::new(); + entries.insert( + "resources/a.md".into(), + crate::git::types::IndexEntry { + size: 11, + mtime_ns: 1_700_000_000_000_000_000, + oid: idx_oid(0xAA), + }, + ); + let idx = crate::git::index_store::CommitIndex { + parent_oid: idx_oid(0xCC), + entries, + saved_at_ns: None, + }; + + store.save("acct", "main", &idx).await.unwrap(); + let loaded = store.load("acct", "main").await.unwrap().unwrap(); + // parent_oid + entries 
round-trip through the wire format unchanged. + assert_eq!(loaded.parent_oid, idx.parent_oid); + assert_eq!(loaded.entries, idx.entries); + // saved_at_ns is NOT in the wire format — it is stamped from the index + // file's own mtime on load, so a freshly-saved index loads with Some. + assert!( + loaded.saved_at_ns.is_some(), + "load must stamp saved_at_ns from the index file mtime", + ); + } + + #[tokio::test] + async fn local_index_store_corruption_is_soft_miss() { + let dir = tempdir().unwrap(); + let store = LocalIndexStore::new(dir.path()); + + // Manually drop a malformed file at the expected path + let path = dir.path().join("acct").join("index").join("main.json"); + tokio::fs::create_dir_all(path.parent().unwrap()).await.unwrap(); + tokio::fs::write(&path, b"definitely not json").await.unwrap(); + + // Should be Ok(None), not Err + assert!(store.load("acct", "main").await.unwrap().is_none()); + } + + #[tokio::test] + async fn local_index_store_rejects_invalid_branch() { + let dir = tempdir().unwrap(); + let store = LocalIndexStore::new(dir.path()); + + // Path-traversal style branch name → InvalidBranch error + let result = store.load("acct", "../escape").await; + assert!(matches!( + result, + Err(crate::git::index_store::IndexStoreError::InvalidBranch(_)) + )); + } +} diff --git a/crates/ragfs/src/git/backends/mod.rs b/crates/ragfs/src/git/backends/mod.rs new file mode 100644 index 0000000000..834fc129c4 --- /dev/null +++ b/crates/ragfs/src/git/backends/mod.rs @@ -0,0 +1,11 @@ +//! Git storage backends + +pub mod local; + +#[cfg(feature = "s3")] +pub mod s3; + +pub use local::{LocalObjectStore, LocalRefStore}; + +#[cfg(feature = "s3")] +pub use s3::{CasMode, S3Config, S3ObjectStore, S3RefStore}; diff --git a/crates/ragfs/src/git/backends/s3.rs b/crates/ragfs/src/git/backends/s3.rs new file mode 100644 index 0000000000..62d78f62b3 --- /dev/null +++ b/crates/ragfs/src/git/backends/s3.rs @@ -0,0 +1,795 @@ +//! 
S3 storage backend for Git objects and refs + +use std::sync::Arc; + +use async_trait::async_trait; +use aws_sdk_s3::config::BehaviorVersion; +use aws_sdk_s3::config::Credentials; +use aws_sdk_s3::config::Region; +use bytes::Bytes; +use gix_hash::ObjectId; + +use crate::git::error::{ObjectStoreError, RefStoreError}; +use crate::git::index_store::{ + decode_index, encode_index, CommitIndex, IndexStore, IndexStoreError, +}; +use crate::git::object_store::ObjectStore; +use crate::git::ref_store::RefStore; +use crate::git::util::validate_ref_name; + +/// CAS (Compare-and-Swap) mode for S3 ref updates. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum CasMode { + /// Use S3's native conditional writes with If-Match and ETag. + /// This requires the S3 backend to support ETag and conditional headers. + Native, + /// Use a distributed lock (e.g., Redis) for CAS. + /// Not yet implemented - placeholder for future. + #[allow(dead_code)] + RedisLock, +} + +/// S3 configuration for Git storage +#[derive(Debug, Clone)] +pub struct S3Config { + /// S3 bucket name + pub bucket: String, + /// Key prefix for Git storage (e.g. "git/") + pub prefix: String, + /// Region (e.g. "us-east-1") + pub region: String, + /// Optional endpoint URL (for MinIO, LocalStack, TOS, etc.) 
+ pub endpoint: Option, + /// Optional access key ID + pub access_key_id: Option, + /// Optional secret access key + pub secret_access_key: Option, + /// Whether to use path-style addressing (bucket/key vs bucket.host/key) + pub use_path_style: bool, + /// CAS mode for ref updates + pub cas_mode: CasMode, +} + +impl Default for S3Config { + fn default() -> Self { + Self { + bucket: String::new(), + prefix: String::new(), + region: "us-east-1".to_string(), + endpoint: None, + access_key_id: None, + secret_access_key: None, + use_path_style: true, + cas_mode: CasMode::Native, + } + } +} + +/// S3-based object store implementation +pub struct S3ObjectStore { + client: Arc, + bucket: String, + prefix: String, +} + +impl S3ObjectStore { + /// Create a new S3ObjectStore from an existing S3 client + pub fn new(client: Arc, bucket: String, prefix: String) -> Self { + Self { + client, + bucket, + prefix, + } + } + + /// Create a new S3ObjectStore from configuration + pub async fn from_config(config: S3Config) -> Result { + let mut s3_config_builder = aws_sdk_s3::Config::builder() + .behavior_version(BehaviorVersion::latest()) + .region(Region::new(config.region)) + .force_path_style(config.use_path_style); + + // Set endpoint if provided (MinIO, LocalStack, TOS) + if let Some(ep) = config.endpoint { + s3_config_builder = s3_config_builder.endpoint_url(ep); + } + + // Set credentials if provided, otherwise SDK uses default chain + if let (Some(ak), Some(sk)) = (config.access_key_id, config.secret_access_key) { + let creds = Credentials::new(ak, sk, None, None, "ragfs-git"); + s3_config_builder = s3_config_builder.credentials_provider(creds); + } + + let s3_config = s3_config_builder.build(); + let client = Arc::new(aws_sdk_s3::Client::from_conf(s3_config)); + + Ok(Self::new(client, config.bucket, config.prefix)) + } + + /// Build the full S3 key for a Git object + fn object_key(&self, account: &str, oid: &ObjectId) -> String { + build_object_key(&self.prefix, account, oid) + 
} +} + +#[async_trait] +impl ObjectStore for S3ObjectStore { + async fn put( + &self, + account: &str, + oid: &ObjectId, + zlib_body: Bytes, + ) -> Result<(), ObjectStoreError> { + let key = self.object_key(account, oid); + + // Use If-None-Match: "*" to ensure idempotency - only write if not exists + match self + .client + .put_object() + .bucket(&self.bucket) + .key(&key) + .body(zlib_body.to_vec().into()) + .if_none_match("*") + .send() + .await + { + Ok(_) => Ok(()), + Err(aws_sdk_s3::error::SdkError::ServiceError(err)) => { + // Check if the error indicates object already exists + let err_str = format!("{:?}", err); + if err_str.to_lowercase().contains("preconditionfailed") + || err_str.to_lowercase().contains("412") + || err_str.to_lowercase().contains("not modified") + { + // Object already exists - that's fine for idempotency + Ok(()) + } else { + Err(ObjectStoreError::Backend(format!( + "S3 put error: {:?}", + err + ))) + } + } + Err(err) => Err(ObjectStoreError::Backend(format!("S3 put error: {:?}", err))), + } + } + + async fn get(&self, account: &str, oid: &ObjectId) -> Result { + let key = self.object_key(account, oid); + + match self + .client + .get_object() + .bucket(&self.bucket) + .key(&key) + .send() + .await + { + Ok(resp) => { + let bytes = resp + .body + .collect() + .await + .map_err(|e| ObjectStoreError::Backend(format!("S3 read body error: {:?}", e)))?; + Ok(Bytes::copy_from_slice(&bytes.to_vec())) + } + Err(err) => { + // Check if the error indicates object not found + let err_str = format!("{:?}", err); + if err_str.to_lowercase().contains("no_such_key") + || err_str.to_lowercase().contains("404") + { + Err(ObjectStoreError::NotFound(*oid)) + } else { + Err(ObjectStoreError::Backend(format!("S3 get error: {:?}", err))) + } + } + } + } + + async fn exists(&self, account: &str, oid: &ObjectId) -> Result { + let key = self.object_key(account, oid); + + match self + .client + .head_object() + .bucket(&self.bucket) + .key(&key) + .send() + 
.await + { + Ok(_) => Ok(true), + Err(err) => { + // Check if the error indicates object not found + let err_str = format!("{:?}", err); + if err_str.to_lowercase().contains("not_found") + || err_str.to_lowercase().contains("404") + { + Ok(false) + } else { + Err(ObjectStoreError::Backend(format!("S3 head error: {:?}", err))) + } + } + } + } +} + +/// S3-based ref store implementation +pub struct S3RefStore { + client: Arc, + bucket: String, + prefix: String, + cas_mode: CasMode, +} + +impl S3RefStore { + /// Create a new S3RefStore from an existing S3 client + pub fn new(client: Arc, bucket: String, prefix: String) -> Self { + Self { + client, + bucket, + prefix, + cas_mode: CasMode::Native, + } + } + + /// Create a new S3RefStore with explicit CAS mode + pub fn with_cas_mode( + client: Arc, + bucket: String, + prefix: String, + cas_mode: CasMode, + ) -> Self { + Self { + client, + bucket, + prefix, + cas_mode, + } + } + + /// Create a new S3RefStore from configuration + pub async fn from_config(config: S3Config) -> Result { + let mut s3_config_builder = aws_sdk_s3::Config::builder() + .behavior_version(BehaviorVersion::latest()) + .region(Region::new(config.region)) + .force_path_style(config.use_path_style); + + // Set endpoint if provided (MinIO, LocalStack, TOS) + if let Some(ep) = config.endpoint { + s3_config_builder = s3_config_builder.endpoint_url(ep); + } + + // Set credentials if provided, otherwise SDK uses default chain + if let (Some(ak), Some(sk)) = (config.access_key_id, config.secret_access_key) { + let creds = Credentials::new(ak, sk, None, None, "ragfs-git"); + s3_config_builder = s3_config_builder.credentials_provider(creds); + } + + let s3_config = s3_config_builder.build(); + let client = Arc::new(aws_sdk_s3::Client::from_conf(s3_config)); + + Ok(Self::with_cas_mode( + client, + config.bucket, + config.prefix, + config.cas_mode, + )) + } + + /// Build the full S3 key for a Git ref + fn ref_key(&self, account: &str, ref_name: &str) -> String { 
+ build_ref_key(&self.prefix, account, ref_name) + } + + /// Read the current value of a ref, returning None if it doesn't exist + async fn read_ref_opt( + &self, + account: &str, + ref_name: &str, + ) -> Result)>, RefStoreError> { + let key = self.ref_key(account, ref_name); + + match self + .client + .get_object() + .bucket(&self.bucket) + .key(&key) + .send() + .await + { + Ok(resp) => { + let etag = resp.e_tag; + let bytes = resp + .body + .collect() + .await + .map_err(|e| RefStoreError::Backend(format!("S3 read body error: {:?}", e)))?; + let vec_bytes = bytes.to_vec(); + let content = String::from_utf8_lossy(&vec_bytes); + let oid = parse_ref_oid(&content)?; + Ok(Some((oid, etag))) + } + Err(err) => { + // Check if the error indicates ref not found + let err_str = format!("{:?}", err); + if err_str.to_lowercase().contains("no_such_key") + || err_str.to_lowercase().contains("404") + { + Ok(None) + } else { + Err(RefStoreError::Backend(format!("S3 get error: {:?}", err))) + } + } + } + } + + /// Perform native CAS with S3 conditional headers + async fn cas_native( + &self, + account: &str, + ref_name: &str, + expected: Option, + new: ObjectId, + ) -> Result<(), RefStoreError> { + let key = self.ref_key(account, ref_name); + + // First, read to get the current value and ETag + let (current_value, current_etag) = match self.read_ref_opt(account, ref_name).await? 
{ + Some((oid, etag)) => (Some(oid), etag), + None => (None, None), + }; + + // Verify the expected value matches + if current_value != expected { + return Err(RefStoreError::Conflict { + expected, + actual: current_value, + }); + } + + // Prepare the conditional put request + let body = format!("{}\n", new.to_hex()); + let mut put_builder = self + .client + .put_object() + .bucket(&self.bucket) + .key(&key) + .body(body.into_bytes().into()); + + put_builder = match (current_etag, expected) { + (Some(etag), Some(_)) => { + // Existing ref - use If-Match with the current ETag + put_builder.if_match(etag) + } + (None, None) => { + // New ref - use If-None-Match: "*" to ensure it doesn't exist + put_builder.if_none_match("*") + } + _ => { + // This shouldn't happen after our check, but just in case + return Err(RefStoreError::Conflict { + expected, + actual: current_value, + }); + } + }; + + match put_builder.send().await { + Ok(_) => Ok(()), + Err(aws_sdk_s3::error::SdkError::ServiceError(err)) => { + let err_str = format!("{:?}", err); + if err_str.to_lowercase().contains("preconditionfailed") + || err_str.to_lowercase().contains("412") + { + // Conditional check failed - re-read and report conflict + let actual = self.read_ref_opt(account, ref_name).await?.map(|(oid, _)| oid); + Err(RefStoreError::Conflict { expected, actual }) + } else { + Err(RefStoreError::Backend(format!("S3 put error: {:?}", err))) + } + } + Err(err) => Err(RefStoreError::Backend(format!("S3 put error: {:?}", err))), + } + } +} + +#[async_trait] +impl RefStore for S3RefStore { + async fn read(&self, account: &str, ref_name: &str) -> Result { + // Validate ref name + validate_ref_name(ref_name)?; + + self.read_ref_opt(account, ref_name) + .await? 
+ .map(|(oid, _)| oid) + .ok_or_else(|| RefStoreError::NotFound(ref_name.to_string())) + } + + async fn cas_update( + &self, + account: &str, + ref_name: &str, + expected: Option, + new: ObjectId, + ) -> Result<(), RefStoreError> { + // Validate ref name first + validate_ref_name(ref_name)?; + + match self.cas_mode { + CasMode::Native => { + self.cas_native(account, ref_name, expected, new).await + } + CasMode::RedisLock => { + // Redis lock mode not yet implemented + Err(RefStoreError::Backend( + "RedisLock CAS mode not yet implemented".to_string(), + )) + } + } + } + + async fn list( + &self, + account: &str, + prefix: &str, + ) -> Result, RefStoreError> { + let key_prefix = self.ref_key(account, prefix); + let key_prefix = if key_prefix.ends_with('/') { + key_prefix + } else { + format!("{}/", key_prefix) + }; + + let mut result = Vec::new(); + let mut continuation_token = None; + + loop { + let mut req = self + .client + .list_objects_v2() + .bucket(&self.bucket) + .prefix(&key_prefix); + + if let Some(token) = continuation_token { + req = req.continuation_token(token); + } + + let resp = req + .send() + .await + .map_err(|e| RefStoreError::Backend(format!("S3 list error: {:?}", e)))?; + + let next_token = resp.next_continuation_token().map(|s| s.to_string()); + + for obj in resp.contents() { + if let Some(key) = obj.key() { + // Skip directory markers + if key.ends_with('/') { + continue; + } + + // Strip the base prefix to get the ref name + let base_prefix = self.ref_key(account, ""); + let ref_name = key.strip_prefix(&base_prefix).unwrap_or(key); + + // Read the ref value (without ETag) + if let Ok(Some((oid, _))) = self.read_ref_opt(account, ref_name).await { + result.push((ref_name.to_string(), oid)); + } + } + } + + if resp.is_truncated() == Some(true) { + continuation_token = next_token; + } else { + break; + } + } + + Ok(result) + } +} + +/// S3-backed implementation of [`IndexStore`]. 
+/// +/// Stores each `(account, branch)` snapshot at +/// `{prefix}/{account}/index/{branch}.json`. The branch component is +/// `validate_ref_name`-checked before any key is built, so crafted branch +/// names cannot escape the per-account namespace. +/// +/// Save uses a plain `put_object` (last-write-wins) — there is no CAS because +/// the index is a soft-state cache and correctness is enforced at load time +/// via the `parent_oid` check. Decode failures and `NoSuchKey` both surface +/// as `Ok(None)` from `load`. +pub struct S3IndexStore { + client: Arc, + bucket: String, + prefix: String, +} + +impl S3IndexStore { + /// Create a new `S3IndexStore` from an existing S3 client. + pub fn new(client: Arc, bucket: String, prefix: String) -> Self { + Self { + client, + bucket, + prefix, + } + } + + /// Create a new `S3IndexStore` from configuration. Reuses the same + /// credential / endpoint setup as [`S3ObjectStore::from_config`]. + pub async fn from_config(config: S3Config) -> Result { + let mut s3_config_builder = aws_sdk_s3::Config::builder() + .behavior_version(BehaviorVersion::latest()) + .region(Region::new(config.region)) + .force_path_style(config.use_path_style); + + if let Some(ep) = config.endpoint { + s3_config_builder = s3_config_builder.endpoint_url(ep); + } + + if let (Some(ak), Some(sk)) = (config.access_key_id, config.secret_access_key) { + let creds = Credentials::new(ak, sk, None, None, "ragfs-git"); + s3_config_builder = s3_config_builder.credentials_provider(creds); + } + + let s3_config = s3_config_builder.build(); + let client = Arc::new(aws_sdk_s3::Client::from_conf(s3_config)); + + Ok(Self::new(client, config.bucket, config.prefix)) + } + + fn index_key(&self, account: &str, branch: &str) -> String { + build_index_key(&self.prefix, account, branch) + } +} + +#[async_trait] +impl IndexStore for S3IndexStore { + async fn load( + &self, + account: &str, + branch: &str, + ) -> Result, IndexStoreError> { + validate_ref_name(branch) + 
.map_err(|_| IndexStoreError::InvalidBranch(branch.to_string()))?; + + let key = self.index_key(account, branch); + match self + .client + .get_object() + .bucket(&self.bucket) + .key(&key) + .send() + .await + { + Ok(resp) => { + // Capture the object's last-modified time before consuming the + // body — used to stamp the racy-clean guard's `saved_at_ns`. + // S3 timestamps are second-granularity, which widens the racy + // window to ~1s; that is the conservative (safe) direction. + let last_modified_ns = resp.last_modified().and_then(|dt| { + let secs = dt.secs(); + if secs < 0 { + None + } else { + Some(secs as i128 * 1_000_000_000 + dt.subsec_nanos() as i128) + } + }); + let bytes = resp + .body + .collect() + .await + .map_err(|e| IndexStoreError::Backend(format!("S3 read body: {:?}", e)))?; + match decode_index(&bytes.to_vec()) { + Ok(Some(mut idx)) => { + idx.saved_at_ns = last_modified_ns; + Ok(Some(idx)) + } + Ok(None) => Ok(None), + Err(_) => Ok(None), + } + } + Err(err) => { + let err_str = format!("{:?}", err); + if err_str.to_lowercase().contains("no_such_key") + || err_str.to_lowercase().contains("404") + { + Ok(None) + } else { + Err(IndexStoreError::Backend(format!("S3 get error: {:?}", err))) + } + } + } + } + + async fn save( + &self, + account: &str, + branch: &str, + index: &CommitIndex, + ) -> Result<(), IndexStoreError> { + validate_ref_name(branch) + .map_err(|_| IndexStoreError::InvalidBranch(branch.to_string()))?; + + let bytes = encode_index(index)?; + let key = self.index_key(account, branch); + self.client + .put_object() + .bucket(&self.bucket) + .key(&key) + .body(bytes.into()) + .send() + .await + .map_err(|e| IndexStoreError::Backend(format!("S3 put error: {:?}", e)))?; + Ok(()) + } +} + +/// Build the full S3 key for a Git object. +/// +/// Layout: `{prefix}/{account}/objects/{aa}/{bb..}` where `aa` is the first two +/// hex chars of the object id. When `prefix` is empty the leading segment is +/// omitted. 
A trailing slash on `prefix` is ignored. +fn build_object_key(prefix: &str, account: &str, oid: &ObjectId) -> String { + let hex = oid.to_hex().to_string(); + let prefix = prefix.trim_end_matches('/'); + if prefix.is_empty() { + format!("{}/objects/{}/{}", account, &hex[..2], &hex[2..]) + } else { + format!("{}/{}/objects/{}/{}", prefix, account, &hex[..2], &hex[2..]) + } +} + +/// Build the full S3 key for a Git ref. +/// +/// Layout: `{prefix}/{account}/{ref_name}`. When `prefix` is empty the leading +/// segment is omitted. A trailing slash on `prefix` is ignored. +fn build_ref_key(prefix: &str, account: &str, ref_name: &str) -> String { + let prefix = prefix.trim_end_matches('/'); + if prefix.is_empty() { + format!("{}/{}", account, ref_name) + } else { + format!("{}/{}/{}", prefix, account, ref_name) + } +} + +/// Parse the trimmed content of a ref blob into an [`ObjectId`]. +/// +/// Returns [`RefStoreError::Backend`] when the content is not a valid object id. +fn parse_ref_oid(content: &str) -> Result { + let trimmed = content.trim(); + trimmed + .parse::() + .map_err(|_| RefStoreError::Backend(format!("invalid oid in ref: {}", trimmed))) +} + +/// Build the full S3 key for a persisted commit index. +/// +/// Layout: `{prefix}/{account}/index/{branch}.json`. When `prefix` is empty +/// the leading segment is omitted. A trailing slash on `prefix` is ignored. +fn build_index_key(prefix: &str, account: &str, branch: &str) -> String { + let prefix = prefix.trim_end_matches('/'); + if prefix.is_empty() { + format!("{}/index/{}.json", account, branch) + } else { + format!("{}/{}/index/{}.json", prefix, account, branch) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + /// A valid 40-char SHA-1 hex object id used across tests. 
+ const VALID_OID_HEX: &str = "0123456789abcdef0123456789abcdef01234567"; + + #[test] + fn test_cas_mode_variants() { + assert_eq!(CasMode::Native, CasMode::Native); + assert_ne!(CasMode::Native, CasMode::RedisLock); + } + + #[test] + fn test_s3_config_default() { + let config = S3Config::default(); + assert_eq!(config.region, "us-east-1"); + assert_eq!(config.use_path_style, true); + assert_eq!(config.cas_mode, CasMode::Native); + } + + #[test] + fn test_object_key_with_prefix() { + let oid: ObjectId = VALID_OID_HEX.parse().unwrap(); + let key = build_object_key("git", "acct1", &oid); + assert_eq!( + key, + "git/acct1/objects/01/23456789abcdef0123456789abcdef01234567" + ); + } + + #[test] + fn test_object_key_empty_prefix() { + let oid: ObjectId = VALID_OID_HEX.parse().unwrap(); + let key = build_object_key("", "acct1", &oid); + assert_eq!( + key, + "acct1/objects/01/23456789abcdef0123456789abcdef01234567" + ); + } + + #[test] + fn test_object_key_trailing_slash_prefix() { + let oid: ObjectId = VALID_OID_HEX.parse().unwrap(); + // A trailing slash on the prefix must not produce a double slash. 
+ let key = build_object_key("git/", "acct1", &oid); + assert_eq!( + key, + "git/acct1/objects/01/23456789abcdef0123456789abcdef01234567" + ); + } + + #[test] + fn test_ref_key_with_prefix() { + let key = build_ref_key("git", "acct1", "refs/heads/main"); + assert_eq!(key, "git/acct1/refs/heads/main"); + } + + #[test] + fn test_ref_key_empty_prefix() { + let key = build_ref_key("", "acct1", "refs/heads/main"); + assert_eq!(key, "acct1/refs/heads/main"); + } + + #[test] + fn test_ref_key_trailing_slash_prefix() { + let key = build_ref_key("git/", "acct1", "refs/heads/main"); + assert_eq!(key, "git/acct1/refs/heads/main"); + } + + #[test] + fn test_parse_ref_oid_valid() { + let oid = parse_ref_oid(VALID_OID_HEX).unwrap(); + assert_eq!(oid.to_hex().to_string(), VALID_OID_HEX); + } + + #[test] + fn test_parse_ref_oid_valid_with_whitespace() { + // Ref blobs are commonly written with a trailing newline. + let oid = parse_ref_oid(&format!(" {}\n", VALID_OID_HEX)).unwrap(); + assert_eq!(oid.to_hex().to_string(), VALID_OID_HEX); + } + + #[test] + fn test_parse_ref_oid_invalid_non_hex() { + let err = parse_ref_oid("not-a-valid-oid").unwrap_err(); + assert!(matches!(err, RefStoreError::Backend(_))); + } + + #[test] + fn test_parse_ref_oid_invalid_wrong_length() { + // Valid hex but too short to be a SHA-1 object id. 
+ let err = parse_ref_oid("0123abcd").unwrap_err(); + assert!(matches!(err, RefStoreError::Backend(_))); + } + + #[test] + fn test_index_key_with_prefix() { + let key = build_index_key("git", "acct1", "main"); + assert_eq!(key, "git/acct1/index/main.json"); + } + + #[test] + fn test_index_key_empty_prefix() { + let key = build_index_key("", "acct1", "main"); + assert_eq!(key, "acct1/index/main.json"); + } + + #[test] + fn test_index_key_trailing_slash_prefix() { + let key = build_index_key("git/", "acct1", "main"); + assert_eq!(key, "git/acct1/index/main.json"); + } +} diff --git a/crates/ragfs/src/git/commit.rs b/crates/ragfs/src/git/commit.rs new file mode 100644 index 0000000000..ec19ee5380 --- /dev/null +++ b/crates/ragfs/src/git/commit.rs @@ -0,0 +1,107 @@ +//! Git commit object construction. + +use gix_hash::ObjectId; +use gix_object::{bstr::BString, Commit, WriteTo}; + +use crate::git::error::ObjectStoreError; +use crate::git::object_store::ObjectStore; +use crate::git::util::write_object; + +/// Build a `gix_object::Commit` and write it via the existing `write_object` +/// helper. Uses `gix_date::Time::now_local_or_utc()` for both author and +/// committer timestamps. Returns the new commit's ObjectId. 
+pub async fn write_commit( + store: &dyn ObjectStore, + account: &str, + tree: ObjectId, + parents: Vec, + author_name: &str, + author_email: &str, + message: &str, +) -> Result { + let now = gix_date::Time::now_local_or_utc(); + let actor = gix_actor::Signature { + name: author_name.into(), + email: author_email.into(), + time: now, + }; + let commit = Commit { + tree, + parents: parents.into(), + author: actor.clone(), + committer: actor, + encoding: None, + message: BString::from(message), + extra_headers: Vec::new(), + }; + let mut buf = Vec::with_capacity(256); + commit + .write_to(&mut buf) + .map_err(|e| ObjectStoreError::Backend(format!("commit encode: {e}")))?; + write_object(store, account, gix_object::Kind::Commit, &buf).await +} + +#[cfg(test)] +mod tests { + use super::*; + + #[tokio::test] + async fn test_write_commit_round_trip_no_parent() { + use tempfile::tempdir; + use crate::git::backends::local::LocalObjectStore; + use crate::git::util::{read_object, parse_object_header}; + use gix_object::bstr::ByteSlice; + + let dir = tempdir().unwrap(); + let store = LocalObjectStore::new(dir.path()); + + let tree = gix_hash::ObjectId::empty_tree(gix_hash::Kind::Sha1); + let oid = write_commit( + &store, "acct", + tree, + vec![], // root commit, no parents + "alice", "alice@example.com", + "init", + ).await.unwrap(); + + let raw = read_object(&store, "acct", &oid).await.unwrap(); + let (kind, _, hdr) = parse_object_header(&raw).unwrap(); + assert_eq!(kind, gix_object::Kind::Commit); + let parsed = gix_object::CommitRef::from_bytes(&raw[hdr..]).unwrap(); + assert_eq!(parsed.tree(), tree); + assert_eq!(parsed.parents().count(), 0); + assert_eq!(parsed.message, b"init".as_bstr()); + assert_eq!(parsed.author.name, b"alice".as_bstr()); + assert_eq!(parsed.author.email, b"alice@example.com".as_bstr()); + } + + #[tokio::test] + async fn test_write_commit_with_parent() { + use tempfile::tempdir; + use crate::git::backends::local::LocalObjectStore; + use 
crate::git::util::{read_object, parse_object_header}; + + let dir = tempdir().unwrap(); + let store = LocalObjectStore::new(dir.path()); + + let tree = gix_hash::ObjectId::empty_tree(gix_hash::Kind::Sha1); + let parent = gix_hash::ObjectId::from_hex( + b"1234567890abcdef1234567890abcdef12345678" + ).unwrap(); + + let oid = write_commit( + &store, "acct", + tree, + vec![parent], + "bob", "bob@example.com", + "child commit", + ).await.unwrap(); + + let raw = read_object(&store, "acct", &oid).await.unwrap(); + let (_, _, hdr) = parse_object_header(&raw).unwrap(); + let parsed = gix_object::CommitRef::from_bytes(&raw[hdr..]).unwrap(); + let parents: Vec<_> = parsed.parents().collect(); + assert_eq!(parents.len(), 1); + assert_eq!(parents[0], parent); + } +} diff --git a/crates/ragfs/src/git/config.rs b/crates/ragfs/src/git/config.rs new file mode 100644 index 0000000000..29bcf2b95e --- /dev/null +++ b/crates/ragfs/src/git/config.rs @@ -0,0 +1,189 @@ +//! Git module config types loaded from the [git] section of the binding TOML. 
+ +use serde::Deserialize; + +#[derive(Debug, Clone, Deserialize)] +pub struct GitConfig { + #[serde(default)] + pub enabled: bool, + #[serde(default = "default_backend")] + pub backend: String, + #[serde(default = "default_branch")] + pub default_branch: String, + #[serde(default = "default_author_name")] + pub author_name: String, + #[serde(default = "default_author_email")] + pub author_email: String, + + #[serde(default)] + pub local: Option, + #[serde(default)] + pub s3: Option, + + #[serde(default)] + pub tuning: GitTuningConfig, +} + +#[derive(Debug, Clone, Deserialize)] +pub struct GitLocalConfig { + pub base_dir: String, +} + +#[derive(Debug, Clone, Deserialize)] +pub struct GitS3ConfigPy { + pub bucket: String, + #[serde(default = "default_s3_prefix")] + pub prefix: String, + pub region: String, + #[serde(default)] + pub endpoint: String, + #[serde(default)] + pub access_key: Option, + #[serde(default)] + pub secret_key: Option, + #[serde(default = "default_cas_mode")] + pub cas_mode: String, + #[serde(default)] + pub redis_lock_url: Option, + #[serde(default = "default_true")] + pub use_path_style: bool, +} + +#[derive(Debug, Clone, Deserialize)] +pub struct GitTuningConfig { + #[serde(default = "default_upload_concurrency")] + pub upload_concurrency: usize, + #[serde(default = "default_restore_concurrency")] + pub restore_concurrency: usize, + #[serde(default = "default_ref_cas_max_retry")] + pub ref_cas_max_retry: u32, + #[serde(default = "default_ref_cas_backoff_ms")] + pub ref_cas_backoff_ms: u64, + /// Enable Fast Path 1: skip read+SHA-1 for files whose `(size, mtime_ns)` + /// match the previous commit's persisted index. Defaults to `true`; set + /// to `false` to force the slow path on every commit (useful for tests + /// and for environments with unreliable mtimes). 
+ #[serde(default = "default_true")] + pub commit_index_enabled: bool, + /// Enable Fast Path 3: on the commit slow path, run an `exists()` precheck + /// before compressing and putting a blob, skipping the write when the + /// object already exists. Defaults to `true`. `put` is idempotent, so this + /// only affects backend call counts, never commit results. + #[serde(default = "default_true")] + pub blob_exists_precheck_enabled: bool, +} + +impl Default for GitTuningConfig { + fn default() -> Self { + Self { + upload_concurrency: default_upload_concurrency(), + restore_concurrency: default_restore_concurrency(), + ref_cas_max_retry: default_ref_cas_max_retry(), + ref_cas_backoff_ms: default_ref_cas_backoff_ms(), + commit_index_enabled: default_true(), + blob_exists_precheck_enabled: default_true(), + } + } +} + +fn default_backend() -> String { + "local".to_string() +} +fn default_branch() -> String { + "main".to_string() +} +fn default_author_name() -> String { + "openviking-bot".to_string() +} +fn default_author_email() -> String { + "bot@openviking.local".to_string() +} +fn default_s3_prefix() -> String { + ".ovgit".to_string() +} +fn default_cas_mode() -> String { + "native".to_string() +} +fn default_upload_concurrency() -> usize { + 64 +} +fn default_restore_concurrency() -> usize { + 32 +} +fn default_ref_cas_max_retry() -> u32 { + 3 +} +fn default_ref_cas_backoff_ms() -> u64 { + 50 +} +fn default_true() -> bool { + true +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn parses_minimal_local_config() { + let toml_src = r#" + enabled = true + backend = "local" + + [local] + base_dir = "/tmp/ov-git" + "#; + let cfg: GitConfig = toml::from_str(toml_src).unwrap(); + assert!(cfg.enabled); + assert_eq!(cfg.backend, "local"); + assert_eq!(cfg.default_branch, "main"); + assert_eq!(cfg.author_name, "openviking-bot"); + assert_eq!(cfg.author_email, "bot@openviking.local"); + assert_eq!(cfg.local.as_ref().unwrap().base_dir, "/tmp/ov-git"); + 
assert!(cfg.s3.is_none()); + assert_eq!(cfg.tuning.upload_concurrency, 64); + assert_eq!(cfg.tuning.restore_concurrency, 32); + assert_eq!(cfg.tuning.ref_cas_max_retry, 3); + assert_eq!(cfg.tuning.ref_cas_backoff_ms, 50); + assert!(cfg.tuning.commit_index_enabled); + assert!(cfg.tuning.blob_exists_precheck_enabled); + } + + #[test] + fn parses_s3_config_with_overrides() { + let toml_src = r#" + enabled = true + backend = "s3" + default_branch = "trunk" + author_name = "alice" + author_email = "alice@example.com" + + [s3] + bucket = "ov-bucket" + region = "us-west-2" + endpoint = "https://s3.example.com" + access_key = "AKxxx" + secret_key = "SKxxx" + + [tuning] + upload_concurrency = 128 + "#; + let cfg: GitConfig = toml::from_str(toml_src).unwrap(); + assert_eq!(cfg.backend, "s3"); + assert_eq!(cfg.default_branch, "trunk"); + let s3 = cfg.s3.as_ref().unwrap(); + assert_eq!(s3.bucket, "ov-bucket"); + assert_eq!(s3.prefix, ".ovgit"); + assert_eq!(s3.region, "us-west-2"); + assert_eq!(s3.cas_mode, "native"); + assert_eq!(cfg.tuning.upload_concurrency, 128); + assert_eq!(cfg.tuning.restore_concurrency, 32); + } + + #[test] + fn defaults_when_section_minimal() { + let cfg: GitConfig = toml::from_str("").unwrap(); + assert!(!cfg.enabled); + assert_eq!(cfg.backend, "local"); + } +} diff --git a/crates/ragfs/src/git/enumerate.rs b/crates/ragfs/src/git/enumerate.rs new file mode 100644 index 0000000000..088d877e57 --- /dev/null +++ b/crates/ragfs/src/git/enumerate.rs @@ -0,0 +1,405 @@ +//! VFS enumeration with pruning rules from design §4.2. +//! +//! Walks an account's VFS subtree at `/local/{account}` and returns +//! account-relative paths of files that must be included in a Git snapshot. +//! +//! Pruning rules (verbatim from design §4.2): +//! +//! | Skip if account-relative path matches | Reason | +//! |---|---| +//! | First path segment ∈ {`_system`, `tasks`, `temp`, `queue`, `upload`} | Internal scopes + runtime locks | +//! 
| Any segment equals `.path.ovlock` OR starts with `.path.ovlock` | Runtime lock | +//! | File extension is `.faiss` or `.index`, OR path contains an `embedding_cache/` segment | Vector index — derived data | +//! +//! L0/L1 derived files (`.abstract.md`, `.overview.md`, `.relations.json`) +//! are intentionally KEPT — design §4.2 says they belong in snapshots. + +use std::sync::Arc; + +use crate::core::filesystem::FileSystem; +use crate::git::error::GitError; + +/// First-segment internal scope names that are pruned. +/// +/// Combines `INTERNAL_SCOPES` (system/tasks/temp/queue/upload) and +/// `VikingFS._INTERNAL_NAMES` from the design. +const INTERNAL_FIRST_SEGMENTS: &[&str] = &["_system", "tasks", "temp", "queue", "upload"]; + +/// Returns true if this account-relative path must be excluded from commits. +/// +/// `rel` is the path relative to the account root (no leading "/", no +/// "/local/{account}/" prefix). Examples of valid input: +/// - "resources/a.md" +/// - "_system/lock" +/// - ".path.ovlock" +pub fn prune_path(rel: &str) -> bool { + if rel.is_empty() { + return true; + } + + let segments: Vec<&str> = rel.split('/').filter(|s| !s.is_empty()).collect(); + if segments.is_empty() { + return true; + } + + // Rule 1: first path segment in INTERNAL_FIRST_SEGMENTS + if INTERNAL_FIRST_SEGMENTS.contains(&segments[0]) { + return true; + } + + // Rule 2: any segment equals or starts with ".path.ovlock" + for seg in &segments { + if seg.starts_with(".path.ovlock") { + return true; + } + } + + // Rule 3a: any intermediate segment named exactly "embedding_cache" + // (i.e. path contains an `embedding_cache/` segment). 
+ for seg in &segments[..segments.len().saturating_sub(1)] { + if *seg == "embedding_cache" { + return true; + } + } + + // Rule 3b: extension is .faiss or .index + if let Some(last) = segments.last() { + if last.ends_with(".faiss") || last.ends_with(".index") { + return true; + } + } + + false +} + +/// Enumerate every versionable file in an account's VFS subtree. +/// +/// Returns paths relative to the account root (no leading "/", no +/// "/local/{account}/" prefix). Directories are filtered out; only file +/// entries that survive pruning are returned. +pub async fn collect_all( + vfs: &Arc, + account: &str, +) -> Result, GitError> { + collect_under(vfs, account, "").await +} + +/// Enumerate every versionable file under an account-relative subdirectory. +/// +/// `sub_path` is account-relative (no leading "/", no trailing "/"). An empty +/// `sub_path` enumerates the entire account tree (equivalent to +/// [`collect_all`]). Returned paths are account-relative and include the +/// `sub_path` prefix. Directories are filtered out; only file entries that +/// survive [`prune_path`] (applied to the account-relative form) are returned. +pub async fn collect_under( + vfs: &Arc, + account: &str, + sub_path: &str, +) -> Result, GitError> { + let root = if sub_path.is_empty() { + format!("/local/{}", account) + } else { + format!("/local/{}/{}", account, sub_path) + }; + let account_prefix = format!("/local/{}/", account); + + let entries = vfs.tree_directory(&root, true, None, None).await?; + + let mut survivors = Vec::new(); + for entry in entries { + if entry.info.is_dir { + continue; + } + + // Strip "/local/{account}/" so pruning sees account-relative paths. + // If the entry doesn't carry that prefix, skip it (defensive — should + // not happen for a well-formed VFS). 
+ let rel = match entry.path.strip_prefix(&account_prefix) { + Some(r) => r, + None => continue, + }; + + if prune_path(rel) { + continue; + } + + survivors.push(rel.to_string()); + } + + Ok(survivors) +} + +#[cfg(test)] +mod tests { + use super::*; + use async_trait::async_trait; + use std::collections::HashMap; + use std::sync::Arc; + + use crate::core::errors::Result; + use crate::core::filesystem::FileSystem; + use crate::core::types::{FileInfo, TreeEntry, WriteFlag}; + + /// In-test mock that directly overrides `tree_directory` to return a + /// precomputed list of (path, is_dir). All other trait methods are + /// unimplemented because `collect_all` only calls `tree_directory`. + struct MockFS { + /// Map: root path -> list of (absolute path, is_dir) + entries_by_root: HashMap>, + } + + impl MockFS { + fn new(root: &str, entries: Vec<(&str, bool)>) -> Self { + let mut map = HashMap::new(); + map.insert( + root.to_string(), + entries + .into_iter() + .map(|(p, d)| (p.to_string(), d)) + .collect(), + ); + Self { + entries_by_root: map, + } + } + } + + #[async_trait] + impl FileSystem for MockFS { + async fn create(&self, _path: &str) -> Result<()> { + unimplemented!() + } + async fn mkdir(&self, _path: &str, _mode: u32) -> Result<()> { + unimplemented!() + } + async fn remove(&self, _path: &str) -> Result<()> { + unimplemented!() + } + async fn remove_all(&self, _path: &str) -> Result<()> { + unimplemented!() + } + async fn read(&self, _path: &str, _offset: u64, _size: u64) -> Result> { + unimplemented!() + } + async fn write( + &self, + _path: &str, + _data: &[u8], + _offset: u64, + _flags: WriteFlag, + ) -> Result { + unimplemented!() + } + async fn read_dir(&self, _path: &str) -> Result> { + unimplemented!() + } + async fn stat(&self, _path: &str) -> Result { + unimplemented!() + } + async fn rename(&self, _old_path: &str, _new_path: &str) -> Result<()> { + unimplemented!() + } + async fn chmod(&self, _path: &str, _mode: u32) -> Result<()> { + 
unimplemented!() + } + + /// Override tree_directory to return precomputed entries, bypassing + /// the default read_dir/stat-based recursion. + async fn tree_directory( + &self, + path: &str, + _show_hidden: bool, + _node_limit: Option, + _level_limit: Option, + ) -> Result> { + let raw = self.entries_by_root.get(path).cloned().unwrap_or_default(); + + let prefix = if path == "/" { + "/".to_string() + } else { + format!("{}/", path) + }; + + let mut out = Vec::new(); + for (full_path, is_dir) in raw { + let rel_path = full_path + .strip_prefix(&prefix) + .unwrap_or(&full_path) + .to_string(); + let name = full_path + .rsplit('/') + .next() + .unwrap_or(&full_path) + .to_string(); + let info = if is_dir { + FileInfo::new_dir(name, 0o755) + } else { + FileInfo::new_file(name, 0, 0o644) + }; + out.push(TreeEntry { + path: full_path, + rel_path, + info, + extra: HashMap::new(), + }); + } + Ok(out) + } + } + + #[tokio::test] + async fn test_collect_all_prunes_internal_scopes_and_vector_indexes() { + let mock = MockFS::new( + "/local/acct", + vec![ + ("/local/acct/resources/a.md", false), + ("/local/acct/agent/b.py", false), + ("/local/acct/_system/lock", false), + ("/local/acct/tasks/job.json", false), + ("/local/acct/temp/upload.bin", false), + ("/local/acct/queue/x", false), + ("/local/acct/upload/y", false), + ("/local/acct/resources/x.faiss", false), + ("/local/acct/resources/x.index", false), + ("/local/acct/resources/embedding_cache/v.bin", false), + ], + ); + let fs: Arc = Arc::new(mock); + + let mut got = collect_all(&fs, "acct").await.unwrap(); + got.sort(); + + let mut expected = vec!["agent/b.py".to_string(), "resources/a.md".to_string()]; + expected.sort(); + + assert_eq!(got, expected); + } + + #[tokio::test] + async fn test_collect_all_keeps_derived_l0_l1_files() { + let mock = MockFS::new( + "/local/acct", + vec![ + ("/local/acct/resources/x.md", false), + ("/local/acct/resources/x.md.abstract.md", false), + ("/local/acct/resources/x.md.overview.md", 
false), + ("/local/acct/resources/x.md.relations.json", false), + ], + ); + let fs: Arc = Arc::new(mock); + + let mut got = collect_all(&fs, "acct").await.unwrap(); + got.sort(); + + let mut expected = vec![ + "resources/x.md".to_string(), + "resources/x.md.abstract.md".to_string(), + "resources/x.md.overview.md".to_string(), + "resources/x.md.relations.json".to_string(), + ]; + expected.sort(); + + assert_eq!(got, expected); + } + + #[tokio::test] + async fn test_collect_all_returns_account_relative_paths() { + let mock = MockFS::new("/local/acct", vec![("/local/acct/resources/a.md", false)]); + let fs: Arc = Arc::new(mock); + + let got = collect_all(&fs, "acct").await.unwrap(); + assert_eq!(got, vec!["resources/a.md".to_string()]); + // Defensive: ensure no absolute leakage. + for p in &got { + assert!(!p.starts_with('/'), "path should not be absolute: {}", p); + assert!( + !p.contains("/local/acct"), + "path should not contain account prefix: {}", + p + ); + } + } + + #[tokio::test] + async fn test_collect_under_returns_paths_below_subdir_with_prefix() { + let mock = MockFS::new( + "/local/acct/resources", + vec![ + ("/local/acct/resources/a.md", false), + ("/local/acct/resources/sub/b.md", false), + ("/local/acct/resources/x.faiss", false), + ], + ); + let fs: Arc = Arc::new(mock); + + let mut got = collect_under(&fs, "acct", "resources").await.unwrap(); + got.sort(); + + let mut expected = vec![ + "resources/a.md".to_string(), + "resources/sub/b.md".to_string(), + ]; + expected.sort(); + + assert_eq!(got, expected); + } + + #[tokio::test] + async fn test_collect_under_empty_sub_path_equivalent_to_collect_all() { + let mock = MockFS::new( + "/local/acct", + vec![ + ("/local/acct/resources/a.md", false), + ("/local/acct/_system/lock", false), + ], + ); + let fs: Arc = Arc::new(mock); + + let got_under = collect_under(&fs, "acct", "").await.unwrap(); + let got_all = collect_all(&fs, "acct").await.unwrap(); + assert_eq!(got_under, got_all); + assert_eq!(got_under, 
vec!["resources/a.md".to_string()]); + } + + #[tokio::test] + async fn test_collect_under_applies_pruning_on_account_relative_form() { + // A `_system` sub_path is itself a pruned prefix — every file under it + // is pruned because the first account-relative segment is "_system". + let mock = MockFS::new( + "/local/acct/_system", + vec![("/local/acct/_system/lock", false)], + ); + let fs: Arc = Arc::new(mock); + + let got = collect_under(&fs, "acct", "_system").await.unwrap(); + assert!(got.is_empty(), "everything under _system must be pruned"); + } + + #[test] + fn test_prune_path_table() { + // Pruned + assert!(prune_path("_system/lock")); + assert!(prune_path("tasks/job.json")); + assert!(prune_path("temp/x")); + assert!(prune_path("queue/x")); + assert!(prune_path("upload/x")); + assert!(prune_path("resources/.path.ovlock")); + assert!(prune_path(".path.ovlock")); + assert!(prune_path("resources/x.faiss")); + assert!(prune_path("resources/x.index")); + assert!(prune_path("resources/embedding_cache/v.bin")); + assert!(prune_path("agent/embedding_cache/something")); + + // Survivors + assert!(!prune_path("resources/a.md")); + assert!(!prune_path("agent/skills/b.py")); + assert!(!prune_path("resources/x.md.abstract.md")); + assert!(!prune_path("resources/x.md.overview.md")); + assert!(!prune_path("resources/x.md.relations.json")); + // "_systemfoo" is NOT "_system", must survive. + assert!(!prune_path("_systemfoo/x")); + // Per "any segment starting with .path.ovlock", ".path.ovlocking" + // is pruned even though it isn't exactly ".path.ovlock". + assert!(prune_path("resources/.path.ovlocking")); + } +} diff --git a/crates/ragfs/src/git/error.rs b/crates/ragfs/src/git/error.rs new file mode 100644 index 0000000000..12bd8b8b22 --- /dev/null +++ b/crates/ragfs/src/git/error.rs @@ -0,0 +1,170 @@ +//! 
Git module error types + +use thiserror::Error; + +/// Errors from ObjectStore operations +#[derive(Debug, Error)] +pub enum ObjectStoreError { + /// Object not found + #[error("object not found: {0}")] + NotFound(gix_hash::ObjectId), + + /// I/O error + #[error("i/o error: {0}")] + Io(#[from] std::io::Error), + + /// Zlib decompression error + #[error("zlib error: {0}")] + Zlib(String), + + /// ObjectId mismatch (content integrity check failed) + #[error("oid mismatch: expected {expected}, got {actual}")] + OidMismatch { + expected: gix_hash::ObjectId, + actual: gix_hash::ObjectId, + }, + + /// Backend-specific error + #[error("backend error: {0}")] + Backend(String), +} + +/// Errors from RefStore operations +#[derive(Debug, Error)] +pub enum RefStoreError { + /// Ref not found + #[error("ref not found: {0}")] + NotFound(String), + + /// CAS conflict - expected value didn't match actual + #[error("cas conflict: expected {expected:?}, actual {actual:?}")] + Conflict { + expected: Option, + actual: Option, + }, + + /// Invalid ref name (failed validation) + #[error("invalid ref name: {0}")] + InvalidName(String), + + /// I/O error + #[error("i/o error: {0}")] + Io(#[from] std::io::Error), + + /// Backend-specific error + #[error("backend error: {0}")] + Backend(String), +} + +/// Top-level Git service error +#[derive(Debug, Error)] +pub enum GitError { + /// ObjectStore error + #[error("object store error: {0}")] + ObjectStore(#[from] ObjectStoreError), + + /// RefStore error + #[error("ref store error: {0}")] + RefStore(#[from] RefStoreError), + + /// Path not found in tree + #[error("path not found in tree: {0}")] + PathNotFound(String), + + /// Path exists in tree but resolves to a directory (tree), not a blob. + /// Returned by `show()` when the caller asked for blob bytes at a path + /// that turned out to be a subdirectory. + #[error("path is a directory, not a file: {0}")] + PathIsDirectory(String), + + /// `project_dir` is an empty / malformed path string. 
+ /// Same validation as `TreeEditor::upsert`: must be non-empty, no leading + /// or trailing `/`, no empty components, no `.` / `..` / backslash / + /// control char components. + #[error("invalid project_dir: {0}")] + InvalidProjectDir(String), + + /// A user-supplied relative path in `CommitRequest.paths` or + /// `ShowRequest.path` failed validation. The Rust GitService is a native + /// boundary (PyO3 bindings, future SDK callers), so it cannot rely on + /// upstream HTTP / SDK layers to have already normalized away `..` / + /// `\` / control chars. Rejected before any VFS or object-store I/O. + #[error("invalid path: {0}")] + InvalidPath(String), + + /// The requested `project_dir` does not resolve to a subtree in the + /// referenced commit's tree (either the path is missing entirely or it + /// resolves to a blob rather than a tree). + #[error("project_dir {project_dir:?} not found as a subtree in commit {commit}")] + SubtreeNotFoundInCommit { + project_dir: String, + commit: gix_hash::ObjectId, + }, + + /// Invalid account ID + #[error("invalid account id: {0}")] + InvalidAccountId(String), + + /// Concurrent commit conflict + #[error("concurrent commit: ref {ref_name} changed during commit (expected {expected:?}, actual {actual:?})")] + ConcurrentCommit { + ref_name: String, + expected: Option, + actual: Option, + }, + + /// `restore()` advanced the branch ref to the new commit, but at least + /// one per-path VFS write or delete failed afterwards. The branch ref + /// already points at `new_commit_oid`; the caller must use the payload + /// to drive reindex of the paths that did reach the VFS and report the + /// failures to whoever needs to retry them. 
+ #[error( + "restore writeback partial: {writes_failed} write(s) and {deletes_failed} delete(s) failed after ref advanced to {new_commit}", + writes_failed = .0.failed_writes.len(), + deletes_failed = .0.failed_deletes.len(), + new_commit = .0.new_commit_oid, + )] + RestoreWritebackPartial(Box), + + /// Blob too large + #[error("blob too large: {size} bytes exceeds limit {limit} bytes")] + BlobTooLarge { size: u64, limit: u64 }, + + /// Too many files in commit + #[error("too many files: {count} exceeds limit {limit}")] + TooManyFiles { count: usize, limit: usize }, + + /// Feature not enabled + #[error("git feature not enabled")] + FeatureDisabled, + + /// Corrupted object + #[error("corrupted object: {0}")] + CorruptedObject(String), + + /// No object matched the abbreviated OID prefix + #[error("no commit found matching OID prefix {prefix}")] + OidPrefixNotFound { prefix: String }, + + /// Multiple objects matched the abbreviated OID prefix + #[error("ambiguous OID prefix {prefix} matches {count} commits: {candidates}")] + AmbiguousOid { + prefix: String, + count: usize, + candidates: String, + }, + + /// Other error + #[error("{0}")] + Other(String), + + /// Vfs error wrapper + #[error("vfs: {0}")] + Vfs(String), +} + +impl From for GitError { + fn from(e: crate::core::errors::Error) -> Self { + GitError::Vfs(e.to_string()) + } +} diff --git a/crates/ragfs/src/git/index_store.rs b/crates/ragfs/src/git/index_store.rs new file mode 100644 index 0000000000..191e0efea5 --- /dev/null +++ b/crates/ragfs/src/git/index_store.rs @@ -0,0 +1,279 @@ +//! Persistent commit index ("Fast Path 1" stat cache). +//! +//! After each successful commit, `GitService::commit` saves a snapshot of +//! `(path -> size, mtime_ns, oid)` for every file that was part of the +//! resulting tree. The next commit can then skip the read+SHA-1+write path +//! for any file whose `(size, mtime_ns)` match the cached entry — saving +//! 
the expensive blob materialization that produces the same OID we already +//! have. +//! +//! Correctness guard: every saved index records its `parent_oid` (the commit +//! the index reflects). On load, if `parent_oid != prev_head` (concurrent +//! commit, branch switch, first run) the cache is silently discarded and +//! commit proceeds via the slow path. Cache misses are *always* a soft +//! failure — we never produce an incorrect commit because of a stale or +//! corrupt index. +//! +//! Racy-clean guard: `(size, mtime_ns)` alone cannot distinguish "unchanged" +//! from "changed to the same size within the same filesystem clock tick as +//! the last commit". To close that window (the classic git "racy clean" +//! problem) the store records *when the index was written* in +//! [`CommitIndex::saved_at_ns`] — populated at `load()` time from the index +//! file's own mtime (local) or object `last_modified` (S3), i.e. the same +//! clock and granularity as the working-tree file mtimes it is compared +//! against. A cache entry is only trusted when its `mtime_ns` is strictly +//! older than `saved_at_ns`; an entry whose mtime is in the same tick as (or +//! newer than) the index write is "racy" and forced down the slow path. This +//! costs at most one extra read per file, and only for files that shared a +//! clock tick with a commit — steady-state Fast Path 1 is unaffected. +//! +//! The wire format is JSON for debuggability. OIDs are stored as 40-char hex. +//! Any deserialization or backend error is mapped to `Ok(None)` on `load()`, +//! and `save()` failures are logged by the caller (the commit itself has +//! already succeeded by the time we save). +//! +//! Per-(account, branch). The branch component is `validate_ref_name`-checked +//! before any path is constructed, so attempts at path traversal via crafted +//! branch names are rejected. 
+ +use std::collections::HashMap; +use std::time::SystemTime; + +use async_trait::async_trait; +use gix_hash::ObjectId; +use serde::{Deserialize, Serialize}; +use thiserror::Error; + +use crate::git::types::IndexEntry; + +/// Convert a [`SystemTime`] to nanoseconds since the Unix epoch, matching the +/// `i128` representation used for `IndexEntry::mtime_ns`. Returns `None` for +/// pre-epoch times (which the racy-clean guard treats conservatively). Shared +/// by the `IndexStore` backends to stamp [`CommitIndex::saved_at_ns`] from the +/// index file's own modification time on load. +pub fn systemtime_to_ns(t: SystemTime) -> Option { + t.duration_since(SystemTime::UNIX_EPOCH) + .ok() + .map(|d| d.as_nanos() as i128) +} + +/// Snapshot of the working tree's `(size, mtime_ns, oid)` after the commit +/// identified by `parent_oid`. +/// +/// `entries` is keyed by account-relative path (same form as `CommitRequest::paths`). +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct CommitIndex { + /// Commit OID this snapshot reflects. Used to invalidate the cache when + /// the branch head has moved (concurrent commit, branch switch, etc.). + pub parent_oid: ObjectId, + /// Account-relative path → cached `(size, mtime_ns, oid)`. + pub entries: HashMap, + /// Wall-clock time (ns since epoch) the index file was written, as + /// observed from the backend's own metadata at `load()` time. `None` for + /// freshly-built (not-yet-persisted) indexes and when the backend cannot + /// report a modification time. Used for the racy-clean guard: a cache + /// entry whose `mtime_ns >= saved_at_ns` may have been modified within the + /// same clock tick as the last commit and is forced down the slow path. + /// Not part of the on-disk wire format — it is derived from the file's own + /// timestamp on load. + pub saved_at_ns: Option, +} + +/// Error type returned by [`IndexStore`] operations. 
All error variants are +/// soft failures from the caller's perspective — `load()` callers map them to +/// `Ok(None)` and `save()` callers log and continue. +#[derive(Debug, Error)] +pub enum IndexStoreError { + /// Underlying I/O error from the backend (filesystem, network, etc.). + #[error("i/o error: {0}")] + Io(#[from] std::io::Error), + /// On-disk format could not be parsed (corruption, version skew, bad oid). + #[error("decode error: {0}")] + Decode(String), + /// Branch component failed `validate_ref_name` (path-traversal guard). + #[error("invalid branch name: {0}")] + InvalidBranch(String), + /// Non-I/O backend failure (e.g. S3 SDK error). + #[error("backend error: {0}")] + Backend(String), +} + +/// Per-(account, branch) commit-index storage. +/// +/// `load` returns `Ok(None)` for any "no usable index here" outcome — missing +/// file, decode failure, version mismatch, etc. The caller treats every miss +/// as "skip Fast Path 1, fall back to the full read/hash path". +/// +/// `save` is fire-and-forget from the caller's perspective: the commit has +/// already succeeded; the worst-case cost of a save failure is one extra +/// slow-path commit next time. +#[async_trait] +pub trait IndexStore: Send + Sync + 'static { + /// Load the latest persisted index for `(account, branch)`. Returns + /// `Ok(None)` if no index has been written yet, or if the persisted bytes + /// fail to decode (treated as a soft miss so commit falls back to the + /// slow path). + async fn load( + &self, + account: &str, + branch: &str, + ) -> Result, IndexStoreError>; + + /// Persist `index` for `(account, branch)`, replacing any prior snapshot. + /// Last-write-wins semantics — there is no CAS because the index is a + /// soft-state cache; correctness is enforced at load time via the + /// `parent_oid` check. 
+ async fn save( + &self, + account: &str, + branch: &str, + index: &CommitIndex, + ) -> Result<(), IndexStoreError>; +} + +// ---- Wire format --------------------------------------------------------- + +/// Bumped only when a backwards-incompatible change to the layout ships. +/// Older readers see an unknown version and treat the file as absent. +const INDEX_FORMAT_VERSION: u32 = 1; + +#[derive(Debug, Serialize, Deserialize)] +struct WireIndex { + version: u32, + parent_oid: String, + entries: Vec, +} + +#[derive(Debug, Serialize, Deserialize)] +struct WireEntry { + path: String, + size: u64, + mtime_ns: i128, + oid: String, +} + +/// Serialize a `CommitIndex` into the on-disk JSON wire format. Entries are +/// emitted sorted by path and OIDs as hex strings; `saved_at_ns` is not +/// written (the wire structs carry no timestamp field). +/// +/// A serialization failure is reported as `IndexStoreError::Decode`, the same +/// variant `decode_index` uses for malformed input. +pub fn encode_index(index: &CommitIndex) -> Result, IndexStoreError> { + let mut entries: Vec = index + .entries + .iter() + .map(|(path, e)| WireEntry { + path: path.clone(), + size: e.size, + mtime_ns: e.mtime_ns, + oid: e.oid.to_hex().to_string(), + }) + .collect(); + // Sorted output → deterministic byte content for tests / hashing. + entries.sort_by(|a, b| a.path.cmp(&b.path)); + let wire = WireIndex { + version: INDEX_FORMAT_VERSION, + parent_oid: index.parent_oid.to_hex().to_string(), + entries, + }; + serde_json::to_vec(&wire).map_err(|e| IndexStoreError::Decode(e.to_string())) +} + +/// Decode the on-disk JSON wire format. An unknown `version` field returns +/// `Ok(None)`; malformed JSON or invalid OIDs return `Err(Decode)` (which +/// the trait `load()` callers also map to `None`). +pub fn decode_index(bytes: &[u8]) -> Result, IndexStoreError> { + let wire: WireIndex = match serde_json::from_slice(bytes) { + Ok(w) => w, + Err(e) => return Err(IndexStoreError::Decode(e.to_string())), + }; + if wire.version != INDEX_FORMAT_VERSION { + // Forward-compat: unknown version → silently treat as missing. 
+ return Ok(None); + } + let parent_oid = ObjectId::from_hex(wire.parent_oid.as_bytes()) + .map_err(|e| IndexStoreError::Decode(format!("parent_oid: {e}")))?; + let mut entries = HashMap::with_capacity(wire.entries.len()); + for w in wire.entries { + let oid = ObjectId::from_hex(w.oid.as_bytes()) + .map_err(|e| IndexStoreError::Decode(format!("entry oid {}: {e}", w.path)))?; + entries.insert( + w.path, + IndexEntry { + size: w.size, + mtime_ns: w.mtime_ns, + oid, + }, + ); + } + Ok(Some(CommitIndex { + parent_oid, + entries, + // The wire format carries no timestamp; callers that need the + // racy-clean guard populate `saved_at_ns` from the backing file's own + // mtime after a successful decode. + saved_at_ns: None, + })) +} + +#[cfg(test)] +mod tests { + use super::*; + + fn oid_from_byte(b: u8) -> ObjectId { + let mut bytes = [0u8; 20]; + bytes.fill(b); + ObjectId::from_bytes_or_panic(&bytes) + } + + #[test] + fn round_trip_preserves_entries() { + let mut entries = HashMap::new(); + entries.insert( + "resources/a.md".into(), + IndexEntry { + size: 42, + mtime_ns: 1_700_000_000_000_000_000, + oid: oid_from_byte(0xAA), + }, + ); + entries.insert( + "agent/b.py".into(), + IndexEntry { + size: 7, + mtime_ns: -1, + oid: oid_from_byte(0xBB), + }, + ); + let idx = CommitIndex { + parent_oid: oid_from_byte(0xCC), + entries, + saved_at_ns: None, + }; + let bytes = encode_index(&idx).unwrap(); + let decoded = decode_index(&bytes).unwrap().unwrap(); + assert_eq!(decoded, idx); + } + + #[test] + fn unknown_version_is_silent_miss() { + let bogus = serde_json::json!({ + "version": 9999, + "parent_oid": format!("{:040}", 0), + "entries": [] + }); + let bytes = serde_json::to_vec(&bogus).unwrap(); + assert!(decode_index(&bytes).unwrap().is_none()); + } + + #[test] + fn malformed_json_errors() { + assert!(decode_index(b"not-json").is_err()); + } + + #[test] + fn invalid_oid_errors() { + let bogus = serde_json::json!({ + "version": INDEX_FORMAT_VERSION, + "parent_oid": "zzzz", + 
"entries": [] + }); + let bytes = serde_json::to_vec(&bogus).unwrap(); + assert!(decode_index(&bytes).is_err()); + } +} diff --git a/crates/ragfs/src/git/mod.rs b/crates/ragfs/src/git/mod.rs new file mode 100644 index 0000000000..e127f84565 --- /dev/null +++ b/crates/ragfs/src/git/mod.rs @@ -0,0 +1,58 @@ +//! Git version control module +//! +//! This module provides Git-based version control capabilities for OpenViking, +//! allowing users to commit snapshots, checkout previous versions, and view +//! history. +//! +//! Architecture +//! +//! - [`object_store`]: Trait and implementations for content-addressable storage +//! - [`ref_store`]: Trait and implementations for named reference storage +//! - [`backends`]: Backend implementations (local filesystem, S3) +//! +//! Example +//! +//! ```rust,ignore +//! use ragfs::git::backends::local::{LocalObjectStore, LocalRefStore}; +//! use ragfs::git::object_store::ObjectStore; +//! use ragfs::git::ref_store::RefStore; +//! +//! # #[tokio::main] +//! # async fn main() { +//! let object_store = LocalObjectStore::new("/data/git"); +//! let ref_store = LocalRefStore::new("/data/git"); +//! +//! // Use object_store and ref_store... +//! # } +//! 
``` + +pub mod backends; +pub mod commit; +pub mod config; +pub mod enumerate; +pub mod error; +pub mod index_store; +pub mod object_store; +pub mod ref_store; +pub mod service; +pub mod tree_builder; +pub mod types; +pub mod util; + +pub use config::{GitConfig, GitLocalConfig, GitS3ConfigPy, GitTuningConfig}; +pub use error::{GitError, ObjectStoreError, RefStoreError}; +pub use index_store::{CommitIndex, IndexStore, IndexStoreError}; +pub use object_store::ObjectStore; +pub use ref_store::RefStore; +pub use service::GitService; +pub use tree_builder::{flatten, lookup, TreeEditor}; +pub use types::{ + Actor, CommitRequest, CommitResponse, IndexEntry, RestoreDiff, RestoreRequest, + RestoreResponse, RestoreWritebackPartial, ShowRequest, ShowResponse, +}; + +// Re-exports from backends +pub use backends::local::{LocalIndexStore, LocalObjectStore, LocalRefStore}; + +#[cfg(feature = "s3")] +pub use backends::s3::{CasMode, S3Config, S3IndexStore, S3ObjectStore, S3RefStore}; diff --git a/crates/ragfs/src/git/object_store.rs b/crates/ragfs/src/git/object_store.rs new file mode 100644 index 0000000000..abdbfddc37 --- /dev/null +++ b/crates/ragfs/src/git/object_store.rs @@ -0,0 +1,44 @@ +//! ObjectStore trait - content-addressable storage for Git objects + +use async_trait::async_trait; +use bytes::Bytes; +use gix_hash::ObjectId; + +use crate::git::error::ObjectStoreError; + +/// Content-addressable storage for Git objects. +/// +/// This trait abstracts over storage backends (local filesystem, S3, etc.) +/// for storing and retrieving Git objects (blobs, trees, commits). +/// +/// All operations are content-addressable by ObjectId (SHA-1). +/// `put` operations are idempotent - writing the same object multiple times +/// has the same effect as writing it once. +#[async_trait] +pub trait ObjectStore: Send + Sync + 'static { + /// Write a zlib-compressed loose object. 
+ /// + /// The `zlib_body` must be a valid zlib-compressed Git loose object, + /// and `oid` must be the SHA-1 hash of the uncompressed object + /// (including the Git header: "type size\0content"). + /// + /// Implementations should ensure this is idempotent: calling `put` + /// multiple times with the same `oid` is safe and has no additional effect. + async fn put( + &self, + account: &str, + oid: &ObjectId, + zlib_body: Bytes, + ) -> Result<(), ObjectStoreError>; + + /// Read and decompress an object. + /// + /// Returns the full uncompressed bytes (including Git header: "type size\0content"). + async fn get(&self, account: &str, oid: &ObjectId) -> Result; + + /// Check if an object exists without reading its content. + /// + /// This is an optimization path - implementations should use the cheapest + /// available method (e.g., `stat` for local, `HEAD` for S3). + async fn exists(&self, account: &str, oid: &ObjectId) -> Result; +} diff --git a/crates/ragfs/src/git/ref_store.rs b/crates/ragfs/src/git/ref_store.rs new file mode 100644 index 0000000000..5643240f15 --- /dev/null +++ b/crates/ragfs/src/git/ref_store.rs @@ -0,0 +1,41 @@ +//! RefStore trait - named references to Git objects + +use async_trait::async_trait; +use gix_hash::ObjectId; + +use crate::git::error::RefStoreError; + +/// Storage for named references (branches, tags) pointing to Git objects. +/// +/// The core operation is `cas_update` (Compare-And-Swap), which ensures +/// atomicity and consistency in the presence of concurrent writers. +#[async_trait] +pub trait RefStore: Send + Sync + 'static { + /// Read the current value of a ref. + /// + /// Returns `NotFound` if the ref doesn't exist. + async fn read(&self, account: &str, ref_name: &str) -> Result; + + /// Compare-And-Swap update: write `new` only if current value == `expected`. 
+ /// + /// - `expected = None` means "write only if ref doesn't exist yet" + /// - `expected = Some(oid)` means "write only if ref currently has this oid" + /// + /// Returns `Conflict` if the expectation fails. + async fn cas_update( + &self, + account: &str, + ref_name: &str, + expected: Option, + new: ObjectId, + ) -> Result<(), RefStoreError>; + + /// List all refs under a prefix (e.g., "refs/heads/"). + /// + /// Returns a list of (full_ref_name, target_oid) pairs. + async fn list( + &self, + account: &str, + prefix: &str, + ) -> Result, RefStoreError>; +} diff --git a/crates/ragfs/src/git/service.rs b/crates/ragfs/src/git/service.rs new file mode 100644 index 0000000000..e0bfd29c3b --- /dev/null +++ b/crates/ragfs/src/git/service.rs @@ -0,0 +1,5426 @@ +//! `GitService` - high-level integration tying together object/ref stores, +//! VFS enumeration, tree building, and commit-object construction. +//! +//! See design §8.1 for the `commit()` algorithm. Fast Path 1 (persistent +//! stat cache `commit_index.bin`) is wired through an optional +//! `IndexStore`: when present and the cached snapshot's `parent_oid` matches +//! the current branch HEAD, files whose `(size, mtime_ns)` match the cached +//! entry skip the read+SHA-1 step and reuse the cached blob OID. Fast Path 3 +//! (`exists()` dedup before blob write) is implemented in the slow path: after +//! a blob's oid is computed, an `exists()` precheck skips the zlib compression +//! and `put` when the object is already present. It is a pure performance +//! optimization (`write_object` is idempotent) and can be toggled off via +//! [`GitService::with_blob_exists_precheck`]. 
+ +use std::collections::{HashMap, HashSet}; +use std::sync::Arc; +use std::time::UNIX_EPOCH; + +use gix_hash::ObjectId; +use tracing::warn; + +use crate::core::filesystem::FileSystem; +use crate::core::types::FileInfo; +use crate::git::{ + error::{GitError, ObjectStoreError, RefStoreError}, + index_store::{CommitIndex, IndexStore}, + object_store::ObjectStore, + ref_store::RefStore, + types::{ + CommitRequest, CommitResponse, IndexEntry, RestoreRequest, RestoreResponse, ShowRequest, + ShowResponse, + }, +}; + +/// `GitService` orchestrates the full commit pipeline against a `FileSystem` +/// (the working tree), an `ObjectStore`, and a `RefStore`. An optional +/// `IndexStore` enables Fast Path 1 — the persistent stat cache that lets +/// `commit()` skip read+SHA-1 for files whose `(size, mtime_ns)` are +/// unchanged since the last commit. +pub struct GitService { + /// Working-tree filesystem. The account is supplied per request, and + /// `commit()` addresses an account's files as `/local/{account}/{path}`. + pub vfs: Arc, + /// Backing object store (loose Git objects, content-addressed). + pub object_store: Arc, + /// Backing ref store (branch heads). + pub ref_store: Arc, + /// Optional Fast Path 1 stat cache. `None` disables the optimization + /// — every commit then reads and SHA-1s every candidate file. + pub index_store: Option>, + /// Fast Path 3 toggle: when `true` (default), the slow path runs an + /// `exists()` precheck before compressing and putting a blob. Disable to + /// fall back to an unconditional `write_object`. + pub blob_exists_precheck: bool, +} + +impl GitService { + /// Build a service without Fast Path 1. Equivalent to + /// [`GitService::with_index`] passing `None`. + pub fn new( + vfs: Arc, + object_store: Arc, + ref_store: Arc, + ) -> Self { + Self { + vfs, + object_store, + ref_store, + index_store: None, + blob_exists_precheck: true, + } + } + + /// Build a service with an optional [`IndexStore`] backing Fast Path 1. + /// Pass `Some(...)` to enable the stat cache, `None` for parity with + /// [`GitService::new`]. 
+ pub fn with_index( + vfs: Arc, + object_store: Arc, + ref_store: Arc, + index_store: Option>, + ) -> Self { + Self { + vfs, + object_store, + ref_store, + index_store, + blob_exists_precheck: true, + } + } + + /// Toggle Fast Path 3 (`exists()` precheck before blob write). Defaults to + /// enabled; pass `false` to force an unconditional `write_object` on the + /// slow path. + pub fn with_blob_exists_precheck(mut self, enabled: bool) -> Self { + self.blob_exists_precheck = enabled; + self + } + + /// Build a new commit on `branch` reflecting the current state of the + /// account's VFS subtree. + /// + /// - If `paths` is `Some`, only those account-relative paths are + /// considered (each is still pruned via `enumerate::prune_path`). + /// - If `paths` is `None`, the full `/local/{account}` subtree is + /// enumerated via `enumerate::collect_all`. + /// + /// On no-op (no editor change) the branch ref is untouched and + /// `CommitResponse::Noop` is returned. + /// + /// When `paths` is `Some(...)`, each listed path is classified with one + /// VFS stat: a file is taken as-is, a directory is expanded recursively + /// (plus previous-tree paths under it, so deletions inside it surface), + /// and a path missing from the VFS is treated as a deletion of that path + /// and of every previous-tree entry beneath it. + /// + /// On a CAS conflict, returns `GitError::ConcurrentCommit` so the + /// caller can decide whether to retry. There is intentionally no + /// retry loop inside `commit()`. + /// + /// Fast Path 1: when an [`IndexStore`] is configured and the cached + /// snapshot's `parent_oid` matches `prev_head`, candidates whose + /// `(size, mtime_ns)` match the cached entry and whose mtime predates the + /// index write (racy-clean guard) skip read+SHA-1 and reuse + /// the cached blob OID. Any cache miss / parent mismatch / decode + /// error silently falls back to the slow path; a stale or corrupt + /// index can never produce an incorrect commit. 
+ pub async fn commit(&self, req: CommitRequest) -> Result { + let CommitRequest { + account, + branch, + message, + paths, + author_name, + author_email, + } = req; + validate_account_id(&account)?; + if let Some(ps) = &paths { + for p in ps { + validate_relative_path(p)?; + } + } + let ref_name = format!("refs/heads/{branch}"); + + // 1. Resolve current HEAD (may not exist → root commit). + // NOTE(review): `branch` is interpolated into `ref_name` above without a + // validation call in this function — confirm the ref/index stores + // reject malformed branch names. + let prev_head: Option = match self.ref_store.read(&account, &ref_name).await { + Ok(oid) => Some(oid), + Err(RefStoreError::NotFound(_)) => None, + Err(e) => return Err(e.into()), + }; + let prev_tree: Option = match prev_head { + Some(commit_oid) => Some( + load_commit_meta(self.object_store.as_ref(), &account, &commit_oid) + .await? + .tree, + ), + None => None, + }; + + // 1b. Fast Path 1: load the persisted commit index for this branch + // and verify its `parent_oid` matches `prev_head`. Any mismatch / + // missing file / decode error → no cache for this commit. + let prev_index: Option = match (&self.index_store, prev_head) { + (Some(store), Some(head)) => match store.load(&account, &branch).await { + Ok(Some(idx)) if idx.parent_oid == head => Some(idx), + Ok(_) => None, + Err(e) => { + warn!( + "commit index load failed for {account}/{branch}: {e}; \ + falling back to slow path" + ); + None + } + }, + _ => None, + }; + // Diagnostic flag only: true when an index store is configured and a + // usable index was loaded. Nothing below branches on it — the new-index + // build is gated on `self.index_store.is_some()` directly, and this + // binding is only consumed by the `let _ = fast_path_active;` discards. + let fast_path_active = self.index_store.is_some() && prev_index.is_some(); + + // 2. Build TreeEditor from prev tree if any; otherwise start empty. + // (The well-known empty-tree oid is not guaranteed to exist in the + // store, so we cannot blindly hand it to `from_tree`.) + let mut editor = match prev_tree { + Some(t) => { + crate::git::tree_builder::TreeEditor::from_tree( + self.object_store.as_ref(), + &account, + t, + ) + .await? + } + None => crate::git::tree_builder::TreeEditor::empty(), + }; + + // 2.5. 
Explicit paths: classify each entry as File / Directory / + // NotFound via a single VFS stat, and assemble three locals: + // + // * `candidates` - the deduped set of files this commit will + // process. Directories contribute their recursive listing + // (pruned) plus any prev_tree paths under the same prefix + // (so deletions inside the directory surface as a remove). + // A NotFound entry behaves the same way for any prev_tree + // paths under its prefix, treating it as "delete whatever + // used to live here". + // * `cleanup_exact` - keys to drop from the new index seed + // before the main loop re-fills them. + // * `cleanup_prefixes` - directory-style prefixes ("docs/") + // whose contents in the new index seed must also be dropped. + // + // Pruning is applied uniformly: explicit files, expanded + // directory contents, and prev_tree-derived entries all run + // through `prune_path`. A path supplied by the caller that is + // pruned is silently dropped (no error, no warn). + let mut cleanup_exact: HashSet = HashSet::new(); + let mut cleanup_prefixes: Vec = Vec::new(); + + // Lazily flatten prev_tree once if any explicit path requires it. + let mut prev_paths_cache: Option> = None; + + let candidates: Vec = match &paths { + Some(ps) => { + let mut set: std::collections::BTreeSet = std::collections::BTreeSet::new(); + + for p in ps { + let abs = format!("/local/{}/{}", account, p); + match self.vfs.stat(&abs).await { + Ok(info) if info.is_dir => { + // Directory: recursive listing + prev_tree subtree. 
+ cleanup_prefixes.push(format!("{}/", p)); + + let listed = + crate::git::enumerate::collect_under(&self.vfs, &account, p) + .await?; + for rel in listed { + set.insert(rel); + } + + if let Some(t) = prev_tree { + if prev_paths_cache.is_none() { + prev_paths_cache = Some( + crate::git::tree_builder::flatten( + self.object_store.as_ref(), + &account, + t, + &None, + ) + .await?, + ); + } + let pref = format!("{}/", p); + for (path, _) in prev_paths_cache.as_ref().unwrap() { + if path.starts_with(&pref) + && !crate::git::enumerate::prune_path(path) + { + set.insert(path.clone()); + } + } + } + } + Ok(_) => { + // File: take it verbatim, subject to pruning. + cleanup_exact.insert(p.clone()); + if !crate::git::enumerate::prune_path(p) { + set.insert(p.clone()); + } + } + Err(e) if is_not_found(&e) => { + // Neither file nor directory in the VFS. Treat + // it as a delete-by-name: feed `p` into the main + // loop (where the NotFound branch will remove it + // from the tree if it was a file) AND union in + // every prev_tree path under "p/" so a missing + // directory drops its whole subtree. + warn!( + "commit path {:?} not found in VFS; \ + treating as deletion of any matching subtree", + p + ); + cleanup_exact.insert(p.clone()); + cleanup_prefixes.push(format!("{}/", p)); + + if !crate::git::enumerate::prune_path(p) { + set.insert(p.clone()); + } + if let Some(t) = prev_tree { + if prev_paths_cache.is_none() { + prev_paths_cache = Some( + crate::git::tree_builder::flatten( + self.object_store.as_ref(), + &account, + t, + &None, + ) + .await?, + ); + } + let pref = format!("{}/", p); + for (path, _) in prev_paths_cache.as_ref().unwrap() { + if path.starts_with(&pref) + && !crate::git::enumerate::prune_path(path) + { + set.insert(path.clone()); + } + } + } + } + Err(e) => return Err(e.into()), + } + } + + set.into_iter().collect() + } + // Full enumeration: union the files currently on disk with the + // paths recorded in prev_tree. 
`collect_all` only sees files that + // still exist, so a file deleted since the last commit would never + // become a candidate and its deletion would be silently lost. By + // adding prev_tree's paths, a path that's gone from disk falls into + // the `NotFound → remove` branch below and is dropped from the new + // snapshot. Deduped via BTreeSet so a path present in both sources + // is only processed once. + None => { + let mut set: std::collections::BTreeSet = + crate::git::enumerate::collect_all(&self.vfs, &account) + .await? + .into_iter() + .collect(); + if let Some(t) = prev_tree { + let prev_paths = crate::git::tree_builder::flatten( + self.object_store.as_ref(), + &account, + t, + &None, + ) + .await?; + for (p, _) in prev_paths { + if !crate::git::enumerate::prune_path(&p) { + set.insert(p); + } + } + } + set.into_iter().collect() + } + }; + + // 3b. Seed the new index. For partial commits (paths=Some), unlisted + // paths in the previous index must be preserved verbatim — they + // were not enumerated this round, so the cache should keep them. + // For full enumeration (paths=None), start empty: only paths seen + // this commit end up in the new index. + let mut new_index_entries: HashMap = + match (self.index_store.is_some(), &paths, &prev_index) { + (true, Some(_), Some(idx)) => idx.entries.clone(), + (true, _, _) => HashMap::new(), + _ => HashMap::new(), + }; + // For partial commits we still need to drop entries for any explicitly + // listed path before we re-fill it — otherwise a deleted path that + // was in the old index would linger. Directory entries clean by + // prefix; file/NotFound entries clean by exact key. + if paths.is_some() { + for key in &cleanup_exact { + new_index_entries.remove(key); + } + if !cleanup_prefixes.is_empty() { + new_index_entries + .retain(|k, _| !cleanup_prefixes.iter().any(|pref| k.starts_with(pref))); + } + } + + // 4. For each candidate: detect delete vs upsert. 
Blob writes on the + // slow path go through Fast Path 3 (exists precheck) when enabled; + // write_object is idempotent regardless. + // + // `prev_lookup_cache` memoises decoded prev_tree subtree contents + // keyed on tree OID, so K candidate paths sharing the same depth-D + // ancestor chain pay D unique loads instead of K×D — every commit + // in the same parent subtree only fetches each ancestor once. + // Pre-seeded with the editor's root entries so the first + // `lookup_cached` doesn't re-fetch what `from_tree` already decoded. + let mut prev_lookup_cache = crate::git::tree_builder::TreeLookupCache::new(); + if let Some(t) = prev_tree { + prev_lookup_cache.seed(t, editor.root.clone()); + } + // Racy-clean threshold: the time the loaded index was written (from the + // backend's own metadata). A cached entry is only trustworthy when its + // `mtime_ns` is strictly older than this — otherwise the file may have + // been changed (to the same size) within the same clock tick as the + // last commit, and `(size, mtime_ns)` cannot detect it. `None` (backend + // could not report a write time) forces every entry down the slow path. + let index_saved_at_ns: Option = prev_index.as_ref().and_then(|idx| idx.saved_at_ns); + let mut changed = 0usize; + for rel_path in candidates { + let abs = format!("/local/{}/{}", account, rel_path); + match self.vfs.stat(&abs).await { + Ok(info) => { + let stat = stat_signature(&info); + + // Fast Path 1: cached `(size, mtime_ns)` match → reuse oid, + // skip vfs.read + write_object. The cached oid was once + // written by a successful commit, so it's known good in + // the object store. Guarded by the racy-clean check: the + // entry's mtime must predate the index write, otherwise a + // same-size change in the index's clock tick could slip + // through undetected. 
+ let cached = prev_index + .as_ref() + .and_then(|idx| idx.entries.get(&rel_path)); + let oid = match (cached, stat) { + (Some(entry), Some((size, mtime_ns))) + if entry.size == size + && entry.mtime_ns == mtime_ns + && index_saved_at_ns.is_some_and(|saved| mtime_ns < saved) => + { + entry.oid + } + _ => { + let bytes = self.vfs.read(&abs, 0, 0).await?; + if self.blob_exists_precheck { + crate::git::util::write_object_if_absent( + self.object_store.as_ref(), + &account, + gix_object::Kind::Blob, + &bytes, + ) + .await? + } else { + crate::git::util::write_object( + self.object_store.as_ref(), + &account, + gix_object::Kind::Blob, + &bytes, + ) + .await? + } + } + }; + + // Skip the upsert if prev_tree already has this exact + // path+oid — re-writing the same blob is not an editor + // change and shouldn't count toward the no-op decision. + let prev_entry = match prev_tree { + Some(t) => { + crate::git::tree_builder::lookup_cached( + self.object_store.as_ref(), + &account, + t, + &rel_path, + &mut prev_lookup_cache, + ) + .await? + } + None => None, + }; + if prev_entry.map(|(o, _)| o) != Some(oid) { + editor + .upsert(self.object_store.as_ref(), &account, &rel_path, oid) + .await?; + changed += 1; + } + + // Record in the new index regardless of whether the editor + // was touched — the on-disk file is still present and + // its (size, mtime_ns, oid) is the new ground truth. + if self.index_store.is_some() { + if let Some((size, mtime_ns)) = stat { + new_index_entries.insert( + rel_path.clone(), + IndexEntry { + size, + mtime_ns, + oid, + }, + ); + } else { + // No usable mtime → don't poison the cache. + new_index_entries.remove(&rel_path); + } + } + } + Err(e) if is_not_found(&e) => { + // Only count as a change if the path actually existed + // in prev_tree, since TreeEditor::remove silently no-ops + // for missing paths. With no prev_tree (root commit) a + // missing path is just irrelevant. 
+ let prev_entry = match prev_tree { + Some(t) => { + crate::git::tree_builder::lookup_cached( + self.object_store.as_ref(), + &account, + t, + &rel_path, + &mut prev_lookup_cache, + ) + .await? + } + None => None, + }; + if prev_entry.is_some() { + editor + .remove(self.object_store.as_ref(), &account, &rel_path) + .await?; + changed += 1; + } + // Path is gone → drop any lingering cache entry. + if self.index_store.is_some() { + new_index_entries.remove(&rel_path); + } + } + Err(e) => return Err(e.into()), + } + } + + // 5. No-op short-circuit. Even though the tree didn't change, the + // on-disk (size, mtime_ns) for enumerated paths may have shifted + // (e.g. `touch` of an unchanged file). Persist the refreshed index + // keyed on the *current* HEAD so the next commit can still hit the + // fast path. Soft-fail on save errors. + if changed == 0 { + let noop_oid = prev_head.unwrap_or_else(|| ObjectId::null(gix_hash::Kind::Sha1)); + if let (Some(store), Some(parent)) = (&self.index_store, prev_head) { + let new_index = CommitIndex { + parent_oid: parent, + entries: new_index_entries, + // Stamped from the backing file's mtime on the next load. + saved_at_ns: None, + }; + if let Err(e) = store.save(&account, &branch, &new_index).await { + warn!("commit index save failed for {account}/{branch}: {e}"); + } + } + let _ = fast_path_active; + return Ok(CommitResponse::Noop { + commit_oid: noop_oid, + }); + } + + // 6. Write the new tree + the commit object. + let new_tree = editor.write(self.object_store.as_ref(), &account).await?; + let parents: Vec = prev_head.iter().copied().collect(); + let commit_oid = crate::git::commit::write_commit( + self.object_store.as_ref(), + &account, + new_tree, + parents, + &author_name, + &author_email, + &message, + ) + .await?; + + // 7. CAS update the branch ref. Map Conflict → ConcurrentCommit. 
+ match self + .ref_store + .cas_update(&account, &ref_name, prev_head, commit_oid) + .await + { + Ok(()) => {} + Err(RefStoreError::Conflict { expected, actual }) => { + return Err(GitError::ConcurrentCommit { + ref_name, + expected, + actual, + }); + } + Err(other) => return Err(other.into()), + } + + // 8. Persist the new commit index. Soft-fail: a save error logs and + // continues — the commit itself has already succeeded; the worst + // case is one slow-path commit next time. + if let Some(store) = &self.index_store { + let new_index = CommitIndex { + parent_oid: commit_oid, + entries: new_index_entries, + // Stamped from the backing file's mtime on the next load. + saved_at_ns: None, + }; + if let Err(e) = store.save(&account, &branch, &new_index).await { + warn!("commit index save failed for {account}/{branch}: {e}"); + } + } + // Suppress the "fast_path_active was set but never read" lint when no + // future code path inspects it; left in scope for diagnostics. + let _ = fast_path_active; + + Ok(CommitResponse::Created { + commit_oid, + changed, + }) + } + + /// Read a commit's metadata, or a single blob's bytes from inside a commit's tree. + /// + /// `target_ref` resolution: 40-hex OID / "main" / "refs/heads/main". + /// + /// - `path = None` → returns `ShowResponse::Commit { oid, tree, parents, author, committer, message }`. + /// - `path = Some(p)` → returns `ShowResponse::Blob { oid, size, bytes }` for the path inside + /// the commit's tree. Missing path → `GitError::PathNotFound(p)`. Path that resolves to + /// a tree (not a blob) → `GitError::PathIsDirectory(p)` — distinct from missing so callers + /// can tell apart "no such path" from "path exists but is a directory, not a file". + /// + /// Missing ref → `GitError::RefStore(RefStoreError::NotFound)`. + /// Missing commit object → `GitError::ObjectStore(ObjectStoreError::NotFound)`. 
+ pub async fn show(&self, req: ShowRequest) -> Result { + let ShowRequest { + account, + target_ref, + path, + } = req; + + validate_account_id(&account)?; + if let Some(p) = &path { + validate_relative_path(p)?; + } + + let commit_oid = resolve_ref( + self.ref_store.as_ref(), + self.object_store.as_ref(), + &account, + &target_ref, + ) + .await?; + let meta = load_commit_meta(self.object_store.as_ref(), &account, &commit_oid).await?; + + match path { + None => Ok(ShowResponse::Commit { + oid: commit_oid, + tree: meta.tree, + parents: meta.parents, + author: meta.author, + committer: meta.committer, + message: meta.message, + }), + Some(p) => { + let entry = crate::git::tree_builder::lookup( + self.object_store.as_ref(), + &account, + meta.tree, + &p, + ) + .await?; + let (blob_oid, mode) = entry.ok_or_else(|| GitError::PathNotFound(p.clone()))?; + // Reject trees masquerading as paths: callers asked for blob bytes. + if mode.is_tree() { + return Err(GitError::PathIsDirectory(p)); + } + let raw = + crate::git::util::read_object(self.object_store.as_ref(), &account, &blob_oid) + .await?; + let (kind, payload_size, hdr) = crate::git::util::parse_object_header(&raw)?; + if kind != gix_object::Kind::Blob { + return Err(GitError::CorruptedObject(format!( + "expected blob at {p}, got {kind:?}" + ))); + } + // `raw` is already a `Bytes`; `slice` is O(1) and shares the + // backing buffer instead of allocating a fresh payload copy. + let bytes = raw.slice(hdr..); + Ok(ShowResponse::Blob { + oid: blob_oid, + size: payload_size, + bytes, + }) + } + } + } + + /// Restore a subtree at `project_dir` to the state it had in `source_commit`, + /// producing a new commit whose parent is the **current HEAD** (not + /// `source_commit`). HEAD always moves forward. + /// + /// See design §8.2 for the full algorithm and `RestoreResponse` for the + /// three possible outcomes (`Applied` / `Noop` / `DryRun`). 
+    ///
+    /// Errors:
+    /// - `GitError::InvalidProjectDir` — `project_dir` is empty / malformed.
+    /// - `GitError::RefStore(NotFound)` — branch HEAD or source_commit ref missing.
+    /// - `GitError::SubtreeNotFoundInCommit` — `project_dir` does not resolve
+    ///   to a subtree in `source_commit`'s tree.
+    /// - `GitError::ConcurrentCommit` — branch ref changed between our read
+    ///   and the CAS swap.
+    /// - `GitError::RestoreWritebackPartial` — the ref advanced but one or more
+    ///   VFS writes / deletes failed; the payload lists what did and did not land.
+    /// - Whatever `validate_ref_name` reports when `refs/heads/{branch}` is not
+    ///   a well-formed ref name.
+    pub async fn restore(&self, req: RestoreRequest) -> Result<RestoreResponse, GitError> {
+        let RestoreRequest {
+            account,
+            branch,
+            project_dir,
+            source_commit,
+            dry_run,
+            message: _,
+            author_name: _,
+            author_email: _,
+        } = &req;
+
+        validate_account_id(account)?;
+
+        if let Some(project_dir) = project_dir {
+            validate_project_dir(project_dir)?;
+        }
+        let ref_name = format!("refs/heads/{branch}");
+        // `branch` is caller-supplied and is spliced into a ref path; apply the
+        // same ref-name validation `resolve_ref` applies before touching the
+        // ref store, so a crafted branch cannot reach the backend unchecked.
+        crate::git::util::validate_ref_name(&ref_name)?;
+
+        // 1. Resolve both commits.
+        let source_oid = resolve_ref(
+            self.ref_store.as_ref(),
+            self.object_store.as_ref(),
+            account,
+            source_commit,
+        )
+        .await?;
+        let head_oid = self.ref_store.read(account, &ref_name).await?;
+        let source_meta =
+            load_commit_meta(self.object_store.as_ref(), account, &source_oid).await?;
+        let head_meta = load_commit_meta(self.object_store.as_ref(), account, &head_oid).await?;
+
+        // 2. Extract subtree from each (or use full tree if project_dir is None).
+        //    Source missing → error (if project_dir is Some).
+        //    Head missing → treat as empty (every file is a fresh write).
+        let (source_tree_to_flatten, head_tree_to_flatten) = match project_dir {
+            Some(project_dir) => {
+                let source_subtree = match crate::git::tree_builder::lookup(
+                    self.object_store.as_ref(),
+                    account,
+                    source_meta.tree,
+                    project_dir,
+                )
+                .await?
+                {
+                    Some((oid, mode)) if mode.is_tree() => oid,
+                    _ => {
+                        return Err(GitError::SubtreeNotFoundInCommit {
+                            project_dir: project_dir.clone(),
+                            commit: source_oid,
+                        });
+                    }
+                };
+                let head_subtree = match crate::git::tree_builder::lookup(
+                    self.object_store.as_ref(),
+                    account,
+                    head_meta.tree,
+                    project_dir,
+                )
+                .await?
+                {
+                    Some((oid, mode)) if mode.is_tree() => Some(oid),
+                    _ => None,
+                };
+                (source_subtree, head_subtree)
+            }
+            None => (source_meta.tree, Some(head_meta.tree)),
+        };
+
+        // 3. Flatten and diff (paths in the result are subtree-relative, or account-relative if full tree).
+        let source_entries = crate::git::tree_builder::flatten(
+            self.object_store.as_ref(),
+            account,
+            source_tree_to_flatten,
+            &None,
+        )
+        .await?;
+        let head_entries = match head_tree_to_flatten {
+            Some(oid) => {
+                crate::git::tree_builder::flatten(self.object_store.as_ref(), account, oid, &None)
+                    .await?
+            }
+            None => Vec::new(),
+        };
+        let diff = compute_subtree_diff(&source_entries, &head_entries);
+
+        // 4. dry_run short-circuits BEFORE any writes.
+        if *dry_run {
+            return Ok(RestoreResponse::DryRun {
+                diff,
+                head: head_oid,
+                source: source_oid,
+            });
+        }
+
+        // 5. Source == head → noop.
+        if diff.to_write.is_empty() && diff.to_delete.is_empty() {
+            return Ok(RestoreResponse::Noop {
+                head: head_oid,
+                source: source_oid,
+            });
+        }
+
+        // 6. Prepare writeback metadata. Paths in the diff are relative to
+        //    project_dir — prefix here. NOTE: the VFS is NOT mutated yet. The
+        //    ref-consistency protocol (steps 7–9) must complete first so that a
+        //    losing CAS race leaves the working tree untouched; the actual
+        //    writeback happens in step 10 only after the ref swap succeeds.
+        use futures::stream::{self, StreamExt};
+
+        let abs_prefix = match project_dir {
+            Some(project_dir) => format!("/local/{}/{}", account, project_dir),
+            None => format!("/local/{}", account),
+        };
+        let unchanged_count = diff.unchanged.len();
+
+        // 7. Build the new tree: load head.tree into an editor and splice
+        //    source_subtree at project_dir, or use source tree directly if full restore.
+        let new_tree_oid = match project_dir {
+            Some(project_dir) => {
+                let mut editor = crate::git::tree_builder::TreeEditor::from_tree(
+                    self.object_store.as_ref(),
+                    account,
+                    head_meta.tree,
+                )
+                .await?;
+                editor
+                    .upsert_subtree(
+                        self.object_store.as_ref(),
+                        account,
+                        project_dir,
+                        source_tree_to_flatten,
+                    )
+                    .await?;
+                editor.write(self.object_store.as_ref(), account).await?
+            }
+            None => source_meta.tree,
+        };
+
+        // 8. Construct the new commit. parent = head_oid (NOT source_oid).
+        let msg = req.message.clone().unwrap_or_else(|| {
+            // Default message uses a 12-character abbreviation of the source OID.
+            let short = &source_oid.to_hex().to_string()[..12];
+            match &req.project_dir {
+                Some(project_dir) => format!("restore {} from {}", project_dir, short),
+                None => format!("restore full tree from {}", short),
+            }
+        });
+        let new_commit_oid = crate::git::commit::write_commit(
+            self.object_store.as_ref(),
+            account,
+            new_tree_oid,
+            vec![head_oid],
+            &req.author_name,
+            &req.author_email,
+            &msg,
+        )
+        .await?;
+
+        // 9. CAS-swap the branch ref. Map Conflict → ConcurrentCommit.
+        //    This MUST happen before any VFS writeback: if another commit
+        //    advanced the branch between our HEAD read and now, the CAS fails
+        //    and we return early with the working tree still matching HEAD,
+        //    leaving caller-driven reindex and on-disk state consistent.
+        match self
+            .ref_store
+            .cas_update(account, &ref_name, Some(head_oid), new_commit_oid)
+            .await
+        {
+            Ok(()) => {}
+            Err(crate::git::error::RefStoreError::Conflict { expected, actual }) => {
+                return Err(GitError::ConcurrentCommit {
+                    ref_name,
+                    expected,
+                    actual,
+                });
+            }
+            Err(other) => return Err(other.into()),
+        }
+
+        // 10. The ref swap committed our new state. Now write back through the
+        //     VFS so the working tree reflects the restored content. The ref
+        //     has already advanced, so a per-path failure here can NOT be
+        //     rolled back — instead the streams below collect every failure
+        //     and we surface them as `GitError::RestoreWritebackPartial`. The
+        //     caller (Python) then schedules reindex for the paths that *did*
+        //     reach the VFS so the vector index does not stay stale.
+        //
+        //     NOTE(review): writes run before deletes and pruning. If a path
+        //     switches between file and directory across the two commits, the
+        //     write may hit the not-yet-removed old entry and be reported as a
+        //     partial failure — confirm whether that case needs handling.
+        let object_store_ref = self.object_store.clone();
+        let vfs_ref = self.vfs.clone();
+        let account_owned = account.clone();
+        let abs_prefix_for_writes = abs_prefix.clone();
+        let project_dir_for_writes = project_dir.clone();
+
+        let write_results: Vec<(String, Result<(), GitError>)> =
+            stream::iter(diff.to_write.clone().into_iter())
+                .map(|(rel, blob_oid)| {
+                    let object_store = object_store_ref.clone();
+                    let vfs = vfs_ref.clone();
+                    let account = account_owned.clone();
+                    let abs_prefix = abs_prefix_for_writes.clone();
+                    let project_dir = project_dir_for_writes.clone();
+                    async move {
+                        // Results are reported account-relative, so re-attach
+                        // project_dir to the subtree-relative diff path.
+                        let account_rel = match &project_dir {
+                            Some(pd) => format!("{}/{}", pd, rel),
+                            None => rel.clone(),
+                        };
+                        let r = async {
+                            let bytes =
+                                read_blob_payload(object_store.as_ref(), &account, &blob_oid)
+                                    .await?;
+                            let abs = format!("{}/{}", abs_prefix, rel);
+                            // The target's parent directory may have been removed out of
+                            // band (e.g. an `rm -r` that a later commit recorded as a
+                            // deletion), so the restore must recreate the directory
+                            // chain before writing the blob back.
+                            crate::core::filesystem::FileSystem::ensure_parent_dirs(
+                                vfs.as_ref(),
+                                &abs,
+                                0o755,
+                            )
+                            .await?;
+                            crate::core::filesystem::FileSystem::write(
+                                vfs.as_ref(),
+                                &abs,
+                                &bytes,
+                                0,
+                                crate::core::types::WriteFlag::Create,
+                            )
+                            .await?;
+                            Ok::<(), GitError>(())
+                        }
+                        .await;
+                        (account_rel, r)
+                    }
+                })
+                .buffer_unordered(32)
+                .collect()
+                .await;
+
+        let abs_prefix_for_deletes = abs_prefix.clone();
+        let vfs_for_deletes = self.vfs.clone();
+        let project_dir_for_deletes = project_dir.clone();
+        let delete_results: Vec<(String, Result<(), GitError>)> =
+            stream::iter(diff.to_delete.clone().into_iter())
+                .map(|rel| {
+                    let vfs = vfs_for_deletes.clone();
+                    let abs_prefix = abs_prefix_for_deletes.clone();
+                    let project_dir = project_dir_for_deletes.clone();
+                    async move {
+                        let account_rel = match &project_dir {
+                            Some(pd) => format!("{}/{}", pd, rel),
+                            None => rel.clone(),
+                        };
+                        let abs = format!("{}/{}", abs_prefix, rel);
+                        // Restore is idempotent: a path the diff wants to delete may
+                        // already be absent from the VFS (e.g. derived files like
+                        // `.abstract.md` that were removed or regenerated out of band).
+                        // Treat NotFound as success rather than counting it as a failure.
+                        let r =
+                            match crate::core::filesystem::FileSystem::remove(vfs.as_ref(), &abs)
+                                .await
+                            {
+                                Ok(_) => Ok::<(), GitError>(()),
+                                Err(crate::core::errors::Error::NotFound(_)) => Ok(()),
+                                Err(e) => Err(e.into()),
+                            };
+                        (account_rel, r)
+                    }
+                })
+                .buffer_unordered(32)
+                .collect()
+                .await;
+
+        // 10b. Prune directories left empty by the deletes above. Git does not
+        //      track directories, so `to_delete` only ever lists files; removing
+        //      the last file in a directory would otherwise leave an empty husk
+        //      in the VFS. Walk each deleted file's ancestor directories (within
+        //      project_dir, deepest first) and drop any that are now empty.
+        //      Best-effort: a directory that still holds entries, or has already
+        //      vanished, is simply skipped — pruning never aborts the restore.
+        use std::collections::BTreeSet;
+        // (depth, rel_dir): BTreeSet iterates ascending, so reversing yields the
+        // deepest directories first — children are pruned before their parents,
+        // letting a parent that held only pruned subdirs be removed in turn.
+        let mut prune_candidates: BTreeSet<(usize, String)> = BTreeSet::new();
+        for rel in &diff.to_delete {
+            let mut dir = rel.as_str();
+            while let Some(idx) = dir.rfind('/') {
+                dir = &dir[..idx];
+                prune_candidates.insert((dir.split('/').count(), dir.to_string()));
+            }
+        }
+        for (_depth, rel_dir) in prune_candidates.into_iter().rev() {
+            let abs = format!("{}/{}", abs_prefix, rel_dir);
+            let is_empty = match crate::core::filesystem::FileSystem::read_dir(
+                self.vfs.as_ref(),
+                &abs,
+            )
+            .await
+            {
+                Ok(entries) => entries.is_empty(),
+                // Missing or not a directory → nothing to prune.
+                Err(_) => false,
+            };
+            if is_empty {
+                // Ignore failures: a concurrent writer may have repopulated the
+                // directory, or it may already be gone. Either way the restore
+                // itself has succeeded.
+                let _ = crate::core::filesystem::FileSystem::remove(self.vfs.as_ref(), &abs).await;
+            }
+        }
+
+        // 10c. Partition the per-path results into success / failure buckets.
+        //      `written_paths` / `deleted_paths` here only carry the paths
+        //      that actually reached the VFS — callers use these lists to
+        //      drive reindex, and a path whose write failed must not be
+        //      reindexed (the file's blob never landed).
+        let mut written_paths: Vec<String> = Vec::with_capacity(write_results.len());
+        let mut failed_writes: Vec<(String, String)> = Vec::new();
+        for (path, r) in write_results {
+            match r {
+                Ok(()) => written_paths.push(path),
+                Err(e) => failed_writes.push((path, e.to_string())),
+            }
+        }
+        let mut deleted_paths: Vec<String> = Vec::with_capacity(delete_results.len());
+        let mut failed_deletes: Vec<(String, String)> = Vec::new();
+        for (path, r) in delete_results {
+            match r {
+                Ok(()) => deleted_paths.push(path),
+                Err(e) => failed_deletes.push((path, e.to_string())),
+            }
+        }
+
+        let written_actual = written_paths.len();
+        let deleted_actual = deleted_paths.len();
+
+        // 11. Partial failure path. The ref has already advanced, so we
+        //     cannot rollback — surface a structured error carrying enough
+        //     payload for the caller to schedule reindex for the paths that
+        //     *did* succeed and to report the failures upward.
+        if !failed_writes.is_empty() || !failed_deletes.is_empty() {
+            return Err(GitError::RestoreWritebackPartial(Box::new(
+                crate::git::types::RestoreWritebackPartial {
+                    new_commit_oid,
+                    source_commit: source_oid,
+                    parent_commit: head_oid,
+                    written: written_actual,
+                    deleted: deleted_actual,
+                    unchanged: unchanged_count,
+                    written_paths,
+                    deleted_paths,
+                    failed_writes,
+                    failed_deletes,
+                },
+            )));
+        }
+
+        Ok(RestoreResponse::Applied {
+            new_commit_oid,
+            source_commit: source_oid,
+            parent_commit: head_oid,
+            written: written_actual,
+            deleted: deleted_actual,
+            unchanged: unchanged_count,
+            written_paths,
+            deleted_paths,
+        })
+    }
+}
+
+/// Load a blob object and return only its payload bytes (header stripped).
+///
+/// Errors out with `CorruptedObject` if the loaded object is not a blob —
+/// this should not happen on a well-formed store but is cheap to verify.
+// NOTE(review): the success type is presumably `bytes::Bytes` (`raw.slice(..)`
+// is returned unchanged and `show` describes `raw` as a `Bytes`) — confirm the
+// exact path/alias used by this crate.
+async fn read_blob_payload(
+    store: &dyn ObjectStore,
+    account: &str,
+    blob_oid: &gix_hash::ObjectId,
+) -> Result<bytes::Bytes, GitError> {
+    let raw = crate::git::util::read_object(store, account, blob_oid).await?;
+    let (kind, _, hdr) = crate::git::util::parse_object_header(&raw)?;
+    if kind != gix_object::Kind::Blob {
+        return Err(GitError::CorruptedObject(format!(
+            "expected blob, got {kind:?}"
+        )));
+    }
+    // `hdr` is the header length; everything after it is the blob payload.
+    Ok(raw.slice(hdr..))
+}
+
+/// Resolve `target_ref` to a commit OID.
+///
+/// Accepts:
+/// 1. 40-hex commit OID (validated by `ObjectId::from_hex`)
+/// 2. Abbreviated OID (4–39 hex chars) — resolved by listing refs and
+///    walking parent chains; returns `OidPrefixNotFound` or `AmbiguousOid`
+///    on zero / multiple matches
+/// 3. Full ref path beginning with `refs/` (passed through `validate_ref_name`,
+///    then read from `ref_store`)
+/// 4. Short branch name (e.g. "main") — auto-prefixed to `refs/heads/{name}`,
+///    validated, then read from `ref_store`
+///
+/// Returns `RefStoreError::NotFound` (wrapped) if the ref doesn't exist;
+/// `GitError::Other` if `target_ref` is neither a valid OID nor a valid ref name.
+///
+/// Note: a 40-char hex string is always interpreted as an OID, even if it
+/// happens to also be a valid branch name (e.g. `deadbeefdeadbeef...`).
+/// The same holds for all-hex names of 4–39 characters (e.g. `cafe`), which
+/// are always treated as abbreviated OIDs. To disambiguate such a branch,
+/// pass the full ref path `refs/heads/`. All-hex strings longer than 40
+/// characters can never match a 40-hex OID and are therefore handled as ref
+/// names (rule 4).
+async fn resolve_ref(
+    ref_store: &dyn RefStore,
+    object_store: &dyn ObjectStore,
+    account: &str,
+    target_ref: &str,
+) -> Result<ObjectId, GitError> {
+    let all_hex = target_ref.bytes().all(|b| b.is_ascii_hexdigit());
+
+    // 1. 40-hex commit OID — ASCII hex (case-insensitive), exactly len 40.
+    if target_ref.len() == 40 && all_hex {
+        return ObjectId::from_hex(target_ref.as_bytes())
+            .map_err(|e| GitError::Other(format!("invalid oid {target_ref}: {e}")));
+    }
+
+    // 2. Abbreviated OID (4–39 hex chars) — list refs and walk parent chains.
+    //    The upper bound matters: a longer all-hex string cannot be a prefix
+    //    of any 40-hex OID, so sending it down this path would only trigger a
+    //    full history walk that is guaranteed to find nothing.
+    if (4..40).contains(&target_ref.len()) && all_hex {
+        return resolve_abbreviated_oid(ref_store, object_store, account, target_ref).await;
+    }
+
+    // 3 & 4. Normalize to full ref path then read.
+    let full = if target_ref.starts_with("refs/") {
+        target_ref.to_string()
+    } else {
+        format!("refs/heads/{target_ref}")
+    };
+    crate::git::util::validate_ref_name(&full)?;
+    Ok(ref_store.read(account, &full).await?)
+}
+
+/// Decoded commit metadata used by `commit()` (just the tree) and `show()`
+/// (full set). Owned so callers don't have to juggle the raw buffer.
+struct CommitMeta {
+    tree: ObjectId,
+    parents: Vec<ObjectId>,
+    author: crate::git::types::Actor,
+    committer: crate::git::types::Actor,
+    message: String,
+}
+
+/// Read a commit object and return its decoded metadata.
+///
+/// Fails with `GitError::Other` when the object is not a commit or cannot be
+/// decoded; object-store read errors are propagated via `?`.
+async fn load_commit_meta(
+    store: &dyn ObjectStore,
+    account: &str,
+    commit_oid: &ObjectId,
+) -> Result<CommitMeta, GitError> {
+    let raw = crate::git::util::read_object(store, account, commit_oid).await?;
+    let (kind, _, hdr) = crate::git::util::parse_object_header(&raw)?;
+    if kind != gix_object::Kind::Commit {
+        return Err(GitError::Other(format!(
+            "expected commit object, got {kind:?}"
+        )));
+    }
+    // Parse only the payload; the loose-object header ends at `hdr`.
+    let parsed = gix_object::CommitRef::from_bytes(&raw[hdr..])
+        .map_err(|e| GitError::Other(format!("commit decode: {e}")))?;
+    Ok(CommitMeta {
+        tree: parsed.tree(),
+        parents: parsed.parents().collect(),
+        author: actor_from_signature_ref(&parsed.author),
+        committer: actor_from_signature_ref(&parsed.committer),
+        message: parsed.message.to_string(),
+    })
+}
+
+/// Resolve an abbreviated commit OID (4–39 hex chars) by walking the parent
+/// chains from every ref tip in the account. The traversal is bounded by
+/// `MAX_OID_RESOLVE_VISITED` to keep degenerate histories from running away.
+///
+/// Returns:
+/// - `Ok(oid)` if exactly one commit's hex starts with `prefix`.
+/// - `Err(GitError::OidPrefixNotFound)` if no commit matches.
+/// - `Err(GitError::AmbiguousOid)` if 2+ commits match (lists up to 5 candidates). +/// +/// Lowercases `prefix` before comparison; the input is already known to be +/// ASCII hex by the caller. +async fn resolve_abbreviated_oid( + ref_store: &dyn RefStore, + object_store: &dyn ObjectStore, + account: &str, + prefix: &str, +) -> Result { + use std::collections::HashSet; + + const MAX_OID_RESOLVE_VISITED: usize = 50_000; + const MAX_REPORTED_CANDIDATES: usize = 5; + + let prefix_lc = prefix.to_ascii_lowercase(); + + let refs = ref_store.list(account, "refs/").await?; + let mut visited: HashSet = HashSet::new(); + let mut queue: Vec = refs.into_iter().map(|(_, oid)| oid).collect(); + let mut matches: Vec = Vec::new(); + + while let Some(oid) = queue.pop() { + if !visited.insert(oid) { + continue; + } + if visited.len() > MAX_OID_RESOLVE_VISITED { + return Err(GitError::Other(format!( + "OID prefix resolution aborted: scanned over {MAX_OID_RESOLVE_VISITED} commits without converging" + ))); + } + if oid.to_hex().to_string().starts_with(&prefix_lc) { + matches.push(oid); + if matches.len() > MAX_REPORTED_CANDIDATES { + // Continue scanning a little longer to give a useful error, + // but we already know it's ambiguous. + break; + } + } + let meta = match load_commit_meta(object_store, account, &oid).await { + Ok(m) => m, + Err(GitError::ObjectStore(ObjectStoreError::NotFound(_))) => continue, + Err(GitError::Other(_)) => continue, // not a commit (tag etc.) 
— skip + Err(e) => return Err(e), + }; + for p in meta.parents { + if !visited.contains(&p) { + queue.push(p); + } + } + } + + match matches.len() { + 0 => Err(GitError::OidPrefixNotFound { + prefix: prefix.to_string(), + }), + 1 => Ok(matches.into_iter().next().unwrap()), + n => { + let listed: Vec = matches + .iter() + .take(MAX_REPORTED_CANDIDATES) + .map(|o| o.to_hex().to_string()) + .collect(); + Err(GitError::AmbiguousOid { + prefix: prefix.to_string(), + count: n, + candidates: listed.join(", "), + }) + } + } +} + +/// Project a borrowed `gix_actor::SignatureRef` into our owned `Actor` DTO. +/// +/// gix-actor 0.31.5 fields used: `SignatureRef.name: &BStr`, `.email: &BStr`, +/// `.time: gix_date::Time` (not the raw `&str` of later versions). `Time` +/// provides `.seconds: i64` and `.offset: i32`. +// TODO: gix_date::Time.sign dropped — Actor not roundtrip-safe for "-0000" +fn actor_from_signature_ref(sig: &gix_actor::SignatureRef<'_>) -> crate::git::types::Actor { + crate::git::types::Actor { + name: sig.name.to_string(), + email: sig.email.to_string(), + time_seconds: sig.time.seconds, + tz_offset_seconds: sig.time.offset, + } +} + +/// Return true iff `e` is `Error::NotFound(_)`. +fn is_not_found(e: &crate::core::errors::Error) -> bool { + matches!(e, crate::core::errors::Error::NotFound(_)) +} + +/// Project a `FileInfo` into the `(size, mtime_ns)` pair Fast Path 1 keys on. +/// +/// Returns `None` when the file's `mod_time` is unrepresentable (pre-epoch +/// times wider than `i128` can hold are degenerate). A `None` here means the +/// path simply will not participate in Fast Path 1 — the slow path +/// (read+SHA-1) is taken and the cache entry is dropped, never poisoned. 
+fn stat_signature(info: &FileInfo) -> Option<(u64, i128)> {
+    // A pre-epoch mtime makes `duration_since` fail, which maps to `None`.
+    info.mod_time
+        .duration_since(UNIX_EPOCH)
+        .ok()
+        .map(|since_epoch| (info.size, since_epoch.as_nanos() as i128))
+}
+
+/// Gatekeeper for `account` ids. Must run before the id is used to build any
+/// filesystem path (local backend) or S3 key prefix. It is the Rust-side
+/// counterpart of the Python `validate_account_id` and the single choke point
+/// that keeps a crafted account (e.g. `../x`, `a/b`, `a\b`) from escaping its
+/// per-account directory / key prefix when a binding is called directly.
+///
+/// Rules (mirroring `openviking/core/identifiers.py`), checked in this order:
+/// - non-empty
+/// - not `.` or `..`
+/// - only `[A-Za-z0-9_.@-]` (rejects `/`, `\`, whitespace, control chars, …)
+/// - at most one `@`
+/// - must not start with `_`
+fn validate_account_id(account: &str) -> Result<(), GitError> {
+    let reject = |reason: String| -> Result<(), GitError> {
+        Err(GitError::InvalidAccountId(reason))
+    };
+
+    if account.is_empty() {
+        return reject("account_id is empty".into());
+    }
+    if matches!(account, "." | "..") {
+        return reject("account_id must not be '.' or '..'".into());
+    }
+
+    let has_illegal_byte = account
+        .bytes()
+        .any(|b| !(b.is_ascii_alphanumeric() || matches!(b, b'_' | b'.' | b'@' | b'-')));
+    if has_illegal_byte {
+        return reject(format!(
+            "account_id must be an alphanumeric string: {account:?}"
+        ));
+    }
+
+    if account.matches('@').count() > 1 {
+        return reject("account_id must have at most one @".into());
+    }
+    if account.as_bytes().first() == Some(&b'_') {
+        return reject("account_id cannot start with underscore _".into());
+    }
+    Ok(())
+}
+
+/// Validate `project_dir` matches the rules of `TreeEditor::upsert`:
+/// non-empty, no leading/trailing `/`, no empty components, no `.` / `..`
+/// segments, no backslash, no control characters. The traversal-related
+/// rules guard the same boundary as `validate_account_id`: a direct PyO3
+/// caller could otherwise pass `project_dir="../other"` and have the
+/// service splice or restore *outside* the account's tree once the path is
+/// concatenated into `/local/{account}/{project_dir}/...`.
+fn validate_project_dir(project_dir: &str) -> Result<(), GitError> {
+    let reject = |reason: String| -> Result<(), GitError> {
+        Err(GitError::InvalidProjectDir(reason))
+    };
+
+    if project_dir.is_empty() {
+        return reject("project_dir must be non-empty".into());
+    }
+
+    let raw = project_dir.as_bytes();
+    if raw.first() == Some(&b'/') || raw.last() == Some(&b'/') {
+        return reject(format!(
+            "project_dir must not start or end with '/': {project_dir:?}"
+        ));
+    }
+
+    // Report the first offending segment, in path order.
+    let first_bad_segment = project_dir
+        .split('/')
+        .find(|seg| matches!(*seg, "" | "." | ".."));
+    if let Some(bad) = first_bad_segment {
+        return reject(if bad.is_empty() {
+            format!("project_dir contains empty segment: {project_dir:?}")
+        } else {
+            format!("project_dir contains '.' or '..' segment: {project_dir:?}")
+        });
+    }
+
+    if raw.iter().any(|&b| b == b'\\') {
+        return reject(format!(
+            "project_dir must not contain backslash: {project_dir:?}"
+        ));
+    }
+    if raw.iter().any(|&b| matches!(b, 0x00..=0x1f | 0x7f)) {
+        return reject(format!(
+            "project_dir contains control character: {project_dir:?}"
+        ));
+    }
+    Ok(())
+}
+
+/// Validate a user-supplied relative path that will be concatenated with
+/// `/local/{account}/` (commit) or looked up in a Git tree (show). Same
+/// reasoning as `validate_account_id` / `validate_project_dir`: the Rust
+/// GitService is a native boundary, so it must defend against `..` /
+/// backslash / control chars itself rather than trust the caller (PyO3
+/// binding, future SDK consumer) to have normalized first.
+///
+/// Rules: non-empty; no leading/trailing `/`; no empty, `.`, or `..`
+/// segment; no backslash; no control character.
+fn validate_relative_path(path: &str) -> Result<(), GitError> {
+    let reject = |reason: String| -> Result<(), GitError> { Err(GitError::InvalidPath(reason)) };
+
+    if path.is_empty() {
+        return reject("path must be non-empty".into());
+    }
+
+    let raw = path.as_bytes();
+    if raw.first() == Some(&b'/') || raw.last() == Some(&b'/') {
+        return reject(format!("path must not start or end with '/': {path:?}"));
+    }
+
+    // Report the first offending segment, in path order.
+    let first_bad_segment = path.split('/').find(|seg| matches!(*seg, "" | "." | ".."));
+    if let Some(bad) = first_bad_segment {
+        return reject(if bad.is_empty() {
+            format!("path contains empty segment: {path:?}")
+        } else {
+            format!("path contains '.' or '..' segment: {path:?}")
+        });
+    }
+
+    if raw.iter().any(|&b| b == b'\\') {
+        return reject(format!("path must not contain backslash: {path:?}"));
+    }
+    if raw.iter().any(|&b| matches!(b, 0x00..=0x1f | 0x7f)) {
+        return reject(format!("path contains control character: {path:?}"));
+    }
+    Ok(())
+}
+
+/// Pure-function diff between two flattened subtrees.
+///
+/// Both inputs are `(path, oid)` slices as returned by `tree_builder::flatten`
+/// on a subtree OID — meaning the paths are already relative to the subtree
+/// root (no `project_dir` prefix). Results are sorted by path.
+fn compute_subtree_diff( + source: &[(String, gix_hash::ObjectId)], + head: &[(String, gix_hash::ObjectId)], +) -> crate::git::types::RestoreDiff { + use std::collections::HashMap; + let head_map: HashMap<&str, &gix_hash::ObjectId> = + head.iter().map(|(p, o)| (p.as_str(), o)).collect(); + let source_map: HashMap<&str, &gix_hash::ObjectId> = + source.iter().map(|(p, o)| (p.as_str(), o)).collect(); + + let mut to_write = Vec::new(); + let mut unchanged = Vec::new(); + for (path, oid) in source { + match head_map.get(path.as_str()) { + Some(head_oid) if *head_oid == oid => unchanged.push(path.clone()), + _ => to_write.push((path.clone(), *oid)), + } + } + let mut to_delete: Vec = head + .iter() + .filter(|(p, _)| !source_map.contains_key(p.as_str())) + .map(|(p, _)| p.clone()) + .collect(); + + to_write.sort_by(|a, b| a.0.cmp(&b.0)); + to_delete.sort(); + unchanged.sort(); + crate::git::types::RestoreDiff { + to_write, + to_delete, + unchanged, + } +} + +#[cfg(test)] +mod tests { + use super::*; + use async_trait::async_trait; + use std::collections::{HashMap, HashSet}; + use std::sync::{Arc, Mutex}; + + use crate::core::errors::{Error, Result}; + use crate::core::filesystem::FileSystem; + use crate::core::types::{FileInfo, TreeEntry, WriteFlag}; + use crate::git::backends::local::{LocalObjectStore, LocalRefStore}; + use crate::git::error::ObjectStoreError; + use crate::git::error::RefStoreError; + use crate::git::tree_builder::{flatten, lookup}; + + /// In-memory VFS mock that owns a map from absolute path to bytes. + /// Root for the account is always `/local/{account}` — paths inserted + /// must be the absolute path including this prefix. + struct MockVfs { + account: String, + files: Arc>>>, + /// When true, `remove` returns NotFound for absent paths (like the real + /// VFS) instead of silently succeeding. Used to exercise the idempotent + /// delete path in restore. + strict_remove: bool, + /// Absolute paths whose `write` call should fail with an I/O error. 
+ /// Used by restore-partial tests to force a per-path writeback failure + /// without otherwise breaking the mock VFS. + fail_writes: Arc>>, + /// Absolute paths whose `remove` call should fail with an I/O error + /// (NotFound-style errors are still produced by the existing + /// `strict_remove` flag, not via this set — that mirrors the real + /// "idempotent delete" semantics in service.rs). + fail_removes: Arc>>, + } + + impl MockVfs { + fn new(account: &str) -> Arc { + Arc::new(Self { + account: account.to_string(), + files: Arc::new(Mutex::new(HashMap::new())), + strict_remove: false, + fail_writes: Arc::new(Mutex::new(HashSet::new())), + fail_removes: Arc::new(Mutex::new(HashSet::new())), + }) + } + + fn new_strict_remove(account: &str) -> Arc { + Arc::new(Self { + account: account.to_string(), + files: Arc::new(Mutex::new(HashMap::new())), + strict_remove: true, + fail_writes: Arc::new(Mutex::new(HashSet::new())), + fail_removes: Arc::new(Mutex::new(HashSet::new())), + }) + } + + /// Insert/update file content. `rel` is account-relative. + fn put(&self, rel: &str, data: &[u8]) { + let abs = format!("/local/{}/{}", self.account, rel); + self.files.lock().unwrap().insert(abs, data.to_vec()); + } + + /// Delete a file by account-relative path. + fn delete(&self, rel: &str) { + let abs = format!("/local/{}/{}", self.account, rel); + self.files.lock().unwrap().remove(&abs); + } + + /// Cause subsequent `write` calls targeting `rel` (account-relative) + /// to return an I/O error. + fn fail_write(&self, rel: &str) { + let abs = format!("/local/{}/{}", self.account, rel); + self.fail_writes.lock().unwrap().insert(abs); + } + + /// Cause subsequent `remove` calls targeting `rel` (account-relative) + /// to return an I/O error. 
+ fn fail_remove(&self, rel: &str) { + let abs = format!("/local/{}/{}", self.account, rel); + self.fail_removes.lock().unwrap().insert(abs); + } + } + + #[async_trait] + impl FileSystem for MockVfs { + async fn create(&self, _path: &str) -> Result<()> { + unimplemented!() + } + async fn mkdir(&self, _path: &str, _mode: u32) -> Result<()> { + // Directories are implicit in this flat file map, so creating one is + // a no-op. Defined (rather than unimplemented!) so the default + // `ensure_parent_dirs` used by restore's writeback succeeds. + Ok(()) + } + async fn remove(&self, path: &str) -> Result<()> { + if self.fail_removes.lock().unwrap().contains(path) { + return Err(Error::Internal(format!("forced remove failure: {path}"))); + } + let existed = self.files.lock().unwrap().remove(path).is_some(); + if self.strict_remove && !existed { + return Err(Error::not_found(path)); + } + Ok(()) + } + async fn remove_all(&self, _path: &str) -> Result<()> { + unimplemented!() + } + + async fn read(&self, path: &str, _offset: u64, _size: u64) -> Result> { + let g = self.files.lock().unwrap(); + match g.get(path) { + Some(bytes) => Ok(bytes.clone()), + None => Err(Error::not_found(path)), + } + } + + async fn write( + &self, + path: &str, + data: &[u8], + _offset: u64, + _flags: WriteFlag, + ) -> Result { + if self.fail_writes.lock().unwrap().contains(path) { + return Err(Error::Internal(format!("forced write failure: {path}"))); + } + self.files + .lock() + .unwrap() + .insert(path.to_string(), data.to_vec()); + Ok(data.len() as u64) + } + async fn read_dir(&self, _path: &str) -> Result> { + unimplemented!() + } + + async fn stat(&self, path: &str) -> Result { + let g = self.files.lock().unwrap(); + if let Some(bytes) = g.get(path) { + let name = path.rsplit('/').next().unwrap_or(path).to_string(); + return Ok(FileInfo::new_file(name, bytes.len() as u64, 0o644)); + } + Err(Error::not_found(path)) + } + + async fn rename(&self, _old_path: &str, _new_path: &str) -> Result<()> { 
+ unimplemented!() + } + async fn chmod(&self, _path: &str, _mode: u32) -> Result<()> { + unimplemented!() + } + + async fn tree_directory( + &self, + path: &str, + _show_hidden: bool, + _node_limit: Option, + _level_limit: Option, + ) -> Result> { + let prefix = if path == "/" { + "/".to_string() + } else { + format!("{}/", path) + }; + let g = self.files.lock().unwrap(); + let mut out = Vec::new(); + for (full_path, _bytes) in g.iter() { + if !full_path.starts_with(&prefix) { + continue; + } + let rel = full_path + .strip_prefix(&prefix) + .unwrap_or(full_path) + .to_string(); + let name = full_path + .rsplit('/') + .next() + .unwrap_or(full_path) + .to_string(); + let info = FileInfo::new_file(name, 0, 0o644); + out.push(TreeEntry { + path: full_path.clone(), + rel_path: rel, + info, + extra: HashMap::new(), + }); + } + Ok(out) + } + } + + /// Helper: build a fresh GitService backed by a temp dir + a fresh + /// in-memory VFS for the given account. + fn make_service( + account: &str, + ) -> ( + tempfile::TempDir, + Arc, + Arc, + Arc, + GitService, + ) { + let dir = tempfile::tempdir().unwrap(); + let object_store = Arc::new(LocalObjectStore::new(dir.path())); + let ref_store = Arc::new(LocalRefStore::new(dir.path())); + let vfs = MockVfs::new(account); + let svc = GitService::new( + vfs.clone() as Arc, + object_store.clone() as Arc, + ref_store.clone() as Arc, + ); + (dir, vfs, object_store, ref_store, svc) + } + + fn req( + account: &str, + branch: &str, + message: &str, + paths: Option>, + ) -> CommitRequest { + CommitRequest { + account: account.to_string(), + branch: branch.to_string(), + message: message.to_string(), + paths, + author_name: "tester".to_string(), + author_email: "tester@example.com".to_string(), + } + } + + /// Load a commit's parent OIDs from the object store. 
+ async fn commit_parents( + store: &dyn ObjectStore, + account: &str, + commit_oid: ObjectId, + ) -> Vec { + let raw = crate::git::util::read_object(store, account, &commit_oid) + .await + .unwrap(); + let (_, _, hdr) = crate::git::util::parse_object_header(&raw).unwrap(); + let parsed = gix_object::CommitRef::from_bytes(&raw[hdr..]).unwrap(); + parsed.parents().collect() + } + + async fn commit_tree(store: &dyn ObjectStore, account: &str, commit_oid: ObjectId) -> ObjectId { + load_commit_meta(store, account, &commit_oid) + .await + .unwrap() + .tree + } + + /// Make a commit and return its OID. + async fn make_commit(svc: &GitService, account: &str, branch: &str, msg: &str) -> ObjectId { + match svc.commit(req(account, branch, msg, None)).await.unwrap() { + CommitResponse::Created { commit_oid, .. } => commit_oid, + other => panic!("expected Created, got {other:?}"), + } + } + + // ── 1 ────────────────────────────────────────────────────────────── + #[tokio::test] + async fn test_commit_first_creates_root_commit() { + let (_dir, vfs, object_store, ref_store, svc) = make_service("acct"); + vfs.put("resources/a.md", b"hello"); + + let resp = svc + .commit(req("acct", "main", "first", None)) + .await + .unwrap(); + + match resp { + CommitResponse::Created { + commit_oid, + changed, + } => { + assert!(changed >= 1, "should record at least one change"); + let parents = commit_parents( + object_store.as_ref() as &dyn ObjectStore, + "acct", + commit_oid, + ) + .await; + assert!(parents.is_empty(), "root commit must have no parents"); + let tree = commit_tree( + object_store.as_ref() as &dyn ObjectStore, + "acct", + commit_oid, + ) + .await; + assert_ne!(tree, ObjectId::empty_tree(gix_hash::Kind::Sha1)); + let head = ref_store.read("acct", "refs/heads/main").await.unwrap(); + assert_eq!(head, commit_oid); + } + other => panic!("expected Created, got {other:?}"), + } + } + + // ── 2 ────────────────────────────────────────────────────────────── + #[tokio::test] + async 
fn test_commit_second_links_to_first() { + let (_dir, vfs, object_store, _ref_store, svc) = make_service("acct"); + vfs.put("resources/a.md", b"hello"); + let first = svc + .commit(req("acct", "main", "first", None)) + .await + .unwrap(); + let first_oid = match first { + CommitResponse::Created { commit_oid, .. } => commit_oid, + other => panic!("expected Created, got {other:?}"), + }; + + vfs.put("resources/a.md", b"world"); + let second = svc + .commit(req("acct", "main", "second", None)) + .await + .unwrap(); + let second_oid = match second { + CommitResponse::Created { commit_oid, .. } => commit_oid, + other => panic!("expected Created, got {other:?}"), + }; + + let parents = commit_parents( + object_store.as_ref() as &dyn ObjectStore, + "acct", + second_oid, + ) + .await; + assert_eq!(parents, vec![first_oid]); + } + + // ── 3 ────────────────────────────────────────────────────────────── + #[tokio::test] + async fn test_commit_noop_when_nothing_changed() { + let (_dir, vfs, _object_store, ref_store, svc) = make_service("acct"); + vfs.put("resources/a.md", b"hello"); + let first = svc + .commit(req("acct", "main", "first", None)) + .await + .unwrap(); + let first_oid = match first { + CommitResponse::Created { commit_oid, .. 
} => commit_oid, + other => panic!("expected Created, got {other:?}"), + }; + + let second = svc.commit(req("acct", "main", "noop", None)).await.unwrap(); + match second { + CommitResponse::Noop { commit_oid } => assert_eq!(commit_oid, first_oid), + other => panic!("expected Noop, got {other:?}"), + } + + let head = ref_store.read("acct", "refs/heads/main").await.unwrap(); + assert_eq!(head, first_oid); + } + + // ── 4 ────────────────────────────────────────────────────────────── + #[tokio::test] + async fn test_commit_handles_deletes() { + let (_dir, vfs, object_store, _ref_store, svc) = make_service("acct"); + vfs.put("resources/a.md", b"hello"); + vfs.put("resources/b.md", b"world"); + let _ = svc + .commit(req("acct", "main", "first", None)) + .await + .unwrap(); + + vfs.delete("resources/a.md"); + let resp = svc + .commit(req( + "acct", + "main", + "delete-a", + Some(vec!["resources/a.md".to_string()]), + )) + .await + .unwrap(); + let second_oid = match resp { + CommitResponse::Created { commit_oid, .. } => commit_oid, + other => panic!("expected Created, got {other:?}"), + }; + + let tree = commit_tree( + object_store.as_ref() as &dyn ObjectStore, + "acct", + second_oid, + ) + .await; + let all = flatten( + object_store.as_ref() as &dyn ObjectStore, + "acct", + tree, + &None, + ) + .await + .unwrap(); + let paths: Vec = all.into_iter().map(|(p, _)| p).collect(); + assert_eq!(paths, vec!["resources/b.md".to_string()]); + } + + /// Helper: list every blob path in a commit's tree, sorted. 
+ async fn commit_paths( + store: &dyn ObjectStore, + account: &str, + commit_oid: ObjectId, + ) -> Vec { + let tree = commit_tree(store, account, commit_oid).await; + let all = flatten(store, account, tree, &None).await.unwrap(); + all.into_iter().map(|(p, _)| p).collect() + } + + // ── 4b ───────────────────────────────────────────────────────────── + /// Full enumeration (`paths=None`) must capture a deletion: a file gone + /// from disk but present in prev_tree is dropped from the new snapshot. + #[tokio::test] + async fn test_full_commit_captures_delete() { + let (_dir, vfs, object_store, _ref_store, svc) = make_service("acct"); + vfs.put("resources/a.md", b"hello"); + vfs.put("resources/b.md", b"world"); + let _ = make_commit(&svc, "acct", "main", "first").await; + + vfs.delete("resources/a.md"); + let resp = svc + .commit(req("acct", "main", "full-delete", None)) + .await + .unwrap(); + let oid = match resp { + CommitResponse::Created { + commit_oid, + changed, + } => { + assert_eq!(changed, 1, "exactly one path (a.md) was removed"); + commit_oid + } + other => panic!("expected Created, got {other:?}"), + }; + + let paths = commit_paths(object_store.as_ref() as &dyn ObjectStore, "acct", oid).await; + assert_eq!(paths, vec!["resources/b.md".to_string()]); + } + + // ── 4c ───────────────────────────────────────────────────────────── + /// Full enumeration must capture deletion of an entire subdirectory. 
+ #[tokio::test] + async fn test_full_commit_captures_subdir_delete() { + let (_dir, vfs, object_store, _ref_store, svc) = make_service("acct"); + vfs.put("resources/keep.md", b"keep"); + vfs.put("resources/sub/a.md", b"a"); + vfs.put("resources/sub/b.md", b"b"); + let _ = make_commit(&svc, "acct", "main", "first").await; + + vfs.delete("resources/sub/a.md"); + vfs.delete("resources/sub/b.md"); + let resp = svc + .commit(req("acct", "main", "drop-sub", None)) + .await + .unwrap(); + let oid = match resp { + CommitResponse::Created { + commit_oid, + changed, + } => { + assert_eq!(changed, 2, "both files under sub/ were removed"); + commit_oid + } + other => panic!("expected Created, got {other:?}"), + }; + + let paths = commit_paths(object_store.as_ref() as &dyn ObjectStore, "acct", oid).await; + assert_eq!(paths, vec!["resources/keep.md".to_string()]); + } + + // ── 4d ───────────────────────────────────────────────────────────── + /// Full enumeration must handle a file→dir transition: `foo` was a file, + /// now `foo/bar.md` is a directory entry. The stale blob is replaced. + #[tokio::test] + async fn test_full_commit_file_to_dir_transition() { + let (_dir, vfs, object_store, _ref_store, svc) = make_service("acct"); + vfs.put("foo", b"i am a file"); + let _ = make_commit(&svc, "acct", "main", "first").await; + + vfs.delete("foo"); + vfs.put("foo/bar.md", b"now a dir"); + let resp = svc + .commit(req("acct", "main", "file-to-dir", None)) + .await + .unwrap(); + let oid = match resp { + CommitResponse::Created { commit_oid, .. } => commit_oid, + other => panic!("expected Created, got {other:?}"), + }; + + let paths = commit_paths(object_store.as_ref() as &dyn ObjectStore, "acct", oid).await; + assert_eq!(paths, vec!["foo/bar.md".to_string()]); + } + + // ── 4e ───────────────────────────────────────────────────────────── + /// Full enumeration must handle a dir→file transition: `foo/bar.md` was a + /// directory, now `foo` is a file. The stale subtree is dropped. 
+ #[tokio::test] + async fn test_full_commit_dir_to_file_transition() { + let (_dir, vfs, object_store, _ref_store, svc) = make_service("acct"); + vfs.put("foo/bar.md", b"i am in a dir"); + let _ = make_commit(&svc, "acct", "main", "first").await; + + vfs.delete("foo/bar.md"); + vfs.put("foo", b"now a file"); + let resp = svc + .commit(req("acct", "main", "dir-to-file", None)) + .await + .unwrap(); + let oid = match resp { + CommitResponse::Created { commit_oid, .. } => commit_oid, + other => panic!("expected Created, got {other:?}"), + }; + + let paths = commit_paths(object_store.as_ref() as &dyn ObjectStore, "acct", oid).await; + assert_eq!(paths, vec!["foo".to_string()]); + } + + // ── 4f ───────────────────────────────────────────────────────────── + /// Multi-level dir→file: `foo/bar/baz.md` collapses to a file `foo`. + #[tokio::test] + async fn test_full_commit_dir_to_file_transition_multilevel() { + let (_dir, vfs, object_store, _ref_store, svc) = make_service("acct"); + vfs.put("foo/bar/baz.md", b"deep"); + let _ = make_commit(&svc, "acct", "main", "first").await; + + vfs.delete("foo/bar/baz.md"); + vfs.put("foo", b"now a file"); + let resp = svc + .commit(req("acct", "main", "deep-collapse", None)) + .await + .unwrap(); + let oid = match resp { + CommitResponse::Created { commit_oid, .. 
} => commit_oid, + other => panic!("expected Created, got {other:?}"), + }; + + let paths = commit_paths(object_store.as_ref() as &dyn ObjectStore, "acct", oid).await; + assert_eq!(paths, vec!["foo".to_string()]); + } + + // ── 5 ────────────────────────────────────────────────────────────── + #[tokio::test] + async fn test_commit_with_explicit_paths_skips_others() { + let (_dir, vfs, object_store, _ref_store, svc) = make_service("acct"); + vfs.put("resources/a.md", b"A"); + vfs.put("resources/b.md", b"B"); + vfs.put("resources/c.md", b"C"); + + let resp = svc + .commit(req( + "acct", + "main", + "only-a", + Some(vec!["resources/a.md".to_string()]), + )) + .await + .unwrap(); + let oid = match resp { + CommitResponse::Created { commit_oid, .. } => commit_oid, + other => panic!("expected Created, got {other:?}"), + }; + + let tree = commit_tree(object_store.as_ref() as &dyn ObjectStore, "acct", oid).await; + let all = flatten( + object_store.as_ref() as &dyn ObjectStore, + "acct", + tree, + &None, + ) + .await + .unwrap(); + let paths: Vec = all.into_iter().map(|(p, _)| p).collect(); + assert_eq!(paths, vec!["resources/a.md".to_string()]); + // Sanity-check the blob is reachable via lookup too. + let found = lookup( + object_store.as_ref() as &dyn ObjectStore, + "acct", + tree, + "resources/a.md", + ) + .await + .unwrap(); + assert!(found.is_some()); + } + + // ── 6 ────────────────────────────────────────────────────────────── + + /// Wrapping RefStore that forces the next `cas_update` call to fail + /// with `Conflict`, then delegates to the inner store afterwards. 
+ struct ConflictOnceRef { + inner: Arc, + fired: Mutex, + actual: Option, + } + + #[async_trait] + impl RefStore for ConflictOnceRef { + async fn read( + &self, + account: &str, + ref_name: &str, + ) -> std::result::Result { + self.inner.read(account, ref_name).await + } + + async fn cas_update( + &self, + account: &str, + ref_name: &str, + expected: Option, + new: ObjectId, + ) -> std::result::Result<(), RefStoreError> { + let should_conflict = { + let mut fired = self.fired.lock().unwrap(); + if !*fired { + *fired = true; + true + } else { + false + } + }; + if should_conflict { + return Err(RefStoreError::Conflict { + expected, + actual: self.actual, + }); + } + self.inner + .cas_update(account, ref_name, expected, new) + .await + } + + async fn list( + &self, + account: &str, + prefix: &str, + ) -> std::result::Result, RefStoreError> { + self.inner.list(account, prefix).await + } + } + + #[tokio::test] + async fn test_commit_cas_conflict_surfaces_as_error() { + let dir = tempfile::tempdir().unwrap(); + let object_store = Arc::new(LocalObjectStore::new(dir.path())); + let inner_ref = Arc::new(LocalRefStore::new(dir.path())); + let bogus = ObjectId::from_hex(b"deadbeefdeadbeefdeadbeefdeadbeefdeadbeef").unwrap(); + let ref_store = Arc::new(ConflictOnceRef { + inner: inner_ref.clone(), + fired: Mutex::new(false), + actual: Some(bogus), + }); + let vfs = MockVfs::new("acct"); + vfs.put("resources/a.md", b"hello"); + let svc = GitService::new( + vfs.clone() as Arc, + object_store.clone() as Arc, + ref_store.clone() as Arc, + ); + + let result = svc.commit(req("acct", "main", "boom", None)).await; + match result { + Err(GitError::ConcurrentCommit { + ref_name, + expected, + actual, + }) => { + assert_eq!(ref_name, "refs/heads/main"); + assert_eq!(expected, None); + assert_eq!(actual, Some(bogus)); + } + other => panic!("expected ConcurrentCommit, got {other:?}"), + } + } + + // ── 7 ────────────────────────────────────────────────────────────── + // Verifies the 
incremental commit path reuses unchanged subtree OIDs: + // modifying a file under `resources/` must NOT rewrite the `agent/` + // subtree object — its OID must be byte-identical across commits. + #[tokio::test] + async fn test_commit_incremental_reuses_unchanged_subtree_oids() { + let (_dir, vfs, object_store, _ref_store, svc) = make_service("acct"); + vfs.put("resources/a.md", b"hello"); + vfs.put("agent/b.py", b"print('hi')"); + + let first = svc + .commit(req("acct", "main", "first", None)) + .await + .unwrap(); + let first_oid = match first { + CommitResponse::Created { commit_oid, .. } => commit_oid, + other => panic!("expected Created, got {other:?}"), + }; + let first_tree = + commit_tree(object_store.as_ref() as &dyn ObjectStore, "acct", first_oid).await; + let agent_first = lookup( + object_store.as_ref() as &dyn ObjectStore, + "acct", + first_tree, + "agent", + ) + .await + .unwrap() + .expect("agent subtree must exist after first commit"); + assert!(agent_first.1.is_tree(), "agent entry must be a tree"); + + // Touch only resources/a.md. + vfs.put("resources/a.md", b"world"); + let second = svc + .commit(req("acct", "main", "second", None)) + .await + .unwrap(); + let second_oid = match second { + CommitResponse::Created { commit_oid, .. 
} => commit_oid, + other => panic!("expected Created, got {other:?}"), + }; + let second_tree = commit_tree( + object_store.as_ref() as &dyn ObjectStore, + "acct", + second_oid, + ) + .await; + assert_ne!( + first_tree, second_tree, + "root tree must change because resources/a.md changed", + ); + let agent_second = lookup( + object_store.as_ref() as &dyn ObjectStore, + "acct", + second_tree, + "agent", + ) + .await + .unwrap() + .expect("agent subtree must still exist after second commit"); + + assert_eq!( + agent_first.0, agent_second.0, + "unchanged agent/ subtree OID must be reused across commits", + ); + } + + // ── 8 ────────────────────────────────────────────────────────────── + #[tokio::test] + async fn test_commit_skips_pruned_paths() { + let (_dir, vfs, object_store, _ref_store, svc) = make_service("acct"); + vfs.put("resources/a.md", b"hello"); + vfs.put("resources/x.faiss", b"FAISS"); + vfs.put("_system/lock", b"L"); + + let resp = svc + .commit(req("acct", "main", "filtered", None)) + .await + .unwrap(); + let oid = match resp { + CommitResponse::Created { commit_oid, .. } => commit_oid, + other => panic!("expected Created, got {other:?}"), + }; + + let tree = commit_tree(object_store.as_ref() as &dyn ObjectStore, "acct", oid).await; + let all = flatten( + object_store.as_ref() as &dyn ObjectStore, + "acct", + tree, + &None, + ) + .await + .unwrap(); + let paths: Vec = all.into_iter().map(|(p, _)| p).collect(); + assert_eq!(paths, vec!["resources/a.md".to_string()]); + } + + // ── commit: paths supports directories ────────────────────────────── + /// A directory in `paths` is expanded to every file under it that + /// survives pruning. Files under the directory that were in the + /// previous tree but have since been deleted from the VFS must drop + /// out of the new snapshot. 
+ /// + /// Backed by `LocalFileSystem`: `MockVfs::stat` returns NotFound for + /// any directory entry, which would route this test through Step 2.5's + /// NotFound branch instead of the Directory branch. A real filesystem + /// is the only fixture where `stat("/local/acct/docs")` returns + /// `is_dir = true`. + #[tokio::test] + async fn test_commit_paths_expands_directory_and_drops_deleted_files() { + use crate::plugins::localfs::LocalFileSystem; + + let store_dir = tempfile::tempdir().unwrap(); + let object_store = Arc::new(LocalObjectStore::new(store_dir.path())); + let ref_store = Arc::new(LocalRefStore::new(store_dir.path())); + let work_dir = tempfile::tempdir().unwrap(); + let acct_root = work_dir.path().join("local").join("acct"); + std::fs::create_dir_all(acct_root.join("docs")).unwrap(); + std::fs::create_dir_all(acct_root.join("other")).unwrap(); + std::fs::write(acct_root.join("docs/a.md"), b"AA").unwrap(); + std::fs::write(acct_root.join("docs/b.md"), b"BB").unwrap(); + std::fs::write(acct_root.join("other/c.md"), b"CC").unwrap(); + let vfs: Arc = + Arc::new(LocalFileSystem::new(work_dir.path().to_str().unwrap()).unwrap()); + let svc = GitService::new(vfs, object_store.clone(), ref_store); + + let _ = make_commit(&svc, "acct", "main", "first").await; + + // Delete b.md from VFS, add d.md, leave a.md unchanged. + std::fs::remove_file(acct_root.join("docs/b.md")).unwrap(); + std::fs::write(acct_root.join("docs/d.md"), b"DD").unwrap(); + + let resp = svc + .commit(req("acct", "main", "scoped", Some(vec!["docs".into()]))) + .await + .unwrap(); + let commit_oid = match resp { + CommitResponse::Created { commit_oid, .. } => commit_oid, + other => panic!("expected Created, got {other:?}"), + }; + + // Verify the new tree through show(): + // docs/a.md still present, docs/b.md gone, docs/d.md present, + // other/c.md untouched. 
+ let oid_hex = commit_oid.to_hex().to_string(); + assert!(matches!( + svc.show(ShowRequest { + account: "acct".into(), + target_ref: oid_hex.clone(), + path: Some("docs/a.md".into()), + }) + .await, + Ok(ShowResponse::Blob { .. }) + )); + assert!(matches!( + svc.show(ShowRequest { + account: "acct".into(), + target_ref: oid_hex.clone(), + path: Some("docs/b.md".into()), + }) + .await, + Err(GitError::PathNotFound(_)) + )); + assert!(matches!( + svc.show(ShowRequest { + account: "acct".into(), + target_ref: oid_hex.clone(), + path: Some("docs/d.md".into()), + }) + .await, + Ok(ShowResponse::Blob { .. }) + )); + assert!(matches!( + svc.show(ShowRequest { + account: "acct".into(), + target_ref: oid_hex, + path: Some("other/c.md".into()), + }) + .await, + Ok(ShowResponse::Blob { .. }) + )); + } + + /// If the directory passed in `paths` does not exist in the VFS at all, + /// every file under that prefix in prev_tree is dropped from the new + /// snapshot. A `warn!` is emitted but no error is returned. + /// Uses MockVfs: the directory is "missing" so Step 2.5 sees NotFound. + #[tokio::test] + async fn test_commit_paths_notfound_directory_drops_subtree() { + let (_dir, vfs, _object_store, _ref_store, svc) = make_service("acct"); + vfs.put("docs/a.md", b"AA"); + vfs.put("docs/b.md", b"BB"); + vfs.put("other/c.md", b"CC"); + let _ = make_commit(&svc, "acct", "main", "first").await; + + // Whole directory disappears. 
+ vfs.delete("docs/a.md"); + vfs.delete("docs/b.md"); + + let resp = svc + .commit(req("acct", "main", "drop dir", Some(vec!["docs".into()]))) + .await + .unwrap(); + let commit_oid = match resp { + CommitResponse::Created { + commit_oid, + changed, + } => { + assert_eq!(changed, 3, "three files removed from snapshot"); + commit_oid + } + other => panic!("expected Created, got {other:?}"), + }; + + let oid_hex = commit_oid.to_hex().to_string(); + assert!(matches!( + svc.show(ShowRequest { + account: "acct".into(), + target_ref: oid_hex.clone(), + path: Some("docs/a.md".into()), + }) + .await, + Err(GitError::PathNotFound(_)) + )); + assert!(matches!( + svc.show(ShowRequest { + account: "acct".into(), + target_ref: oid_hex, + path: Some("other/c.md".into()), + }) + .await, + Ok(ShowResponse::Blob { .. }) + )); + } + + /// Pruning applies to explicit directories: passing `_system` results + /// in a Noop commit (the directory does not exist in the VFS, but even + /// if it did, every entry under it would be pruned). + #[tokio::test] + async fn test_commit_paths_pruned_directory_is_noop() { + let (_dir, vfs, _object_store, _ref_store, svc) = make_service("acct"); + vfs.put("resources/a.md", b"AA"); + let first = make_commit(&svc, "acct", "main", "first").await; + + let resp = svc + .commit(req( + "acct", + "main", + "pruned dir", + Some(vec!["_system".into()]), + )) + .await + .unwrap(); + match resp { + CommitResponse::Noop { commit_oid } => assert_eq!(commit_oid, first), + other => panic!("expected Noop, got {other:?}"), + } + } + + /// Pruning applies to explicit files: passing a pruned file path is + /// equivalent to passing nothing. Noop on top of an existing commit. 
+ #[tokio::test] + async fn test_commit_paths_pruned_file_is_noop() { + let (_dir, vfs, _object_store, _ref_store, svc) = make_service("acct"); + vfs.put("resources/a.md", b"AA"); + vfs.put("_system/lock", b"LL"); // pruned, never committed + let first = make_commit(&svc, "acct", "main", "first").await; + + let resp = svc + .commit(req( + "acct", + "main", + "pruned file", + Some(vec!["_system/lock".into()]), + )) + .await + .unwrap(); + match resp { + CommitResponse::Noop { commit_oid } => assert_eq!(commit_oid, first), + other => panic!("expected Noop, got {other:?}"), + } + } + + /// Mixing a file and a directory containing that file processes each + /// candidate exactly once. The resulting commit must record exactly + /// the directory's content, not double-process the listed file. Uses + /// LocalFileSystem so the Directory branch actually runs. + #[tokio::test] + async fn test_commit_paths_mixed_file_and_dir_dedup() { + use crate::plugins::localfs::LocalFileSystem; + + let store_dir = tempfile::tempdir().unwrap(); + let object_store = Arc::new(LocalObjectStore::new(store_dir.path())); + let ref_store = Arc::new(LocalRefStore::new(store_dir.path())); + let work_dir = tempfile::tempdir().unwrap(); + let acct_root = work_dir.path().join("local").join("acct"); + std::fs::create_dir_all(acct_root.join("docs")).unwrap(); + std::fs::write(acct_root.join("docs/a.md"), b"AA").unwrap(); + std::fs::write(acct_root.join("docs/b.md"), b"BB").unwrap(); + let vfs: Arc = + Arc::new(LocalFileSystem::new(work_dir.path().to_str().unwrap()).unwrap()); + let svc = GitService::new(vfs, object_store, ref_store); + + let _ = make_commit(&svc, "acct", "main", "first").await; + + // Mutate one file, then commit with both an exact file path and + // its parent directory. 
+ std::fs::write(acct_root.join("docs/a.md"), b"AA2").unwrap(); + let resp = svc + .commit(req( + "acct", + "main", + "mixed", + Some(vec!["docs/a.md".into(), "docs".into()]), + )) + .await + .unwrap(); + match resp { + CommitResponse::Created { changed, .. } => { + assert_eq!(changed, 1, "only docs/a.md content changed"); + } + other => panic!("expected Created, got {other:?}"), + } + } + + // ── 9: show ──────────────────────────────────────────────────────── + #[tokio::test] + async fn test_show_commit_meta_by_oid() { + let (_dir, vfs, _object_store, _ref_store, svc) = make_service("acct"); + vfs.put("resources/a.md", b"hello"); + let oid = make_commit(&svc, "acct", "main", "first").await; + + let resp = svc + .show(ShowRequest { + account: "acct".into(), + target_ref: oid.to_hex().to_string(), + path: None, + }) + .await + .unwrap(); + + match resp { + ShowResponse::Commit { + oid: returned, + parents, + message, + author, + committer, + tree, + } => { + assert_eq!(returned, oid); + assert!(parents.is_empty(), "root commit"); + assert_eq!(message, "first"); + assert_eq!(author.name, "tester"); + assert_eq!(author.email, "tester@example.com"); + assert_eq!(committer.name, "tester"); + assert_ne!(tree, ObjectId::empty_tree(gix_hash::Kind::Sha1)); + } + other => panic!("expected Commit, got {other:?}"), + } + } + + // ── 10 ───────────────────────────────────────────────────────────── + #[tokio::test] + async fn test_show_resolves_branch_name_and_full_ref() { + let (_dir, vfs, _object_store, _ref_store, svc) = make_service("acct"); + vfs.put("resources/a.md", b"hello"); + let oid = make_commit(&svc, "acct", "main", "first").await; + + for tref in ["main", "refs/heads/main"] { + let resp = svc + .show(ShowRequest { + account: "acct".into(), + target_ref: tref.into(), + path: None, + }) + .await + .unwrap(); + match resp { + ShowResponse::Commit { oid: returned, .. 
} => assert_eq!(returned, oid), + other => panic!("{tref}: expected Commit, got {other:?}"), + } + } + } + + // ── 10b: abbreviated OID resolution ──────────────────────────────── + #[tokio::test] + async fn test_show_resolves_short_oid_unique() { + let (_dir, vfs, _object_store, _ref_store, svc) = make_service("acct"); + vfs.put("resources/a.md", b"hello"); + let oid = make_commit(&svc, "acct", "main", "first").await; + let full = oid.to_hex().to_string(); + + for len in [4usize, 7, 12, 39] { + let short = &full[..len]; + let resp = svc + .show(ShowRequest { + account: "acct".into(), + target_ref: short.into(), + path: None, + }) + .await + .unwrap_or_else(|e| { + panic!("short oid {short} (len {len}) should resolve, got {e}") + }); + match resp { + ShowResponse::Commit { oid: returned, .. } => assert_eq!(returned, oid), + other => panic!("len {len}: expected Commit, got {other:?}"), + } + } + } + + #[tokio::test] + async fn test_show_short_oid_not_found_distinguished_from_branch() { + let (_dir, vfs, _object_store, _ref_store, svc) = make_service("acct"); + vfs.put("resources/a.md", b"hello"); + let _ = make_commit(&svc, "acct", "main", "first").await; + + // A 4-hex string that almost-certainly does not match any commit. + // (SHA-1 collision against a single commit is astronomically unlikely + // for "ffff" — the first commit's hex is deterministic given the + // test's actor/time-zero, so this is a stable miss.) 
+ let bogus = "ffff"; + let err = svc + .show(ShowRequest { + account: "acct".into(), + target_ref: bogus.into(), + path: None, + }) + .await + .unwrap_err(); + assert!( + matches!(err, GitError::OidPrefixNotFound { ref prefix } if prefix == bogus), + "expected OidPrefixNotFound({bogus}), got {err:?}", + ); + } + + #[tokio::test] + async fn test_short_oid_three_chars_falls_through_to_ref_lookup() { + // 3 hex chars is below the 4-char floor for abbreviated OID; it + // should be treated as a branch name (which doesn't exist), giving + // a RefStore::NotFound error — NOT OidPrefixNotFound. + let (_dir, vfs, _object_store, _ref_store, svc) = make_service("acct"); + vfs.put("resources/a.md", b"hello"); + let _ = make_commit(&svc, "acct", "main", "first").await; + + let err = svc + .show(ShowRequest { + account: "acct".into(), + target_ref: "abc".into(), + path: None, + }) + .await + .unwrap_err(); + assert!( + matches!(err, GitError::RefStore(RefStoreError::NotFound(_))), + "expected RefStore::NotFound for 3-char input, got {err:?}", + ); + } + + // ── 11 ───────────────────────────────────────────────────────────── + #[tokio::test] + async fn test_show_blob_round_trip() { + let (_dir, vfs, _object_store, _ref_store, svc) = make_service("acct"); + let body = b"hello world\n"; + vfs.put("resources/a.md", body); + let _ = make_commit(&svc, "acct", "main", "first").await; + + let resp = svc + .show(ShowRequest { + account: "acct".into(), + target_ref: "main".into(), + path: Some("resources/a.md".into()), + }) + .await + .unwrap(); + + match resp { + ShowResponse::Blob { + bytes, + size, + oid: _, + } => { + assert_eq!(bytes.as_ref(), body); + assert_eq!(size, body.len() as u64); + } + other => panic!("expected Blob, got {other:?}"), + } + } + + // ── 12 ───────────────────────────────────────────────────────────── + #[tokio::test] + async fn test_show_blob_path_not_found() { + let (_dir, vfs, _object_store, _ref_store, svc) = make_service("acct"); + 
vfs.put("resources/a.md", b"x"); + let _ = make_commit(&svc, "acct", "main", "first").await; + + let err = svc + .show(ShowRequest { + account: "acct".into(), + target_ref: "main".into(), + path: Some("resources/missing.md".into()), + }) + .await + .unwrap_err(); + + match err { + GitError::PathNotFound(p) => assert_eq!(p, "resources/missing.md"), + other => panic!("expected PathNotFound, got {other:?}"), + } + } + + // ── 13 ───────────────────────────────────────────────────────────── + #[tokio::test] + async fn test_show_blob_rejects_directory_path() { + let (_dir, vfs, _object_store, _ref_store, svc) = make_service("acct"); + vfs.put("resources/a.md", b"x"); + let _ = make_commit(&svc, "acct", "main", "first").await; + + let err = svc + .show(ShowRequest { + account: "acct".into(), + target_ref: "main".into(), + path: Some("resources".into()), + }) + .await + .unwrap_err(); + + match err { + GitError::PathIsDirectory(p) => assert_eq!(p, "resources"), + other => panic!("expected PathIsDirectory, got {other:?}"), + } + } + + // ── 14 ───────────────────────────────────────────────────────────── + #[tokio::test] + async fn test_show_unknown_ref() { + let (_dir, _vfs, _object_store, _ref_store, svc) = make_service("acct"); + let err = svc + .show(ShowRequest { + account: "acct".into(), + target_ref: "nonexistent".into(), + path: None, + }) + .await + .unwrap_err(); + + match err { + GitError::RefStore(RefStoreError::NotFound(name)) => { + assert_eq!(name, "refs/heads/nonexistent"); + } + other => panic!("expected RefStore NotFound, got {other:?}"), + } + } + + // ── 15 ───────────────────────────────────────────────────────────── + #[tokio::test] + async fn test_show_malformed_oid_input() { + let (_dir, _vfs, _object_store, _ref_store, svc) = make_service("acct"); + let err = svc + .show(ShowRequest { + account: "acct".into(), + target_ref: "z".repeat(40), + path: None, + }) + .await + .unwrap_err(); + assert!(matches!(err, GitError::Other(_) | 
GitError::RefStore(_))); + } + + // ── 16 ───────────────────────────────────────────────────────────── + /// Blob bytes survive a round-trip even when they contain NUL bytes, + /// non-UTF-8 sequences, and multiple newlines. Guards against any + /// future "treat blobs as strings" regression. + #[tokio::test] + async fn test_show_blob_binary_and_multiline() { + let (_dir, vfs, _object_store, _ref_store, svc) = make_service("acct"); + // NUL, invalid UTF-8 (0xC3 0x28 is an invalid 2-byte sequence), CRLF, LF. + let body: Vec = vec![ + b'h', b'i', 0x00, 0xC3, 0x28, b'\r', b'\n', b'l', b'i', b'n', b'e', b'2', b'\n', 0xFF, + 0xFE, 0xFD, + ]; + vfs.put("resources/bin.dat", &body); + let _ = make_commit(&svc, "acct", "main", "first").await; + + let resp = svc + .show(ShowRequest { + account: "acct".into(), + target_ref: "main".into(), + path: Some("resources/bin.dat".into()), + }) + .await + .unwrap(); + + match resp { + ShowResponse::Blob { bytes, size, .. } => { + assert_eq!(bytes, body); + assert_eq!(size as usize, body.len()); + } + other => panic!("expected Blob, got {other:?}"), + } + } + + // ── 17 ───────────────────────────────────────────────────────────── + /// Construct a commit whose author and committer differ, write it + /// directly via `util::write_object`, point a ref at it, and verify + /// `show()` decodes the two signatures into the two Actor fields + /// without crossing them. Bypasses `commit()` because the public + /// `CommitRequest` API only accepts one author (used for both). + #[tokio::test] + async fn test_show_distinguishes_committer_from_author() { + use gix_object::{bstr::BString, Commit, WriteTo}; + + let (_dir, vfs, object_store, ref_store, svc) = make_service("acct"); + vfs.put("resources/a.md", b"x"); + // First, create a normal commit just to get a real tree OID. 
+ let seed_oid = make_commit(&svc, "acct", "main", "seed").await; + let seed_tree = + load_commit_meta(object_store.as_ref() as &dyn ObjectStore, "acct", &seed_oid) + .await + .unwrap() + .tree; + + // Build a commit with deliberately mismatched author/committer. + let author = gix_actor::Signature { + name: "Alice Author".into(), + email: "alice@example.com".into(), + time: gix_date::Time { + seconds: 1_700_000_000, + offset: 3600, + sign: gix_date::time::Sign::Plus, + }, + }; + let committer = gix_actor::Signature { + name: "Carol Committer".into(), + email: "carol@example.com".into(), + time: gix_date::Time { + seconds: 1_700_000_100, + offset: -7200, + sign: gix_date::time::Sign::Minus, + }, + }; + let commit = Commit { + tree: seed_tree, + parents: Vec::new().into(), + author, + committer, + encoding: None, + message: BString::from("split-actors"), + extra_headers: Vec::new(), + }; + let mut buf = Vec::new(); + commit.write_to(&mut buf).unwrap(); + let oid = crate::git::util::write_object( + object_store.as_ref() as &dyn ObjectStore, + "acct", + gix_object::Kind::Commit, + &buf, + ) + .await + .unwrap(); + + // Point a fresh branch at it so show() can find it by name. + ref_store + .cas_update("acct", "refs/heads/split", None, oid) + .await + .unwrap(); + + let resp = svc + .show(ShowRequest { + account: "acct".into(), + target_ref: "split".into(), + path: None, + }) + .await + .unwrap(); + + match resp { + ShowResponse::Commit { + author, committer, .. 
+ } => { + assert_eq!(author.name, "Alice Author"); + assert_eq!(author.email, "alice@example.com"); + assert_eq!(author.time_seconds, 1_700_000_000); + assert_eq!(author.tz_offset_seconds, 3600); + assert_eq!(committer.name, "Carol Committer"); + assert_eq!(committer.email, "carol@example.com"); + assert_eq!(committer.time_seconds, 1_700_000_100); + assert_eq!(committer.tz_offset_seconds, -7200); + } + other => panic!("expected Commit, got {other:?}"), + } + } + + // ── 18 ───────────────────────────────────────────────────────────── + /// When an intermediate path component is a blob (not a tree), + /// `tree_builder::lookup` returns `Ok(None)`, which `show()` maps + /// to `PathNotFound`. Pin this so a future change can't silently + /// reinterpret it as `PathIsDirectory` or `CorruptedObject`. + #[tokio::test] + async fn test_show_intermediate_path_component_is_blob() { + let (_dir, vfs, _object_store, _ref_store, svc) = make_service("acct"); + vfs.put("resources/a.md", b"x"); + let _ = make_commit(&svc, "acct", "main", "first").await; + + let err = svc + .show(ShowRequest { + account: "acct".into(), + target_ref: "main".into(), + path: Some("resources/a.md/oops".into()), + }) + .await + .unwrap_err(); + + match err { + GitError::PathNotFound(p) => assert_eq!(p, "resources/a.md/oops"), + other => panic!("expected PathNotFound, got {other:?}"), + } + } + + // ── 19 ───────────────────────────────────────────────────────────── + /// `show()` validates the `path` argument up front. Empty string, a + /// leading or trailing `/`, and embedded `//` all fail + /// `validate_relative_path` before any tree lookup runs — callers see + /// `InvalidPath` rather than mixed `Other` / `PathNotFound` results. + /// This pins the contract guarding the native binding boundary against + /// traversal-style input (`..`, `/abs`, …) being silently accepted. 
+ #[tokio::test] + async fn test_show_path_with_invalid_form() { + let (_dir, vfs, _object_store, _ref_store, svc) = make_service("acct"); + vfs.put("resources/a.md", b"x"); + let _ = make_commit(&svc, "acct", "main", "first").await; + + for bad in ["", "/x", "x/", "a//b"] { + let err = svc + .show(ShowRequest { + account: "acct".into(), + target_ref: "main".into(), + path: Some(bad.into()), + }) + .await + .unwrap_err(); + assert!( + matches!(err, GitError::InvalidPath(_)), + "path {bad:?}: expected InvalidPath, got {err:?}", + ); + } + } + + // ── 20 ───────────────────────────────────────────────────────────── + /// If the commit's loose object file is removed from the store + /// after the ref still points at it, `show()` must surface + /// `ObjectStoreError::NotFound` (wrapped in `GitError::ObjectStore`). + /// Guards against any future "swallow missing objects" regression + /// inside `load_commit_meta`. + #[tokio::test] + async fn test_show_commit_object_missing_from_store() { + let (dir, vfs, _object_store, _ref_store, svc) = make_service("acct"); + vfs.put("resources/a.md", b"x"); + let oid = make_commit(&svc, "acct", "main", "first").await; + + // LocalObjectStore layout: {base_dir}/{account}/objects/{aa}/{bb...} + let hex = oid.to_hex().to_string(); + let path = dir + .path() + .join("acct") + .join("objects") + .join(&hex[..2]) + .join(&hex[2..]); + std::fs::remove_file(&path).expect("loose commit object must exist before removal"); + + let err = svc + .show(ShowRequest { + account: "acct".into(), + target_ref: "main".into(), + path: None, + }) + .await + .unwrap_err(); + + match err { + GitError::ObjectStore(ObjectStoreError::NotFound(missing)) => { + assert_eq!(missing, oid); + } + other => panic!("expected ObjectStore(NotFound), got {other:?}"), + } + } + + #[tokio::test] + async fn test_mock_vfs_write_then_read_round_trip() { + let vfs = MockVfs::new("acct"); + let path = "/local/acct/x.md"; + vfs.files + .lock() + .unwrap() + 
.insert(path.to_string(), Vec::new()); + FileSystem::write(vfs.as_ref(), path, b"hello", 0, WriteFlag::Create) + .await + .unwrap(); + let got = FileSystem::read(vfs.as_ref(), path, 0, 0).await.unwrap(); + assert_eq!(got, b"hello"); + FileSystem::remove(vfs.as_ref(), path).await.unwrap(); + let err = FileSystem::read(vfs.as_ref(), path, 0, 0) + .await + .unwrap_err(); + assert!(matches!(err, Error::NotFound(_))); + } + + // ── restore: dry_run ─────────────────────────────────────────────── + #[tokio::test] + async fn test_restore_dry_run_reports_diff_and_writes_nothing() { + let (_dir, vfs, object_store, ref_store, svc) = make_service("acct"); + // Source state: resources/proj_a has files a.md, b.md + vfs.put("resources/proj_a/a.md", b"A v1"); + vfs.put("resources/proj_a/b.md", b"B v1"); + let source_oid = make_commit(&svc, "acct", "main", "source").await; + + // HEAD state: a.md is rewritten, b.md is deleted, c.md is created. + // We pass explicit paths (including the deleted b.md) so commit() + // sees the tombstone — collect_all() only enumerates surviving files. + vfs.put("resources/proj_a/a.md", b"A v2"); + vfs.delete("resources/proj_a/b.md"); + vfs.put("resources/proj_a/c.md", b"C new"); + let head_oid = match svc + .commit(req( + "acct", + "main", + "head", + Some(vec![ + "resources/proj_a/a.md".to_string(), + "resources/proj_a/b.md".to_string(), + "resources/proj_a/c.md".to_string(), + ]), + )) + .await + .unwrap() + { + CommitResponse::Created { commit_oid, .. 
} => commit_oid, + other => panic!("expected Created, got {other:?}"), + }; + + let resp = svc + .restore(RestoreRequest { + account: "acct".into(), + branch: "main".into(), + project_dir: Some("resources/proj_a".into()), + source_commit: source_oid.to_hex().to_string(), + dry_run: true, + message: None, + author_name: "tester".into(), + author_email: "tester@example.com".into(), + }) + .await + .unwrap(); + + match resp { + RestoreResponse::DryRun { diff, head, source } => { + assert_eq!(source, source_oid); + assert_eq!(head, head_oid); + // a.md needs to roll back to v1, b.md needs to come back, + // c.md needs to go away. Sorted alphabetically by path. + assert_eq!(diff.to_write.len(), 2); + assert_eq!(diff.to_write[0].0, "a.md"); + assert_eq!(diff.to_write[1].0, "b.md"); + assert_eq!(diff.to_delete, vec!["c.md".to_string()]); + assert!(diff.unchanged.is_empty()); + } + other => panic!("expected DryRun, got {other:?}"), + } + + // CRITICAL: dry_run wrote nothing through the VFS — c.md and the v2 + // version of a.md must still be visible on disk. + let files = vfs.files.lock().unwrap(); + assert_eq!( + files.get("/local/acct/resources/proj_a/a.md").unwrap(), + b"A v2", + "dry_run must not overwrite a.md", + ); + assert!( + files.contains_key("/local/acct/resources/proj_a/c.md"), + "dry_run must not delete c.md", + ); + // Branch ref must still point at head_oid. 
+ let head_after = ref_store.read("acct", "refs/heads/main").await.unwrap(); + assert_eq!(head_after, head_oid); + let _ = object_store; // silence unused warning + } + + // ── restore: apply ───────────────────────────────────────────────── + #[tokio::test] + async fn test_restore_apply_writes_new_commit_with_head_as_parent() { + let (_dir, vfs, object_store, ref_store, svc) = make_service("acct"); + vfs.put("resources/proj_a/a.md", b"A v1"); + vfs.put("resources/proj_a/b.md", b"B v1"); + let source_oid = make_commit(&svc, "acct", "main", "source").await; + + vfs.put("resources/proj_a/a.md", b"A v2"); + vfs.delete("resources/proj_a/b.md"); + vfs.put("resources/proj_a/c.md", b"C new"); + // IMPORTANT: use explicit paths so the deletion of b.md is captured + let head_oid = match svc + .commit(req( + "acct", + "main", + "head", + Some(vec![ + "resources/proj_a/a.md".to_string(), + "resources/proj_a/b.md".to_string(), + "resources/proj_a/c.md".to_string(), + ]), + )) + .await + .unwrap() + { + CommitResponse::Created { commit_oid, .. 
} => commit_oid, + other => panic!("expected Created, got {other:?}"), + }; + + let resp = svc + .restore(RestoreRequest { + account: "acct".into(), + branch: "main".into(), + project_dir: Some("resources/proj_a".into()), + source_commit: source_oid.to_hex().to_string(), + dry_run: false, + message: Some("rewind proj_a".into()), + author_name: "tester".into(), + author_email: "tester@example.com".into(), + }) + .await + .unwrap(); + + let new_oid = match resp { + RestoreResponse::Applied { + new_commit_oid, + source_commit, + parent_commit, + written, + deleted, + unchanged, + written_paths, + deleted_paths, + } => { + assert_eq!(source_commit, source_oid); + assert_eq!(parent_commit, head_oid, "parent MUST be HEAD, NOT source"); + assert_eq!(written, 2, "a.md (rewrite) + b.md (recreate) = 2"); + assert_eq!(deleted, 1, "c.md"); + assert_eq!(unchanged, 0); + assert_eq!(written_paths.len(), 2); + assert_eq!(deleted_paths.len(), 1); + // Paths should be account-relative (project_dir-prefixed). + for p in &written_paths { + assert!( + p.starts_with("resources/proj_a/"), + "written path missing project_dir prefix: {p}" + ); + } + for p in &deleted_paths { + assert!( + p.starts_with("resources/proj_a/"), + "deleted path missing project_dir prefix: {p}" + ); + } + new_commit_oid + } + other => panic!("expected Applied, got {other:?}"), + }; + + // Ref now points at new_oid. + assert_eq!( + ref_store.read("acct", "refs/heads/main").await.unwrap(), + new_oid + ); + // New commit's parents = [head_oid] (NOT source_oid — this is the key + // invariant of restore vs. plain checkout). + let parents = + commit_parents(object_store.as_ref() as &dyn ObjectStore, "acct", new_oid).await; + assert_eq!(parents, vec![head_oid]); + + // VFS rolled back as expected. 
+ let files = vfs.files.lock().unwrap(); + assert_eq!( + files.get("/local/acct/resources/proj_a/a.md").unwrap(), + b"A v1", + "a.md rolled back", + ); + assert_eq!( + files.get("/local/acct/resources/proj_a/b.md").unwrap(), + b"B v1", + "b.md restored", + ); + assert!( + !files.contains_key("/local/acct/resources/proj_a/c.md"), + "c.md deleted", + ); + } + + // Partial-writeback regression suite. Before this fix, a single failed + // `FileSystem::write` (or non-NotFound `remove`) during step 10 would let + // the ref keep pointing at the new commit while `try_collect` short- + // circuited the rest of the writeback. The caller then saw a generic + // RuntimeError and never scheduled reindex, leaving HEAD and the working + // tree (and any vector index) inconsistent. These tests pin the new + // behavior: every per-path op runs to completion and partial failures + // surface as a structured `GitError::RestoreWritebackPartial`. + + /// A forced write failure for one path must produce + /// `GitError::RestoreWritebackPartial` whose payload still reports the + /// other writes/deletes as succeeded so the caller can reindex them. + #[tokio::test] + async fn test_restore_writeback_partial_returns_partial_error_on_write_failure() { + let (_dir, vfs, _object_store, ref_store, svc) = make_service("acct"); + // Source: two files at v1. + vfs.put("resources/proj_a/a.md", b"A v1"); + vfs.put("resources/proj_a/b.md", b"B v1"); + let source_oid = make_commit(&svc, "acct", "main", "source").await; + + // HEAD diverges: a.md updated, b.md updated, c.md added. + vfs.put("resources/proj_a/a.md", b"A v2"); + vfs.put("resources/proj_a/b.md", b"B v2"); + vfs.put("resources/proj_a/c.md", b"C new"); + let _head_oid = match svc + .commit(req( + "acct", + "main", + "head", + Some(vec![ + "resources/proj_a/a.md".to_string(), + "resources/proj_a/b.md".to_string(), + "resources/proj_a/c.md".to_string(), + ]), + )) + .await + .unwrap() + { + CommitResponse::Created { commit_oid, .. 
} => commit_oid, + other => panic!("expected Created, got {other:?}"), + }; + + // Force restore's writeback of a.md to fail. The diff also rewrites + // b.md (success) and deletes c.md (success), so we expect a partial + // error with exactly one failed write and the other operations + // reported under the success buckets. + vfs.fail_write("resources/proj_a/a.md"); + + let err = svc + .restore(RestoreRequest { + account: "acct".into(), + branch: "main".into(), + project_dir: Some("resources/proj_a".into()), + source_commit: source_oid.to_hex().to_string(), + dry_run: false, + message: Some("rewind partial".into()), + author_name: "tester".into(), + author_email: "tester@example.com".into(), + }) + .await + .expect_err("restore must surface partial failure"); + + let partial = match err { + GitError::RestoreWritebackPartial(p) => p, + other => panic!("expected RestoreWritebackPartial, got {other:?}"), + }; + + // Ref already advanced — partial must report the new HEAD so the + // caller knows the commit is durable even though writeback failed. + let head_after = ref_store.read("acct", "refs/heads/main").await.unwrap(); + assert_eq!(partial.new_commit_oid, head_after); + + assert_eq!(partial.failed_writes.len(), 1, "exactly one write failed"); + assert_eq!(partial.failed_writes[0].0, "resources/proj_a/a.md"); + assert!( + !partial.failed_writes[0].1.is_empty(), + "failure entry must carry a message" + ); + assert!(partial.failed_deletes.is_empty(), "no deletes should fail"); + + // The other write (b.md) succeeded and so must show up under + // written_paths; c.md was deleted and lands under deleted_paths. + assert_eq!(partial.written_paths, vec!["resources/proj_a/b.md"]); + assert_eq!(partial.deleted_paths, vec!["resources/proj_a/c.md"]); + assert_eq!(partial.written, 1); + assert_eq!(partial.deleted, 1); + } + + /// With two forced write failures we must still collect *both* — the + /// stream must not short-circuit after the first one. 
This is the + /// behavior change relative to the old `try_collect`. + #[tokio::test] + async fn test_restore_writeback_partial_continues_after_failure() { + let (_dir, vfs, _object_store, _ref_store, svc) = make_service("acct"); + vfs.put("resources/proj_a/a.md", b"A v1"); + vfs.put("resources/proj_a/b.md", b"B v1"); + vfs.put("resources/proj_a/c.md", b"C v1"); + let source_oid = make_commit(&svc, "acct", "main", "source").await; + + vfs.put("resources/proj_a/a.md", b"A v2"); + vfs.put("resources/proj_a/b.md", b"B v2"); + vfs.put("resources/proj_a/c.md", b"C v2"); + let _head_oid = match svc + .commit(req( + "acct", + "main", + "head", + Some(vec![ + "resources/proj_a/a.md".to_string(), + "resources/proj_a/b.md".to_string(), + "resources/proj_a/c.md".to_string(), + ]), + )) + .await + .unwrap() + { + CommitResponse::Created { commit_oid, .. } => commit_oid, + other => panic!("expected Created, got {other:?}"), + }; + + // Two of the three writes fail. + vfs.fail_write("resources/proj_a/a.md"); + vfs.fail_write("resources/proj_a/c.md"); + + let err = svc + .restore(RestoreRequest { + account: "acct".into(), + branch: "main".into(), + project_dir: Some("resources/proj_a".into()), + source_commit: source_oid.to_hex().to_string(), + dry_run: false, + message: None, + author_name: "tester".into(), + author_email: "tester@example.com".into(), + }) + .await + .expect_err("partial expected"); + + let partial = match err { + GitError::RestoreWritebackPartial(p) => p, + other => panic!("expected RestoreWritebackPartial, got {other:?}"), + }; + assert_eq!( + partial.failed_writes.len(), + 2, + "stream must not short-circuit on the first failure" + ); + let mut failed: Vec = partial + .failed_writes + .iter() + .map(|(p, _)| p.clone()) + .collect(); + failed.sort(); + assert_eq!( + failed, + vec![ + "resources/proj_a/a.md".to_string(), + "resources/proj_a/c.md".to_string(), + ] + ); + // b.md still rolled back. 
+ assert_eq!(partial.written_paths, vec!["resources/proj_a/b.md"]); + } + + /// A forced delete failure (non-NotFound) must surface in + /// `failed_deletes` without aborting the rest of the stream. + #[tokio::test] + async fn test_restore_delete_failure_does_not_short_circuit() { + let (_dir, vfs, _object_store, _ref_store, svc) = make_service("acct"); + // Source has only a.md. + vfs.put("resources/proj_a/a.md", b"A v1"); + let source_oid = make_commit(&svc, "acct", "main", "source").await; + + // HEAD adds b.md and c.md — restore must delete both. + vfs.put("resources/proj_a/b.md", b"B new"); + vfs.put("resources/proj_a/c.md", b"C new"); + let _head_oid = match svc + .commit(req( + "acct", + "main", + "head", + Some(vec![ + "resources/proj_a/a.md".to_string(), + "resources/proj_a/b.md".to_string(), + "resources/proj_a/c.md".to_string(), + ]), + )) + .await + .unwrap() + { + CommitResponse::Created { commit_oid, .. } => commit_oid, + other => panic!("expected Created, got {other:?}"), + }; + + // Force b.md's delete to fail; c.md must still be deleted. + vfs.fail_remove("resources/proj_a/b.md"); + + let err = svc + .restore(RestoreRequest { + account: "acct".into(), + branch: "main".into(), + project_dir: Some("resources/proj_a".into()), + source_commit: source_oid.to_hex().to_string(), + dry_run: false, + message: None, + author_name: "tester".into(), + author_email: "tester@example.com".into(), + }) + .await + .expect_err("partial expected"); + + let partial = match err { + GitError::RestoreWritebackPartial(p) => p, + other => panic!("expected RestoreWritebackPartial, got {other:?}"), + }; + assert_eq!(partial.failed_deletes.len(), 1); + assert_eq!(partial.failed_deletes[0].0, "resources/proj_a/b.md"); + assert_eq!(partial.deleted_paths, vec!["resources/proj_a/c.md"]); + assert!(partial.failed_writes.is_empty()); + } + + /// `Error::NotFound` from `remove` is idempotent (the path was already + /// gone) and must NOT count as a failure. 
With strict_remove enabled + /// and a delete target that is already missing, restore must still + /// return `Applied`. + #[tokio::test] + async fn test_restore_delete_notfound_not_counted_as_failure() { + // Use new_strict_remove so absent paths produce NotFound rather than + // silently succeeding — that's what the real LocalFileSystem does. + let dir = tempfile::tempdir().unwrap(); + let object_store = Arc::new(LocalObjectStore::new(dir.path())); + let ref_store = Arc::new(LocalRefStore::new(dir.path())); + let vfs = MockVfs::new_strict_remove("acct"); + let svc = GitService::new( + vfs.clone() as Arc, + object_store.clone() as Arc, + ref_store.clone() as Arc, + ); + + vfs.put("resources/proj_a/a.md", b"A v1"); + let source_oid = make_commit(&svc, "acct", "main", "source").await; + + vfs.put("resources/proj_a/b.md", b"B new"); + let _head_oid = match svc + .commit(req( + "acct", + "main", + "head", + Some(vec![ + "resources/proj_a/a.md".to_string(), + "resources/proj_a/b.md".to_string(), + ]), + )) + .await + .unwrap() + { + CommitResponse::Created { commit_oid, .. } => commit_oid, + other => panic!("expected Created, got {other:?}"), + }; + + // Out-of-band: remove b.md from the VFS so the diff's delete plan + // hits the NotFound path. Restore must still return Applied. + vfs.delete("resources/proj_a/b.md"); + + let resp = svc + .restore(RestoreRequest { + account: "acct".into(), + branch: "main".into(), + project_dir: Some("resources/proj_a".into()), + source_commit: source_oid.to_hex().to_string(), + dry_run: false, + message: None, + author_name: "tester".into(), + author_email: "tester@example.com".into(), + }) + .await + .expect("idempotent delete must stay on the Applied path"); + + match resp { + RestoreResponse::Applied { + deleted_paths, + deleted, + .. 
+ } => { + assert_eq!( + deleted, 1, + "b.md counts as deleted even though already gone" + ); + assert_eq!(deleted_paths, vec!["resources/proj_a/b.md"]); + } + other => panic!("expected Applied, got {other:?}"), + } + } + + #[tokio::test] + async fn test_restore_full_tree_apply_replaces_account_tree() { + let (_dir, vfs, object_store, ref_store, svc) = make_service("acct"); + vfs.put("a.md", b"A v1"); + vfs.put("b.md", b"B v1"); + let source_oid = make_commit(&svc, "acct", "main", "source").await; + + vfs.put("a.md", b"A v2"); + vfs.put("c.md", b"C new at head"); + vfs.delete("b.md"); + let head_oid = match svc + .commit(req( + "acct", + "main", + "head", + Some(vec![ + "a.md".to_string(), + "c.md".to_string(), + "b.md".to_string(), + ]), + )) + .await + .unwrap() + { + CommitResponse::Created { commit_oid, .. } => commit_oid, + other => panic!("expected Created, got {other:?}"), + }; + + let resp = svc + .restore(RestoreRequest { + account: "acct".into(), + branch: "main".into(), + project_dir: None, + source_commit: source_oid.to_hex().to_string(), + dry_run: false, + message: None, + author_name: "tester".into(), + author_email: "tester@example.com".into(), + }) + .await + .unwrap(); + + let new_oid = match resp { + RestoreResponse::Applied { + new_commit_oid, + source_commit, + parent_commit, + written, + deleted, + unchanged, + written_paths, + deleted_paths, + } => { + assert_eq!(source_commit, source_oid); + assert_eq!(parent_commit, head_oid); + assert_eq!(written, 2); + assert_eq!(deleted, 1); + assert_eq!(unchanged, 0); + assert_eq!(written_paths, vec!["a.md".to_string(), "b.md".to_string()]); + assert_eq!(deleted_paths, vec!["c.md".to_string()]); + new_commit_oid + } + other => panic!("expected Applied, got {other:?}"), + }; + + assert_eq!( + ref_store.read("acct", "refs/heads/main").await.unwrap(), + new_oid + ); + let parents = + commit_parents(object_store.as_ref() as &dyn ObjectStore, "acct", new_oid).await; + assert_eq!(parents, vec![head_oid]); + + 
let files = vfs.files.lock().unwrap(); + assert_eq!(files.get("/local/acct/a.md").unwrap(), b"A v1"); + assert_eq!(files.get("/local/acct/b.md").unwrap(), b"B v1"); + assert!(!files.contains_key("/local/acct/c.md")); + } + + // Regression: restoring to a revision where a whole subdirectory's files + // are gone must not leave an empty directory husk behind. Git does not + // track directories, so the delete diff only lists files — restore is + // responsible for pruning directories emptied by those deletes. + // + // Backed by a real `LocalFileSystem`: the in-memory `MockVfs` models + // directories implicitly (deleting the last file makes the dir vanish for + // free) and so cannot reproduce the husk. LocalFS keeps the directory on + // disk, exactly like production, which is what makes this test meaningful. + #[tokio::test] + async fn test_restore_prunes_directories_emptied_by_delete() { + use crate::plugins::localfs::LocalFileSystem; + + let store_dir = tempfile::tempdir().unwrap(); + let object_store = Arc::new(LocalObjectStore::new(store_dir.path())); + let ref_store = Arc::new(LocalRefStore::new(store_dir.path())); + + // Working tree root: /local/acct lives under this temp dir. + let work_dir = tempfile::tempdir().unwrap(); + let acct_root = work_dir.path().join("local").join("acct"); + std::fs::create_dir_all(&acct_root).unwrap(); + let vfs: Arc = + Arc::new(LocalFileSystem::new(work_dir.path().to_str().unwrap()).unwrap()); + + let svc = GitService::new(vfs.clone(), object_store.clone(), ref_store.clone()); + + // Source commit: keeper.md at the project root only. + std::fs::create_dir_all(acct_root.join("resources/proj_a")).unwrap(); + std::fs::write(acct_root.join("resources/proj_a/keeper.md"), b"keep").unwrap(); + let source_oid = make_commit(&svc, "acct", "main", "source").await; + + // HEAD adds a nested subdir whose only files restore will delete. 
+ std::fs::create_dir_all(acct_root.join("resources/proj_a/nested/deep")).unwrap(); + std::fs::write(acct_root.join("resources/proj_a/nested/x.md"), b"x").unwrap(); + std::fs::write(acct_root.join("resources/proj_a/nested/deep/y.md"), b"y").unwrap(); + let _head_oid = make_commit(&svc, "acct", "main", "head").await; + + svc.restore(RestoreRequest { + account: "acct".into(), + branch: "main".into(), + project_dir: Some("resources/proj_a".into()), + source_commit: source_oid.to_hex().to_string(), + dry_run: false, + message: Some("rewind".into()), + author_name: "tester".into(), + author_email: "tester@example.com".into(), + }) + .await + .unwrap(); + + // Files are gone, and so are the now-empty directories that held them + // (deepest first: deep/, then nested/). + assert!( + !acct_root.join("resources/proj_a/nested/deep/y.md").exists(), + "nested/deep/y.md must be deleted", + ); + assert!( + !acct_root.join("resources/proj_a/nested/x.md").exists(), + "nested/x.md must be deleted", + ); + assert!( + !acct_root.join("resources/proj_a/nested/deep").exists(), + "emptied directory nested/deep must be pruned", + ); + assert!( + !acct_root.join("resources/proj_a/nested").exists(), + "emptied directory nested must be pruned", + ); + // The surviving file and its (non-empty) parent are untouched. + assert!( + acct_root.join("resources/proj_a/keeper.md").exists(), + "keeper.md must survive", + ); + assert!( + acct_root.join("resources/proj_a").is_dir(), + "project_dir itself must remain (still holds keeper.md)", + ); + } + + // Regression: restoring to a source whose subtree contains files under a + // directory that HEAD removed entirely (e.g. `rm -r` recorded as a commit + // deletion) must recreate the missing directory chain before writing the + // blobs back. Before `ensure_parent_dirs` was added to the writeback, this + // aborted with `vfs: not found: .../resources/proj_a/nested`. 
+ #[tokio::test] + async fn test_restore_recreates_directory_removed_by_head() { + use crate::plugins::localfs::LocalFileSystem; + + let store_dir = tempfile::tempdir().unwrap(); + let object_store = Arc::new(LocalObjectStore::new(store_dir.path())); + let ref_store = Arc::new(LocalRefStore::new(store_dir.path())); + + let work_dir = tempfile::tempdir().unwrap(); + let acct_root = work_dir.path().join("local").join("acct"); + std::fs::create_dir_all(&acct_root).unwrap(); + let vfs: Arc = + Arc::new(LocalFileSystem::new(work_dir.path().to_str().unwrap()).unwrap()); + + let svc = GitService::new(vfs.clone(), object_store.clone(), ref_store.clone()); + + // Source commit: a nested directory with a file plus a top-level keeper. + std::fs::create_dir_all(acct_root.join("resources/proj_a/nested/deep")).unwrap(); + std::fs::write(acct_root.join("resources/proj_a/keeper.md"), b"keep").unwrap(); + std::fs::write(acct_root.join("resources/proj_a/nested/x.md"), b"x v1").unwrap(); + std::fs::write(acct_root.join("resources/proj_a/nested/deep/y.md"), b"y v1").unwrap(); + let source_oid = make_commit(&svc, "acct", "main", "source").await; + + // HEAD removes the whole `nested/` directory from disk (rm -r) and the + // full-enumeration commit records the deletion, so HEAD's tree has no + // `nested/` subtree at all. + std::fs::remove_dir_all(acct_root.join("resources/proj_a/nested")).unwrap(); + assert!(!acct_root.join("resources/proj_a/nested").exists()); + let _head_oid = make_commit(&svc, "acct", "main", "head").await; + + // Restore back to source: the writeback must recreate `nested/` and + // `nested/deep/` on disk before writing x.md / y.md. 
+ svc.restore(RestoreRequest { + account: "acct".into(), + branch: "main".into(), + project_dir: Some("resources/proj_a".into()), + source_commit: source_oid.to_hex().to_string(), + dry_run: false, + message: Some("rewind".into()), + author_name: "tester".into(), + author_email: "tester@example.com".into(), + }) + .await + .unwrap(); + + assert_eq!( + std::fs::read(acct_root.join("resources/proj_a/nested/x.md")).unwrap(), + b"x v1", + "nested/x.md must be recreated with v1 content", + ); + assert_eq!( + std::fs::read(acct_root.join("resources/proj_a/nested/deep/y.md")).unwrap(), + b"y v1", + "nested/deep/y.md must be recreated with v1 content", + ); + assert!( + acct_root.join("resources/proj_a/keeper.md").exists(), + "keeper.md must survive", + ); + } + + // Regression: a path the restore diff wants to delete may already be absent + // from the VFS (e.g. a derived file like `.abstract.md` removed out of + // band). The delete must be idempotent — restore should succeed and advance + // the branch ref rather than aborting with a `vfs: not found` error. + #[tokio::test] + async fn test_restore_tolerates_already_deleted_path() { + let dir = tempfile::tempdir().unwrap(); + let object_store = Arc::new(LocalObjectStore::new(dir.path())); + let ref_store = Arc::new(LocalRefStore::new(dir.path())); + let vfs = MockVfs::new_strict_remove("acct"); + let svc = GitService::new( + vfs.clone() as Arc, + object_store.clone() as Arc, + ref_store.clone() as Arc, + ); + + // Source commit: a.md only; the derived file the diff will later delete is added at HEAD. + vfs.put("resources/proj_a/a.md", b"A v1"); + let source_oid = make_commit(&svc, "acct", "main", "source").await; + + // HEAD adds the derived file, so restoring source wants to delete it. 
+ vfs.put("resources/proj_a/.abstract.md", b"derived"); + vfs.put("resources/proj_a/a.md", b"A v2"); + let head_oid = match svc + .commit(req( + "acct", + "main", + "head", + Some(vec![ + "resources/proj_a/a.md".to_string(), + "resources/proj_a/.abstract.md".to_string(), + ]), + )) + .await + .unwrap() + { + CommitResponse::Created { commit_oid, .. } => commit_oid, + other => panic!("expected Created, got {other:?}"), + }; + + // Simulate the derived file vanishing from the VFS out of band, so the + // restore's delete step hits a missing path. + vfs.delete("resources/proj_a/.abstract.md"); + + let resp = svc + .restore(RestoreRequest { + account: "acct".into(), + branch: "main".into(), + project_dir: Some("resources/proj_a".into()), + source_commit: source_oid.to_hex().to_string(), + dry_run: false, + message: Some("rewind".into()), + author_name: "tester".into(), + author_email: "tester@example.com".into(), + }) + .await + .expect("restore must tolerate an already-deleted path"); + + let new_oid = match resp { + RestoreResponse::Applied { + deleted, + deleted_paths, + new_commit_oid, + .. + } => { + // The diff still *plans* the delete (count is unchanged); the + // VFS apply just no-ops on the missing path. + assert_eq!(deleted, 1, ".abstract.md"); + assert_eq!(deleted_paths.len(), 1); + new_commit_oid + } + other => panic!("expected Applied, got {other:?}"), + }; + + // Branch ref advanced to the new commit on top of HEAD. 
+ assert_eq!( + ref_store.read("acct", "refs/heads/main").await.unwrap(), + new_oid + ); + let parents = + commit_parents(object_store.as_ref() as &dyn ObjectStore, "acct", new_oid).await; + assert_eq!(parents, vec![head_oid]); + } + + #[tokio::test] + async fn test_restore_noop_when_source_equals_head() { + let (_dir, vfs, _object_store, ref_store, svc) = make_service("acct"); + vfs.put("resources/proj_a/a.md", b"only file"); + let only_oid = make_commit(&svc, "acct", "main", "only").await; + + // No further changes to proj_a — restoring from `only_oid` is a noop. + let resp = svc + .restore(RestoreRequest { + account: "acct".into(), + branch: "main".into(), + project_dir: Some("resources/proj_a".into()), + source_commit: only_oid.to_hex().to_string(), + dry_run: false, + message: None, + author_name: "tester".into(), + author_email: "tester@example.com".into(), + }) + .await + .unwrap(); + + match resp { + RestoreResponse::Noop { head, source } => { + assert_eq!(head, only_oid); + assert_eq!(source, only_oid); + } + other => panic!("expected Noop, got {other:?}"), + } + // Ref unchanged. + assert_eq!( + ref_store.read("acct", "refs/heads/main").await.unwrap(), + only_oid + ); + } + + #[tokio::test] + async fn test_restore_full_tree_dry_run_reports_account_relative_diff() { + let (_dir, vfs, _object_store, ref_store, svc) = make_service("acct"); + vfs.put("resources/a.md", b"A v1"); + let source_oid = make_commit(&svc, "acct", "main", "source").await; + + vfs.put("resources/a.md", b"A v2"); + vfs.put("memory/new.md", b"new"); + let head_oid = match svc + .commit(req( + "acct", + "main", + "head", + Some(vec![ + "resources/a.md".to_string(), + "memory/new.md".to_string(), + ]), + )) + .await + .unwrap() + { + CommitResponse::Created { commit_oid, .. 
} => commit_oid, + other => panic!("expected Created, got {other:?}"), + }; + + let resp = svc + .restore(RestoreRequest { + account: "acct".into(), + branch: "main".into(), + project_dir: None, + source_commit: source_oid.to_hex().to_string(), + dry_run: true, + message: None, + author_name: "tester".into(), + author_email: "tester@example.com".into(), + }) + .await + .unwrap(); + + match resp { + RestoreResponse::DryRun { diff, head, source } => { + assert_eq!(head, head_oid); + assert_eq!(source, source_oid); + assert_eq!(diff.to_write.len(), 1); + assert_eq!(diff.to_write[0].0, "resources/a.md"); + assert_eq!(diff.to_delete, vec!["memory/new.md".to_string()]); + } + other => panic!("expected DryRun, got {other:?}"), + } + + let files = vfs.files.lock().unwrap(); + assert_eq!(files.get("/local/acct/resources/a.md").unwrap(), b"A v2"); + assert!(files.contains_key("/local/acct/memory/new.md")); + drop(files); + assert_eq!( + ref_store.read("acct", "refs/heads/main").await.unwrap(), + head_oid + ); + } + + #[tokio::test] + async fn test_restore_full_tree_noop_when_source_equals_head() { + let (_dir, vfs, _object_store, ref_store, svc) = make_service("acct"); + vfs.put("resources/a.md", b"A v1"); + let head_oid = make_commit(&svc, "acct", "main", "source").await; + + let resp = svc + .restore(RestoreRequest { + account: "acct".into(), + branch: "main".into(), + project_dir: None, + source_commit: head_oid.to_hex().to_string(), + dry_run: false, + message: None, + author_name: "tester".into(), + author_email: "tester@example.com".into(), + }) + .await + .unwrap(); + + match resp { + RestoreResponse::Noop { head, source } => { + assert_eq!(head, head_oid); + assert_eq!(source, head_oid); + } + other => panic!("expected Noop, got {other:?}"), + } + assert_eq!( + ref_store.read("acct", "refs/heads/main").await.unwrap(), + head_oid + ); + } + + #[tokio::test] + async fn test_restore_invalid_project_dir() { + let (_dir, _vfs, _object_store, _ref_store, svc) = 
make_service("acct"); + let err = svc + .restore(RestoreRequest { + account: "acct".into(), + branch: "main".into(), + project_dir: Some("".into()), // empty + source_commit: "main".into(), + dry_run: true, + message: None, + author_name: "x".into(), + author_email: "x@x".into(), + }) + .await + .unwrap_err(); + assert!(matches!(err, GitError::InvalidProjectDir(_))); + } + + #[tokio::test] + async fn test_restore_unknown_source_ref() { + let (_dir, vfs, _object_store, _ref_store, svc) = make_service("acct"); + vfs.put("resources/proj_a/a.md", b"x"); + let _ = make_commit(&svc, "acct", "main", "init").await; + let err = svc + .restore(RestoreRequest { + account: "acct".into(), + branch: "main".into(), + project_dir: Some("resources/proj_a".into()), + source_commit: "does-not-exist".into(), + dry_run: true, + message: None, + author_name: "x".into(), + author_email: "x@x".into(), + }) + .await + .unwrap_err(); + assert!(matches!( + err, + GitError::RefStore(RefStoreError::NotFound(_)) + )); + } + + #[tokio::test] + async fn test_restore_unknown_branch_head() { + let (_dir, vfs, _object_store, _ref_store, svc) = make_service("acct"); + vfs.put("resources/proj_a/a.md", b"x"); + let only = make_commit(&svc, "acct", "main", "only").await; + let err = svc + .restore(RestoreRequest { + account: "acct".into(), + branch: "ghost".into(), // doesn't exist + project_dir: Some("resources/proj_a".into()), + source_commit: only.to_hex().to_string(), + dry_run: true, + message: None, + author_name: "x".into(), + author_email: "x@x".into(), + }) + .await + .unwrap_err(); + assert!(matches!( + err, + GitError::RefStore(RefStoreError::NotFound(_)) + )); + } + + #[tokio::test] + async fn test_restore_project_dir_missing_in_source_commit() { + let (_dir, vfs, _object_store, _ref_store, svc) = make_service("acct"); + // Source commit only has resources/other_proj. 
+ vfs.put("resources/other_proj/x.md", b"x"); + let source_oid = make_commit(&svc, "acct", "main", "source").await; + // HEAD has the project we will try to restore. + vfs.put("resources/proj_a/a.md", b"a"); + let _ = make_commit(&svc, "acct", "main", "head").await; + + let err = svc + .restore(RestoreRequest { + account: "acct".into(), + branch: "main".into(), + project_dir: Some("resources/proj_a".into()), + source_commit: source_oid.to_hex().to_string(), + dry_run: true, + message: None, + author_name: "x".into(), + author_email: "x@x".into(), + }) + .await + .unwrap_err(); + match err { + GitError::SubtreeNotFoundInCommit { + project_dir, + commit, + } => { + assert_eq!(project_dir, "resources/proj_a"); + assert_eq!(commit, source_oid); + } + other => panic!("expected SubtreeNotFoundInCommit, got {other:?}"), + } + } + + #[tokio::test] + async fn test_restore_cas_conflict_surfaces_as_error() { + let dir = tempfile::tempdir().unwrap(); + let object_store = Arc::new(LocalObjectStore::new(dir.path())); + let inner_ref = Arc::new(LocalRefStore::new(dir.path())); + let vfs = MockVfs::new("acct"); + + // Build a real first commit through a plain service so we have a HEAD. + let bootstrap_svc = GitService::new( + vfs.clone() as Arc, + object_store.clone() as Arc, + inner_ref.clone() as Arc, + ); + vfs.put("resources/proj_a/a.md", b"v1"); + let source_oid = make_commit(&bootstrap_svc, "acct", "main", "source").await; + vfs.put("resources/proj_a/a.md", b"v2"); + let head_oid = make_commit(&bootstrap_svc, "acct", "main", "head").await; + + // Now wrap the ref store to force the first cas_update to fail. 
+ let bogus = ObjectId::from_hex(b"deadbeefdeadbeefdeadbeefdeadbeefdeadbeef").unwrap(); + let conflict_ref = Arc::new(ConflictOnceRef { + inner: inner_ref.clone(), + fired: Mutex::new(false), + actual: Some(bogus), + }); + let svc = GitService::new( + vfs.clone() as Arc, + object_store.clone() as Arc, + conflict_ref as Arc, + ); + + let err = svc + .restore(RestoreRequest { + account: "acct".into(), + branch: "main".into(), + project_dir: Some("resources/proj_a".into()), + source_commit: source_oid.to_hex().to_string(), + dry_run: false, + message: None, + author_name: "x".into(), + author_email: "x@x".into(), + }) + .await + .unwrap_err(); + match err { + GitError::ConcurrentCommit { + ref_name, + expected, + actual, + } => { + assert_eq!(ref_name, "refs/heads/main"); + assert_eq!(expected, Some(head_oid)); + assert_eq!(actual, Some(bogus)); + } + other => panic!("expected ConcurrentCommit, got {other:?}"), + } + } + + /// Regression: a losing CAS race during restore must leave the VFS + /// byte-identical to its pre-restore (HEAD) state. The ref-consistency + /// protocol now runs before any writeback, so a `ConcurrentCommit` error + /// implies zero working-tree mutations — neither `to_write` content nor + /// `to_delete` removals are applied. This keeps the failed request, the + /// on-disk working tree, and the caller's reindex decision consistent. + #[tokio::test] + async fn test_restore_cas_conflict_leaves_vfs_unchanged() { + let dir = tempfile::tempdir().unwrap(); + let object_store = Arc::new(LocalObjectStore::new(dir.path())); + let inner_ref = Arc::new(LocalRefStore::new(dir.path())); + let vfs = MockVfs::new("acct"); + + let bootstrap_svc = GitService::new( + vfs.clone() as Arc, + object_store.clone() as Arc, + inner_ref.clone() as Arc, + ); + + // Source commit: a single file under the project dir. 
+ vfs.put("resources/proj_a/a.md", b"v1"); + let source_oid = make_commit(&bootstrap_svc, "acct", "main", "source").await; + + // HEAD commit: a.md is modified (would be a `to_write`) and a brand new + // file is added (absent in source → would be a `to_delete` on restore). + vfs.put("resources/proj_a/a.md", b"v2"); + vfs.put("resources/proj_a/b.md", b"new"); + let head_oid = make_commit(&bootstrap_svc, "acct", "main", "head").await; + + // Snapshot the working tree exactly as it stands at HEAD. + let before = vfs.files.lock().unwrap().clone(); + + // Force the first cas_update to conflict. + let bogus = ObjectId::from_hex(b"deadbeefdeadbeefdeadbeefdeadbeefdeadbeef").unwrap(); + let conflict_ref = Arc::new(ConflictOnceRef { + inner: inner_ref.clone(), + fired: Mutex::new(false), + actual: Some(bogus), + }); + let svc = GitService::new( + vfs.clone() as Arc, + object_store.clone() as Arc, + conflict_ref as Arc, + ); + + let err = svc + .restore(RestoreRequest { + account: "acct".into(), + branch: "main".into(), + project_dir: Some("resources/proj_a".into()), + source_commit: source_oid.to_hex().to_string(), + dry_run: false, + message: None, + author_name: "x".into(), + author_email: "x@x".into(), + }) + .await + .unwrap_err(); + assert!( + matches!(err, GitError::ConcurrentCommit { .. }), + "expected ConcurrentCommit, got {err:?}" + ); + + // The working tree must be untouched: a.md still v2 (not rewritten to + // v1) and b.md still present (not deleted). + let after = vfs.files.lock().unwrap().clone(); + assert_eq!(after, before, "VFS must not change on a CAS conflict"); + assert_eq!( + after + .get("/local/acct/resources/proj_a/a.md") + .map(|v| v.as_slice()), + Some(b"v2".as_slice()), + "a.md must keep its HEAD content" + ); + assert!( + after.contains_key("/local/acct/resources/proj_a/b.md"), + "b.md must not be deleted" + ); + + // And HEAD must still point at the original commit. 
+ let head_now = inner_ref.read("acct", "refs/heads/main").await.unwrap(); + assert_eq!(head_now, head_oid, "branch ref must be unchanged"); + } + + #[tokio::test] + async fn test_restore_does_not_touch_paths_outside_project_dir() { + let (_dir, vfs, object_store, _ref_store, svc) = make_service("acct"); + + // Source: resources/proj_a + an UNRELATED file in another scope. + vfs.put("resources/proj_a/a.md", b"A v1"); + vfs.put("agent/skills/unrelated.py", b"unrelated v1"); + let source_oid = make_commit(&svc, "acct", "main", "source").await; + + // HEAD: modify proj_a AND the unrelated file. Note we don't delete + // anything in this test, so make_commit (which uses collect_all) is + // fine — all files still exist in the VFS. + vfs.put("resources/proj_a/a.md", b"A v2"); + vfs.put("agent/skills/unrelated.py", b"unrelated v2"); + vfs.put("agent/skills/new_skill.py", b"brand new"); + let _ = make_commit(&svc, "acct", "main", "head").await; + + let resp = svc + .restore(RestoreRequest { + account: "acct".into(), + branch: "main".into(), + project_dir: Some("resources/proj_a".into()), + source_commit: source_oid.to_hex().to_string(), + dry_run: false, + message: None, + author_name: "x".into(), + author_email: "x@x".into(), + }) + .await + .unwrap(); + + let new_oid = match resp { + RestoreResponse::Applied { new_commit_oid, .. } => new_commit_oid, + other => panic!("expected Applied, got {other:?}"), + }; + + // Verify the VFS: unrelated files keep their v2 / new state. + let files = vfs.files.lock().unwrap(); + assert_eq!( + files.get("/local/acct/agent/skills/unrelated.py").unwrap(), + b"unrelated v2", + "restore must NOT roll back unrelated.py", + ); + assert!( + files.contains_key("/local/acct/agent/skills/new_skill.py"), + "restore must NOT delete new_skill.py", + ); + // And proj_a/a.md DID roll back. 
+ assert_eq!( + files.get("/local/acct/resources/proj_a/a.md").unwrap(), + b"A v1", + ); + drop(files); + + // Verify the tree: the new commit's tree should contain the v2 content + // of unrelated.py and new_skill.py at their original oids. The easiest + // way: lookup the oid of agent/skills/unrelated.py in both source and + // new — they must DIFFER (source had v1, new still has v2). + let new_tree = + load_commit_meta(object_store.as_ref() as &dyn ObjectStore, "acct", &new_oid) + .await + .unwrap() + .tree; + let source_tree = load_commit_meta( + object_store.as_ref() as &dyn ObjectStore, + "acct", + &source_oid, + ) + .await + .unwrap() + .tree; + let unrelated_in_new = crate::git::tree_builder::lookup( + object_store.as_ref() as &dyn ObjectStore, + "acct", + new_tree, + "agent/skills/unrelated.py", + ) + .await + .unwrap() + .unwrap(); + let unrelated_in_source = crate::git::tree_builder::lookup( + object_store.as_ref() as &dyn ObjectStore, + "acct", + source_tree, + "agent/skills/unrelated.py", + ) + .await + .unwrap() + .unwrap(); + assert_ne!( + unrelated_in_new.0, unrelated_in_source.0, + "agent/skills/unrelated.py in the new tree must be HEAD's v2 oid, not source's v1 oid", + ); + assert!( + crate::git::tree_builder::lookup( + object_store.as_ref() as &dyn ObjectStore, + "acct", + new_tree, + "agent/skills/new_skill.py", + ) + .await + .unwrap() + .is_some(), + "new_skill.py must still be present in the new tree", + ); + } + + #[tokio::test] + async fn test_restore_then_show_reflects_old_content() { + let (_dir, vfs, _object_store, _ref_store, svc) = make_service("acct"); + + vfs.put("resources/proj_a/note.md", b"original"); + let src = make_commit(&svc, "acct", "main", "src").await; + + vfs.put("resources/proj_a/note.md", b"edited"); + let _ = make_commit(&svc, "acct", "main", "edit").await; + + // Sanity: show on HEAD shows "edited". 
+ let head_show = svc + .show(ShowRequest { + account: "acct".into(), + target_ref: "main".into(), + path: Some("resources/proj_a/note.md".into()), + }) + .await + .unwrap(); + match head_show { + ShowResponse::Blob { bytes, .. } => assert_eq!(bytes.as_ref(), b"edited"), + other => panic!("expected Blob, got {other:?}"), + } + + // Restore. + let new_oid = match svc + .restore(RestoreRequest { + account: "acct".into(), + branch: "main".into(), + project_dir: Some("resources/proj_a".into()), + source_commit: src.to_hex().to_string(), + dry_run: false, + message: Some("rewind".into()), + author_name: "x".into(), + author_email: "x@x".into(), + }) + .await + .unwrap() + { + RestoreResponse::Applied { new_commit_oid, .. } => new_commit_oid, + other => panic!("expected Applied, got {other:?}"), + }; + + // After restore: show on main should reflect the original content. + let after_show = svc + .show(ShowRequest { + account: "acct".into(), + target_ref: "main".into(), + path: Some("resources/proj_a/note.md".into()), + }) + .await + .unwrap(); + match after_show { + ShowResponse::Blob { bytes, .. } => assert_eq!(bytes.as_ref(), b"original"), + other => panic!("expected Blob, got {other:?}"), + } + + // And show on the new oid by hex resolves to the same content. + let by_oid = svc + .show(ShowRequest { + account: "acct".into(), + target_ref: new_oid.to_hex().to_string(), + path: Some("resources/proj_a/note.md".into()), + }) + .await + .unwrap(); + match by_oid { + ShowResponse::Blob { bytes, .. } => assert_eq!(bytes.as_ref(), b"original"), + other => panic!("expected Blob, got {other:?}"), + } + } + + // ── Fast Path 3: blob exists precheck ─────────────────────────────── + /// ObjectStore wrapper that counts `put` / `exists` calls, delegating to + /// an inner `LocalObjectStore`. 
+ struct CountingObjectStore { + inner: LocalObjectStore, + puts: std::sync::atomic::AtomicUsize, + exists_calls: std::sync::atomic::AtomicUsize, + } + + #[async_trait] + impl ObjectStore for CountingObjectStore { + async fn put( + &self, + account: &str, + oid: &ObjectId, + zlib_body: bytes::Bytes, + ) -> std::result::Result<(), ObjectStoreError> { + self.puts.fetch_add(1, std::sync::atomic::Ordering::SeqCst); + self.inner.put(account, oid, zlib_body).await + } + async fn get( + &self, + account: &str, + oid: &ObjectId, + ) -> std::result::Result { + self.inner.get(account, oid).await + } + async fn exists( + &self, + account: &str, + oid: &ObjectId, + ) -> std::result::Result { + self.exists_calls + .fetch_add(1, std::sync::atomic::Ordering::SeqCst); + self.inner.exists(account, oid).await + } + } + + #[tokio::test] + async fn test_commit_fast_path_3_skips_put_for_duplicate_blob() { + use std::sync::atomic::Ordering; + let dir = tempfile::tempdir().unwrap(); + let object_store = Arc::new(CountingObjectStore { + inner: LocalObjectStore::new(dir.path()), + puts: std::sync::atomic::AtomicUsize::new(0), + exists_calls: std::sync::atomic::AtomicUsize::new(0), + }); + let ref_store = Arc::new(LocalRefStore::new(dir.path())); + let vfs = MockVfs::new("acct"); + // No index store → slow path runs → Fast Path 3 active (default on). + let svc = GitService::new( + vfs.clone() as Arc, + object_store.clone() as Arc, + ref_store as Arc, + ); + + vfs.put("a.md", b"dup"); + svc.commit(req("acct", "main", "first", None)) + .await + .unwrap(); + + // Commit a second file with identical content → same blob oid. The + // blob `put` must be skipped (exists hit); only the new root tree and + // commit object are written (2 puts, no blob put). 
+ vfs.put("b.md", b"dup"); + let puts_before = object_store.puts.load(Ordering::SeqCst); + let exists_before = object_store.exists_calls.load(Ordering::SeqCst); + svc.commit(req("acct", "main", "second", Some(vec!["b.md".into()]))) + .await + .unwrap(); + let put_delta = object_store.puts.load(Ordering::SeqCst) - puts_before; + + // exists() was consulted on the second commit's slow path. + assert!(object_store.exists_calls.load(Ordering::SeqCst) > exists_before); + // Only tree + commit objects were put — the duplicate blob was skipped. + assert_eq!(put_delta, 2, "duplicate blob must not be re-put"); + } + + #[tokio::test] + async fn test_commit_fast_path_3_disabled_reputs_duplicate_blob() { + use std::sync::atomic::Ordering; + let dir = tempfile::tempdir().unwrap(); + let object_store = Arc::new(CountingObjectStore { + inner: LocalObjectStore::new(dir.path()), + puts: std::sync::atomic::AtomicUsize::new(0), + exists_calls: std::sync::atomic::AtomicUsize::new(0), + }); + let ref_store = Arc::new(LocalRefStore::new(dir.path())); + let vfs = MockVfs::new("acct"); + let svc = GitService::new( + vfs.clone() as Arc, + object_store.clone() as Arc, + ref_store as Arc, + ) + .with_blob_exists_precheck(false); + + vfs.put("a.md", b"dup"); + svc.commit(req("acct", "main", "first", None)) + .await + .unwrap(); + let exists_after_first = object_store.exists_calls.load(Ordering::SeqCst); + + vfs.put("b.md", b"dup"); + let puts_before = object_store.puts.load(Ordering::SeqCst); + svc.commit(req("acct", "main", "second", Some(vec!["b.md".into()]))) + .await + .unwrap(); + + // With precheck off, the slow path calls write_object unconditionally + // → at least one put for the dup blob (idempotent at the backend). + assert!(object_store.puts.load(Ordering::SeqCst) > puts_before); + // And no extra exists() calls were issued from the blob write path on + // the second commit (precheck disabled). 
The backend's own put + // idempotency uses try_exists internally, not ObjectStore::exists. + assert_eq!( + object_store.exists_calls.load(Ordering::SeqCst), + exists_after_first + ); + + // Result correctness: both files resolve to the same content. + let show_b = svc + .show(ShowRequest { + account: "acct".into(), + target_ref: "main".into(), + path: Some("b.md".into()), + }) + .await + .unwrap(); + match show_b { + ShowResponse::Blob { bytes, .. } => assert_eq!(bytes.as_ref(), b"dup"), + other => panic!("expected Blob, got {other:?}"), + } + } + + // ── prev_tree lookup cache: each ancestor tree loaded once per commit ── + /// ObjectStore wrapper that records every `get` oid so we can prove the + /// commit loop re-fetches each prev_tree subtree at most once. + struct GetSpyObjectStore { + inner: LocalObjectStore, + gets: std::sync::Mutex>, + } + + impl GetSpyObjectStore { + fn count_gets(&self, oid: &ObjectId) -> usize { + self.gets + .lock() + .unwrap() + .iter() + .filter(|o| *o == oid) + .count() + } + fn reset(&self) { + self.gets.lock().unwrap().clear(); + } + } + + #[async_trait] + impl ObjectStore for GetSpyObjectStore { + async fn put( + &self, + account: &str, + oid: &ObjectId, + zlib_body: bytes::Bytes, + ) -> std::result::Result<(), ObjectStoreError> { + self.inner.put(account, oid, zlib_body).await + } + async fn get( + &self, + account: &str, + oid: &ObjectId, + ) -> std::result::Result { + self.gets.lock().unwrap().push(*oid); + self.inner.get(account, oid).await + } + async fn exists( + &self, + account: &str, + oid: &ObjectId, + ) -> std::result::Result { + self.inner.exists(account, oid).await + } + } + + #[tokio::test] + async fn test_commit_prev_tree_lookup_cache_amortises_ancestors() { + // Build a prev tree where many candidates share the same depth-3 + // ancestor chain (root → resources → docs). Without the lookup cache, + // each candidate would re-fetch all three trees from object_store on + // its `lookup(prev_tree, ...)` call. 
With the cache, every ancestor + // along that chain is fetched at most once for the whole commit. + let dir = tempfile::tempdir().unwrap(); + let object_store = Arc::new(GetSpyObjectStore { + inner: LocalObjectStore::new(dir.path()), + gets: std::sync::Mutex::new(Vec::new()), + }); + let ref_store = Arc::new(LocalRefStore::new(dir.path())); + let vfs = MockVfs::new("acct"); + let svc = GitService::new( + vfs.clone() as Arc, + object_store.clone() as Arc, + ref_store as Arc, + ); + + // First commit: seed prev_tree with 5 files all under resources/docs/. + for name in ["a", "b", "c", "d", "e"] { + vfs.put(&format!("resources/docs/{}.md", name), name.as_bytes()); + } + svc.commit(req("acct", "main", "seed", None)).await.unwrap(); + + // Capture root tree oid and its ancestor chain to resources/docs. + let head_resp = svc + .show(ShowRequest { + account: "acct".into(), + target_ref: "main".into(), + path: None, + }) + .await + .unwrap(); + let root_tree_oid = match head_resp { + ShowResponse::Commit { tree, .. } => tree, + _ => panic!("expected commit"), + }; + // Resolve resources/docs tree oid by walking root once. + let mut cache = crate::git::tree_builder::TreeLookupCache::new(); + let (resources_oid, _) = crate::git::tree_builder::lookup_cached( + object_store.as_ref(), + "acct", + root_tree_oid, + "resources", + &mut cache, + ) + .await + .unwrap() + .unwrap(); + let (docs_oid, _) = crate::git::tree_builder::lookup_cached( + object_store.as_ref(), + "acct", + root_tree_oid, + "resources/docs", + &mut cache, + ) + .await + .unwrap() + .unwrap(); + + // Reset spy, then run a commit that touches all 5 candidates with the + // same content (so every Fast Path 1 miss path runs the prev lookup + // for the no-op skip check). The assertion proves each ancestor tree + // is fetched at most once across all 5 lookups. 
+ object_store.reset(); + let candidates: Vec = ["a", "b", "c", "d", "e"] + .iter() + .map(|n| format!("resources/docs/{}.md", n)) + .collect(); + svc.commit(req("acct", "main", "rewrite", Some(candidates))) + .await + .unwrap(); + + // Each ancestor on the prev_tree chain was fetched at most once. + assert!( + object_store.count_gets(&root_tree_oid) <= 1, + "root tree was fetched {} times, expected ≤1 (cache miss)", + object_store.count_gets(&root_tree_oid) + ); + assert!( + object_store.count_gets(&resources_oid) <= 1, + "resources tree was fetched {} times, expected ≤1 (cache miss)", + object_store.count_gets(&resources_oid) + ); + assert!( + object_store.count_gets(&docs_oid) <= 1, + "resources/docs tree was fetched {} times, expected ≤1 (cache miss)", + object_store.count_gets(&docs_oid) + ); + } + + // ── account id validation ─────────────────────────────────────────── + #[test] + fn validate_account_id_accepts_valid() { + for ok in ["acct", "a", "user-1", "u_2", "name.tag", "a@b", "ABC123"] { + assert!(validate_account_id(ok).is_ok(), "{ok:?} should be valid"); + } + } + + #[test] + fn validate_account_id_rejects_malicious() { + for bad in [ + "", // empty + ".", // dot + "..", // parent + "../x", // traversal + "a/b", // slash + "a\\b", // backslash + "a\0b", // NUL + "a\nb", // newline / control + "a b", // space + "a@b@c", // multiple @ + "_system", // leading underscore + ] { + assert!( + matches!(validate_account_id(bad), Err(GitError::InvalidAccountId(_))), + "{bad:?} should be rejected", + ); + } + } + + #[tokio::test] + async fn commit_rejects_traversal_account() { + // A crafted account must be rejected before any path is built, so the + // ref store is never even touched. 
+ let (_dir, _vfs, _object_store, _ref_store, svc) = make_service("acct"); + let err = svc.commit(req("../escape", "main", "msg", None)).await; + assert!(matches!(err, Err(GitError::InvalidAccountId(_)))); + } + + #[tokio::test] + async fn commit_rejects_slash_account() { + let (_dir, _vfs, _object_store, _ref_store, svc) = make_service("acct"); + let err = svc.commit(req("a/b", "main", "msg", None)).await; + assert!(matches!(err, Err(GitError::InvalidAccountId(_)))); + } + + #[tokio::test] + async fn show_rejects_traversal_account() { + let (_dir, _vfs, _object_store, _ref_store, svc) = make_service("acct"); + let err = svc + .show(ShowRequest { + account: "../escape".into(), + target_ref: "main".into(), + path: None, + }) + .await; + assert!(matches!(err, Err(GitError::InvalidAccountId(_)))); + } + + #[tokio::test] + async fn restore_rejects_traversal_account() { + let (_dir, _vfs, _object_store, _ref_store, svc) = make_service("acct"); + let err = svc + .restore(RestoreRequest { + account: "../escape".into(), + branch: "main".into(), + project_dir: Some("resources/x".into()), + source_commit: "deadbeef".into(), + dry_run: false, + message: None, + author_name: "n".into(), + author_email: "e".into(), + }) + .await; + assert!(matches!(err, Err(GitError::InvalidAccountId(_)))); + } + + // ── direct-binding traversal defence: account is OK, but the caller + // tries to escape via paths / project_dir / show path. These must be + // rejected before any VFS / object-store I/O. 
────────────────────── + #[tokio::test] + async fn commit_rejects_traversal_in_paths() { + let (_dir, _vfs, _object_store, _ref_store, svc) = make_service("acct"); + for bad in [ + "../other/file.md", + "a/../../other.md", + "/abs.md", + "a/./b.md", + "a\\b.md", + ] { + let err = svc + .commit(req("acct", "main", "msg", Some(vec![bad.to_string()]))) + .await; + assert!( + matches!(err, Err(GitError::InvalidPath(_))), + "{bad:?} should yield InvalidPath, got {err:?}", + ); + } + } + + #[tokio::test] + async fn show_rejects_traversal_path() { + let (_dir, _vfs, _object_store, _ref_store, svc) = make_service("acct"); + for bad in ["../other.md", ".", "..", "a/../b", ""] { + let err = svc + .show(ShowRequest { + account: "acct".into(), + target_ref: "main".into(), + path: Some(bad.to_string()), + }) + .await; + assert!( + matches!(err, Err(GitError::InvalidPath(_))), + "{bad:?} should yield InvalidPath, got {err:?}", + ); + } + } + + #[tokio::test] + async fn restore_rejects_traversal_project_dir() { + let (_dir, _vfs, _object_store, _ref_store, svc) = make_service("acct"); + for bad in ["../other", ".", "..", "a/../b", "a/./b", "a\\b"] { + let err = svc + .restore(RestoreRequest { + account: "acct".into(), + branch: "main".into(), + project_dir: Some(bad.to_string()), + source_commit: "deadbeef".into(), + dry_run: false, + message: None, + author_name: "n".into(), + author_email: "e".into(), + }) + .await; + assert!( + matches!(err, Err(GitError::InvalidProjectDir(_))), + "{bad:?} should yield InvalidProjectDir, got {err:?}", + ); + } + } +} + +#[cfg(test)] +mod diff_tests { + use super::*; + use crate::git::types::RestoreDiff; + use gix_hash::ObjectId; + + fn oid(byte: u8) -> ObjectId { + let mut bytes = [0u8; 20]; + bytes.fill(byte); + ObjectId::from_bytes_or_panic(&bytes) + } + + #[test] + fn diff_empty_both() { + let got = compute_subtree_diff(&[], &[]); + assert_eq!( + got, + RestoreDiff { + to_write: vec![], + to_delete: vec![], + unchanged: vec![] + } + ); + } 
+ + #[test] + fn diff_all_writes_when_head_empty() { + let source = vec![("a.md".to_string(), oid(0xAA))]; + let got = compute_subtree_diff(&source, &[]); + assert_eq!(got.to_write, vec![("a.md".to_string(), oid(0xAA))]); + assert!(got.to_delete.is_empty()); + assert!(got.unchanged.is_empty()); + } + + #[test] + fn diff_all_deletes_when_source_empty() { + let head = vec![("b.md".to_string(), oid(0xBB))]; + let got = compute_subtree_diff(&[], &head); + assert!(got.to_write.is_empty()); + assert_eq!(got.to_delete, vec!["b.md".to_string()]); + assert!(got.unchanged.is_empty()); + } + + #[test] + fn diff_unchanged_same_oid_same_path() { + let entries = vec![("a.md".to_string(), oid(0xCC))]; + let got = compute_subtree_diff(&entries, &entries); + assert!(got.to_write.is_empty()); + assert!(got.to_delete.is_empty()); + assert_eq!(got.unchanged, vec!["a.md".to_string()]); + } + + #[test] + fn diff_overwrite_when_same_path_different_oid() { + let source = vec![("a.md".to_string(), oid(0xAA))]; + let head = vec![("a.md".to_string(), oid(0xBB))]; + let got = compute_subtree_diff(&source, &head); + assert_eq!(got.to_write, vec![("a.md".to_string(), oid(0xAA))]); + assert!(got.to_delete.is_empty()); + assert!(got.unchanged.is_empty()); + } + + #[test] + fn diff_mixed_buckets_sorted_deterministically() { + let source = vec![ + ("keep.md".to_string(), oid(0x11)), + ("change.md".to_string(), oid(0x22)), + ("new.md".to_string(), oid(0x33)), + ]; + let head = vec![ + ("keep.md".to_string(), oid(0x11)), + ("change.md".to_string(), oid(0x99)), + ("gone.md".to_string(), oid(0x44)), + ]; + let got = compute_subtree_diff(&source, &head); + assert_eq!( + got.to_write, + vec![ + ("change.md".to_string(), oid(0x22)), + ("new.md".to_string(), oid(0x33)), + ] + ); + assert_eq!(got.to_delete, vec!["gone.md".to_string()]); + assert_eq!(got.unchanged, vec!["keep.md".to_string()]); + } + + #[test] + fn diff_handles_nested_paths() { + let source = vec![ + ("docs/a.md".to_string(), oid(0xAA)), + 
("docs/sub/b.md".to_string(), oid(0xBB)), + ]; + let head = vec![("docs/a.md".to_string(), oid(0xAA))]; + let got = compute_subtree_diff(&source, &head); + assert_eq!(got.to_write, vec![("docs/sub/b.md".to_string(), oid(0xBB))]); + assert!(got.to_delete.is_empty()); + assert_eq!(got.unchanged, vec!["docs/a.md".to_string()]); + } + + #[test] + fn validate_rejects_empty_string() { + let err = validate_project_dir("").unwrap_err(); + assert!(matches!(err, GitError::InvalidProjectDir(_))); + } + + #[test] + fn validate_rejects_leading_slash() { + assert!(matches!( + validate_project_dir("/resources/proj_a").unwrap_err(), + GitError::InvalidProjectDir(_) + )); + } + + #[test] + fn validate_rejects_trailing_slash() { + assert!(matches!( + validate_project_dir("resources/proj_a/").unwrap_err(), + GitError::InvalidProjectDir(_) + )); + } + + #[test] + fn validate_rejects_double_slash() { + assert!(matches!( + validate_project_dir("resources//proj_a").unwrap_err(), + GitError::InvalidProjectDir(_) + )); + } + + #[test] + fn validate_accepts_simple_path() { + validate_project_dir("resources/proj_a").unwrap(); + } + + #[test] + fn validate_accepts_single_segment() { + validate_project_dir("resources").unwrap(); + } + + // ── project_dir hardening (traversal / backslash / control) ───────── + #[test] + fn validate_project_dir_rejects_dotdot_segment() { + for bad in ["..", "../other", "resources/../other", "a/.."] { + assert!( + matches!( + validate_project_dir(bad), + Err(GitError::InvalidProjectDir(_)) + ), + "{bad:?} should be rejected", + ); + } + } + + #[test] + fn validate_project_dir_rejects_dot_segment() { + for bad in [".", "./x", "a/./b"] { + assert!(matches!( + validate_project_dir(bad), + Err(GitError::InvalidProjectDir(_)) + )); + } + } + + #[test] + fn validate_project_dir_rejects_backslash_and_control() { + for bad in ["a\\b", "a\0b", "a\nb"] { + assert!( + matches!( + validate_project_dir(bad), + Err(GitError::InvalidProjectDir(_)) + ), + "{bad:?} should be 
rejected", + ); + } + } + + // ── relative path validation (commit / show) ──────────────────────── + #[test] + fn validate_relative_path_accepts_normal_paths() { + for ok in ["a.md", "dir/a.md", "a/b/c/d.txt", "..hidden", "a..b"] { + validate_relative_path(ok) + .unwrap_or_else(|e| panic!("{ok:?} should be valid, got {e:?}")); + } + } + + #[test] + fn validate_relative_path_rejects_malicious() { + for bad in [ + "", // empty + "/abs/path", // leading slash + "trailing/", // trailing slash + "a//b", // empty segment + ".", // dot + "..", // dotdot + "../escape", // traversal at root + "a/../b", // traversal mid-path + "a/./b", // dot mid-path + "a\\b", // backslash + "a\0b", // NUL + "a\nb", // control char + ] { + assert!( + matches!(validate_relative_path(bad), Err(GitError::InvalidPath(_))), + "{bad:?} should be rejected", + ); + } + } +} + +#[cfg(test)] +mod fast_path1_tests { + //! Tests for Fast Path 1 (the persistent stat cache). The strategy: + //! - Wrap `MockFsCounting` around a tiny in-memory map with controllable + //! `mod_time` per path AND a `reads` counter. + //! - Wrap `LocalIndexStore` in a temp dir as the index backend. + //! - Run two commits and assert `reads` only goes up by the expected + //! amount on the second one. + //! + //! Each assertion pins a distinct invariant of Fast Path 1: cache hit + //! skips read; (size, mtime_ns) mismatch invalidates; parent_oid mismatch + //! disables the cache; corruption is silent; partial-paths preserves + //! uncovered entries; deletion removes entries. 
+ use super::*; + use async_trait::async_trait; + use std::collections::HashMap; + use std::sync::atomic::{AtomicU64, Ordering}; + use std::sync::{Arc, Mutex}; + use std::time::{Duration, SystemTime, UNIX_EPOCH}; + + use crate::core::errors::{Error, Result}; + use crate::core::filesystem::FileSystem; + use crate::core::types::{FileInfo, TreeEntry, WriteFlag}; + use crate::git::backends::local::{LocalIndexStore, LocalObjectStore, LocalRefStore}; + use crate::git::index_store::IndexStore; + + struct CountingVfs { + account: String, + // path -> (bytes, mtime_ns) + files: Arc, i128)>>>, + reads: AtomicU64, + } + + impl CountingVfs { + fn new(account: &str) -> Arc { + Arc::new(Self { + account: account.to_string(), + files: Arc::new(Mutex::new(HashMap::new())), + reads: AtomicU64::new(0), + }) + } + + fn put(&self, rel: &str, data: &[u8], mtime_ns: i128) { + let abs = format!("/local/{}/{}", self.account, rel); + self.files + .lock() + .unwrap() + .insert(abs, (data.to_vec(), mtime_ns)); + } + + fn delete(&self, rel: &str) { + let abs = format!("/local/{}/{}", self.account, rel); + self.files.lock().unwrap().remove(&abs); + } + + fn reads(&self) -> u64 { + self.reads.load(Ordering::SeqCst) + } + } + + fn nanos_to_systemtime(ns: i128) -> SystemTime { + // Tests use small positive nanos, so the cast is lossless. 
+ let secs = (ns / 1_000_000_000) as u64; + let sub = (ns % 1_000_000_000) as u32; + UNIX_EPOCH + Duration::new(secs, sub) + } + + #[async_trait] + impl FileSystem for CountingVfs { + async fn create(&self, _path: &str) -> Result<()> { + unimplemented!() + } + async fn mkdir(&self, _path: &str, _mode: u32) -> Result<()> { + unimplemented!() + } + async fn remove(&self, path: &str) -> Result<()> { + self.files.lock().unwrap().remove(path); + Ok(()) + } + async fn remove_all(&self, _path: &str) -> Result<()> { + unimplemented!() + } + + async fn read(&self, path: &str, _offset: u64, _size: u64) -> Result> { + let g = self.files.lock().unwrap(); + match g.get(path) { + Some((bytes, _)) => { + self.reads.fetch_add(1, Ordering::SeqCst); + Ok(bytes.clone()) + } + None => Err(Error::not_found(path)), + } + } + + async fn write( + &self, + path: &str, + data: &[u8], + _offset: u64, + _flags: WriteFlag, + ) -> Result { + self.files + .lock() + .unwrap() + .insert(path.to_string(), (data.to_vec(), 0)); + Ok(data.len() as u64) + } + async fn read_dir(&self, _path: &str) -> Result> { + unimplemented!() + } + + async fn stat(&self, path: &str) -> Result { + let g = self.files.lock().unwrap(); + if let Some((bytes, mtime_ns)) = g.get(path) { + let name = path.rsplit('/').next().unwrap_or(path).to_string(); + return Ok(FileInfo::new( + name, + bytes.len() as u64, + 0o644, + nanos_to_systemtime(*mtime_ns), + false, + )); + } + Err(Error::not_found(path)) + } + + async fn rename(&self, _o: &str, _n: &str) -> Result<()> { + unimplemented!() + } + async fn chmod(&self, _path: &str, _mode: u32) -> Result<()> { + unimplemented!() + } + + async fn tree_directory( + &self, + path: &str, + _show_hidden: bool, + _node_limit: Option, + _level_limit: Option, + ) -> Result> { + let prefix = if path == "/" { + "/".to_string() + } else { + format!("{}/", path) + }; + let g = self.files.lock().unwrap(); + let mut out = Vec::new(); + for (full_path, (_bytes, _mtime)) in g.iter() { + if 
!full_path.starts_with(&prefix) { + continue; + } + let rel = full_path + .strip_prefix(&prefix) + .unwrap_or(full_path) + .to_string(); + let name = full_path + .rsplit('/') + .next() + .unwrap_or(full_path) + .to_string(); + out.push(TreeEntry { + path: full_path.clone(), + rel_path: rel, + info: FileInfo::new_file(name, 0, 0o644), + extra: HashMap::new(), + }); + } + Ok(out) + } + } + + fn make_service_with_index( + account: &str, + ) -> ( + tempfile::TempDir, + Arc, + Arc, + GitService, + ) { + let dir = tempfile::tempdir().unwrap(); + let object_store = Arc::new(LocalObjectStore::new(dir.path())); + let ref_store = Arc::new(LocalRefStore::new(dir.path())); + let index_store = Arc::new(LocalIndexStore::new(dir.path())); + let vfs = CountingVfs::new(account); + let svc = GitService::with_index( + vfs.clone() as Arc, + object_store as Arc, + ref_store as Arc, + Some(index_store.clone() as Arc), + ); + (dir, vfs, index_store, svc) + } + + fn make_service_no_index(account: &str) -> (tempfile::TempDir, Arc, GitService) { + let dir = tempfile::tempdir().unwrap(); + let object_store = Arc::new(LocalObjectStore::new(dir.path())); + let ref_store = Arc::new(LocalRefStore::new(dir.path())); + let vfs = CountingVfs::new(account); + let svc = GitService::new( + vfs.clone() as Arc, + object_store as Arc, + ref_store as Arc, + ); + (dir, vfs, svc) + } + + fn req(account: &str, branch: &str, paths: Option>) -> CommitRequest { + CommitRequest { + account: account.to_string(), + branch: branch.to_string(), + message: "m".to_string(), + paths, + author_name: "tester".to_string(), + author_email: "t@x".to_string(), + } + } + + #[tokio::test] + async fn cached_stat_match_skips_read() { + // Two files. First commit reads both. Second commit, with + // identical (size, mtime_ns), reads NEITHER — Fast Path 1 hits. + // mtimes are well in the past (year 2001 / 2004) so they predate the + // real index file's save time → not racy → Fast Path 1 is trusted. 
+ let (_dir, vfs, _idx, svc) = make_service_with_index("acct"); + vfs.put("a.md", b"hello", 1_000_000_000_000_000_000); + vfs.put("b.md", b"world", 1_100_000_000_000_000_000); + + let _ = svc.commit(req("acct", "main", None)).await.unwrap(); + let reads_after_first = vfs.reads(); + assert_eq!(reads_after_first, 2, "first commit must read both files"); + + // Second commit, no changes → Noop, but commit() still walks + // candidates and decides whether to read each. Fast Path 1 means + // (size, mtime_ns) match → no read needed. + let resp = svc.commit(req("acct", "main", None)).await.unwrap(); + match resp { + CommitResponse::Noop { .. } => {} + other => panic!("expected Noop, got {other:?}"), + } + assert_eq!( + vfs.reads(), + reads_after_first, + "Fast Path 1 hit: no extra reads on second commit", + ); + } + + #[tokio::test] + async fn racy_clean_same_size_same_mtime_is_not_lost() { + // Regression for the "racy clean" data-loss bug: a file changed to the + // same byte length within the same filesystem clock tick as the index + // write must NOT be skipped by Fast Path 1. + // + // We simulate "the file's mtime is in (or after) the index's clock + // tick" by giving the working-tree file a mtime in the far future + // (year 2033), which is guaranteed to be >= the real index file's + // save time. Both versions are 2 bytes with the SAME (size, mtime), + // exactly the signature Fast Path 1 keys on. + let racy_mtime = 2_000_000_000_000_000_000; // ~year 2033, >= index save time + let (_dir, vfs, _idx, svc) = make_service_with_index("acct"); + + vfs.put("a.md", b"v1", racy_mtime); + let _ = svc.commit(req("acct", "main", None)).await.unwrap(); + let reads_after_v1 = vfs.reads(); + + // Change content to a different 2-byte value, keeping (size, mtime) + // identical — the pathological case the stat signature cannot detect. 
+ vfs.put("a.md", b"v2", racy_mtime); + let resp = svc.commit(req("acct", "main", None)).await.unwrap(); + + // The racy-clean guard must force a slow-path read and capture the + // change as a real commit. Without the guard this would be a Noop + // (Fast Path 1 reusing v1's blob oid) — silently dropping v2. + match resp { + CommitResponse::Created { .. } => {} + other => panic!("expected Created (v2 must be committed), got {other:?}"), + } + assert_eq!( + vfs.reads(), + reads_after_v1 + 1, + "racy entry (mtime >= index save time) must be re-read, not trusted", + ); + + // And the committed blob must actually be v2, not the stale v1. + let shown = svc + .show(ShowRequest { + account: "acct".to_string(), + target_ref: "main".to_string(), + path: Some("a.md".to_string()), + }) + .await + .unwrap(); + match shown { + ShowResponse::Blob { bytes, .. } => { + assert_eq!(&bytes[..], b"v2", "committed blob must reflect v2, not v1"); + } + other => panic!("expected Blob, got {other:?}"), + } + } + + #[tokio::test] + async fn size_mismatch_invalidates_cache_entry() { + let (_dir, vfs, _idx, svc) = make_service_with_index("acct"); + vfs.put("a.md", b"hello", 1_000_000_000_000_000_000); + let _ = svc.commit(req("acct", "main", None)).await.unwrap(); + let reads_after_first = vfs.reads(); + + // Same mtime, different size → Fast Path 1 must MISS for this file. + vfs.put("a.md", b"helloX", 1_000_000_000_000_000_000); + let _ = svc.commit(req("acct", "main", None)).await.unwrap(); + assert_eq!( + vfs.reads(), + reads_after_first + 1, + "size mismatch must trigger one extra read", + ); + } + + #[tokio::test] + async fn mtime_mismatch_invalidates_cache_entry() { + let (_dir, vfs, _idx, svc) = make_service_with_index("acct"); + vfs.put("a.md", b"hello", 1_000_000_000_000_000_000); + let _ = svc.commit(req("acct", "main", None)).await.unwrap(); + let reads_after_first = vfs.reads(); + + // Same size, different mtime → cache miss. 
+ vfs.put("a.md", b"hello", 2_000_000_000_000_000_000); + let resp = svc.commit(req("acct", "main", None)).await.unwrap(); + // Same content → identical oid → no editor change → Noop. + match resp { + CommitResponse::Noop { .. } => {} + other => panic!("expected Noop, got {other:?}"), + } + assert_eq!( + vfs.reads(), + reads_after_first + 1, + "mtime mismatch must trigger one extra read", + ); + } + + #[tokio::test] + async fn parent_oid_mismatch_disables_cache() { + // Build a service whose index file's parent_oid is stale relative + // to the branch HEAD: drop in a hand-crafted CommitIndex pointing at + // a bogus parent, then commit and assert ALL files were re-read. + let (dir, vfs, idx, svc) = make_service_with_index("acct"); + vfs.put("a.md", b"hello", 1_000_000_000_000_000_000); + vfs.put("b.md", b"world", 2_000_000_000_000_000_000); + let _ = svc.commit(req("acct", "main", None)).await.unwrap(); + let reads_after_first = vfs.reads(); + + // Overwrite the index on disk with one whose parent_oid is bogus. + let bogus = ObjectId::from_hex(b"deadbeefdeadbeefdeadbeefdeadbeefdeadbeef").unwrap(); + let stale = CommitIndex { + parent_oid: bogus, + entries: HashMap::new(), // doesn't matter; whole file is rejected + saved_at_ns: None, + }; + idx.save("acct", "main", &stale).await.unwrap(); + let _ = dir; // keep tempdir alive + + // Re-commit with same contents. Cache parent_oid != HEAD → cache + // discarded entirely → both files re-read. 
+ let _ = svc.commit(req("acct", "main", None)).await.unwrap(); + assert_eq!( + vfs.reads(), + reads_after_first + 2, + "parent_oid mismatch must force read of every candidate", + ); + } + + #[tokio::test] + async fn deleted_path_is_removed_from_index() { + let (_dir, vfs, idx, svc) = make_service_with_index("acct"); + vfs.put("a.md", b"a", 1_000_000_000_000_000_000); + vfs.put("b.md", b"b", 2_000_000_000_000_000_000); + let _ = svc.commit(req("acct", "main", None)).await.unwrap(); + + // Delete a.md, commit with explicit paths so the deletion is + // observed. Commit succeeds; the persisted index must drop a.md + // and keep b.md. + vfs.delete("a.md"); + let _ = svc + .commit(req( + "acct", + "main", + Some(vec!["a.md".into(), "b.md".into()]), + )) + .await + .unwrap(); + + let saved = idx.load("acct", "main").await.unwrap().unwrap(); + assert!( + !saved.entries.contains_key("a.md"), + "deleted path must be removed from the index", + ); + assert!( + saved.entries.contains_key("b.md"), + "surviving path must remain in the index", + ); + } + + #[tokio::test] + async fn corrupted_index_falls_back_silently() { + // Drop a malformed file at the index path BEFORE the first commit, + // then commit normally. The corrupt file makes load() return None, + // so commit takes the slow path — but it MUST still succeed and + // overwrite the corrupt file with a valid one. + let (dir, vfs, idx, svc) = make_service_with_index("acct"); + let path = dir.path().join("acct").join("index").join("main.json"); + tokio::fs::create_dir_all(path.parent().unwrap()) + .await + .unwrap(); + tokio::fs::write(&path, b"NOT-JSON-AT-ALL").await.unwrap(); + + vfs.put("a.md", b"hi", 1_000_000_000_000_000_000); + let resp = svc.commit(req("acct", "main", None)).await.unwrap(); + assert!(matches!(resp, CommitResponse::Created { .. })); + // The save after commit succeeded → load now returns a real index. 
+ let loaded = idx.load("acct", "main").await.unwrap().unwrap(); + assert!(loaded.entries.contains_key("a.md")); + } + + #[tokio::test] + async fn partial_paths_preserves_uncovered_entries() { + // First commit covers a + b. + // Second commit lists ONLY [a]; b is never enumerated. The new + // index must still contain b's entry so a future full-enum commit + // can still hit the cache for b. + let (_dir, vfs, idx, svc) = make_service_with_index("acct"); + vfs.put("a.md", b"a", 1_000_000_000_000_000_000); + vfs.put("b.md", b"b", 2_000_000_000_000_000_000); + let _ = svc.commit(req("acct", "main", None)).await.unwrap(); + + // Touch a (mtime change) — partial commit on a.md alone. + vfs.put("a.md", b"a", 3_000_000_000_000_000_000); + let _ = svc + .commit(req("acct", "main", Some(vec!["a.md".into()]))) + .await + .unwrap(); + + let saved = idx.load("acct", "main").await.unwrap().unwrap(); + let a = saved + .entries + .get("a.md") + .expect("a.md must be in the index"); + let b = saved.entries.get("b.md").expect( + "b.md was uncovered by paths=[a.md] but must be preserved \ + from the previous index", + ); + assert_eq!(a.mtime_ns, 3_000_000_000_000_000_000); + assert_eq!(b.mtime_ns, 2_000_000_000_000_000_000); + } + + #[tokio::test] + async fn no_index_store_disables_fast_path() { + // Sanity: with index_store=None the slow path runs every commit; + // a noop second commit still reads every file. 
+ let (_dir, vfs, svc) = make_service_no_index("acct"); + vfs.put("a.md", b"hi", 1_000_000_000_000_000_000); + let _ = svc.commit(req("acct", "main", None)).await.unwrap(); + let reads_after_first = vfs.reads(); + + let _ = svc.commit(req("acct", "main", None)).await.unwrap(); + assert_eq!( + vfs.reads(), + reads_after_first + 1, + "without an IndexStore the slow path runs every commit", + ); + } + + /// Partial commit with a directory entry must purge `new_index_entries` + /// by prefix — otherwise a file deleted under that directory leaves a + /// stale row in the persisted commit index, which the *next* commit + /// might serve to fast-path 1 as a valid cached oid. + /// + /// Uses `LocalFileSystem` because the Directory branch of Step 2.5 + /// only runs when `stat` returns `is_dir = true`. `CountingVfs::stat` + /// returns NotFound for any directory and so would route this test + /// through the NotFound branch — which has different prefix-cleanup + /// behavior (NotFound also clears the exact key). 
+    #[tokio::test]
+    async fn partial_commit_with_directory_path_purges_index_by_prefix() {
+        use crate::git::object_store::ObjectStore;
+        use crate::git::ref_store::RefStore;
+        use crate::plugins::localfs::LocalFileSystem;
+
+        let store_dir = tempfile::tempdir().unwrap();
+        let object_store = Arc::new(LocalObjectStore::new(store_dir.path()));
+        let ref_store = Arc::new(LocalRefStore::new(store_dir.path()));
+        let index_store = Arc::new(LocalIndexStore::new(store_dir.path()));
+
+        let work_dir = tempfile::tempdir().unwrap();
+        let acct_root = work_dir.path().join("local").join("acct");
+        std::fs::create_dir_all(acct_root.join("docs")).unwrap();
+        std::fs::create_dir_all(acct_root.join("other")).unwrap();
+        std::fs::write(acct_root.join("docs/a.md"), b"AA").unwrap();
+        std::fs::write(acct_root.join("docs/b.md"), b"BB").unwrap();
+        std::fs::write(acct_root.join("other/c.md"), b"CC").unwrap();
+
+        let vfs: Arc<dyn FileSystem> =
+            Arc::new(LocalFileSystem::new(work_dir.path().to_str().unwrap()).unwrap());
+        let svc = GitService::with_index(
+            vfs,
+            object_store as Arc<dyn ObjectStore>,
+            ref_store as Arc<dyn RefStore>,
+            Some(index_store.clone() as Arc<dyn IndexStore>),
+        );
+
+        let full = CommitRequest {
+            account: "acct".into(),
+            branch: "main".into(),
+            message: "m".into(),
+            paths: None,
+            author_name: "tester".into(),
+            author_email: "t@x".into(),
+        };
+        let _ = svc.commit(full).await.unwrap();
+        let loaded = index_store.load("acct", "main").await.unwrap().unwrap();
+        assert!(loaded.entries.contains_key("docs/a.md"));
+        assert!(loaded.entries.contains_key("docs/b.md"));
+        assert!(loaded.entries.contains_key("other/c.md"));
+
+        // Delete docs/b.md, partial-commit with paths=["docs"].
+        std::fs::remove_file(acct_root.join("docs/b.md")).unwrap();
+        let partial = CommitRequest {
+            account: "acct".into(),
+            branch: "main".into(),
+            message: "m".into(),
+            paths: Some(vec!["docs".into()]),
+            author_name: "tester".into(),
+            author_email: "t@x".into(),
+        };
+        let _ = svc.commit(partial).await.unwrap();
+
+        let loaded = index_store.load("acct", "main").await.unwrap().unwrap();
+        assert!(
+            !loaded.entries.contains_key("docs/b.md"),
+            "stale entry for deleted docs/b.md must not survive prefix cleanup"
+        );
+        assert!(
+            loaded.entries.contains_key("docs/a.md"),
+            "surviving file under docs/ must have a fresh entry"
+        );
+        assert!(
+            loaded.entries.contains_key("other/c.md"),
+            "files outside the partial scope must be preserved verbatim"
+        );
+    }
+}
diff --git a/crates/ragfs/src/git/tree_builder.rs b/crates/ragfs/src/git/tree_builder.rs
new file mode 100644
index 0000000000..34e66b8deb
--- /dev/null
+++ b/crates/ragfs/src/git/tree_builder.rs
@@ -0,0 +1,1360 @@
+//! Async-native Git tree editor for building and modifying tree objects.
+
+use std::collections::{BTreeMap, HashMap};
+
+use gix_hash::ObjectId;
+use gix_object::bstr::{BString, ByteSlice};
+use gix_object::tree::{self, EntryKind};
+use gix_object::{Tree, TreeRef, WriteTo};
+
+use crate::git::error::GitError;
+use crate::git::object_store::ObjectStore;
+use crate::git::util::{parse_object_header, read_object, write_object};
+
+/// One directory level of a tree: entry filename -> `tree::Entry` (blob or subtree).
+pub type TreeEntries = BTreeMap<BString, tree::Entry>;
+
+/// In-memory Git tree editor: the root level plus loaded subtrees keyed by `dir1/dir2` path.
+pub struct TreeEditor {
+    pub(crate) root: TreeEntries,
+    pub(crate) subtrees: HashMap<BString, TreeEntries>,
+}
+
+impl TreeEditor {
+    /// Create an editor with an empty root level and no loaded subtrees.
+    pub fn empty() -> Self {
+        Self {
+            root: BTreeMap::new(),
+            subtrees: HashMap::new(),
+        }
+    }
+
+    /// Split a path into components, validating each component.
+ fn split_path(path: &str) -> Result, GitError> { + if path.is_empty() { + return Err(GitError::Other("empty path".into())); + } + + let components: Vec<&str> = path.split('/').collect(); + for comp in &components { + if comp.is_empty() { + return Err(GitError::Other("empty path component".into())); + } + } + Ok(components) + } + + /// Join path components into a `dir1/dir2/...` BString key. + fn join_prefix(parts: &[&str]) -> BString { + let mut out = BString::default(); + for (i, p) in parts.iter().enumerate() { + if i > 0 { + out.push(b'/'); + } + out.extend_from_slice(p.as_bytes()); + } + out + } + + /// Ensure every subtree along `parent_dirs` is loaded into `self.subtrees`. + /// + /// Walks top-down. For each level not yet present in `self.subtrees`, it + /// reads the directory entry from the parent level and lazily loads the + /// referenced tree object (Fast Path 2: untouched subtrees are never + /// loaded). Returns `Ok(false)` if a component is missing and + /// `create_missing` is false (used by `remove` for its no-op semantics); + /// `Ok(true)` once all levels are present. + async fn ensure_path_loaded( + &mut self, + store: &dyn ObjectStore, + account: &str, + parent_dirs: &[&str], + create_missing: bool, + ) -> Result { + for depth in 1..=parent_dirs.len() { + let dir_name = parent_dirs[depth - 1]; + let child_key = Self::join_prefix(&parent_dirs[..depth]); + + if self.subtrees.contains_key(&child_key) { + continue; + } + + // Inspect the directory entry in the parent level. Copy out the oid + // (or the decision) before any await to avoid holding a borrow of + // `self.subtrees`/`self.root` across the load. + enum Action { + Load(ObjectId), + Empty, + Missing, + } + let action = { + let parent: &TreeEntries = if depth == 1 { + &self.root + } else { + let parent_key = Self::join_prefix(&parent_dirs[..depth - 1]); + self.subtrees.get(&parent_key).ok_or_else(|| { + GitError::Other(format!("subtree not loaded: {parent_key}")) + })? 
+ }; + match parent.get(dir_name.as_bytes().as_bstr()) { + // Existing directory entry: either an empty subtree or one + // to load. Null and the well-known empty-tree oid are both + // treated as empty — the empty tree object is conventionally + // never physically stored, so we must not try to load it. + Some(entry) if entry.mode == EntryKind::Tree.into() => { + if entry.oid.is_null() + || entry.oid == ObjectId::empty_tree(gix_hash::Kind::Sha1) + { + Action::Empty + } else { + Action::Load(entry.oid) + } + } + // Either no entry, or one that exists but is a file/symlink + // (a file→dir transition). Treat both as missing: the create + // path overwrites the stale opposite-kind entry with a fresh + // directory; the no-create path (remove) no-ops. + _ => Action::Missing, + } + }; + + match action { + Action::Load(oid) => { + let entries = load_tree_entries(store, account, &oid).await?; + self.subtrees.insert(child_key, entries); + } + Action::Empty => { + self.subtrees.insert(child_key, BTreeMap::new()); + } + Action::Missing => { + if !create_missing { + return Ok(false); + } + // Create the directory entry in the parent level, then an + // empty subtree for it. + let parent: &mut TreeEntries = if depth == 1 { + &mut self.root + } else { + let parent_key = Self::join_prefix(&parent_dirs[..depth - 1]); + self.subtrees.entry(parent_key).or_insert_with(BTreeMap::new) + }; + parent.insert( + dir_name.into(), + tree::Entry { + mode: EntryKind::Tree.into(), + filename: dir_name.into(), + oid: ObjectId::null(gix_hash::Kind::Sha1), + }, + ); + self.subtrees.insert(child_key, BTreeMap::new()); + } + } + } + Ok(true) + } + + /// Upsert a blob object at the given path. 
+ pub async fn upsert( + &mut self, + store: &dyn ObjectStore, + account: &str, + path: &str, + oid: ObjectId, + ) -> Result<(), GitError> { + let components = Self::split_path(path)?; + let (filename, parent_dirs) = components + .split_last() + .ok_or_else(|| GitError::Other("empty path".into()))?; + + let leaf = tree::Entry { + mode: EntryKind::Blob.into(), + filename: (*filename).into(), + oid, + }; + + if parent_dirs.is_empty() { + self.root.insert((*filename).into(), leaf); + return Ok(()); + } + + self.ensure_path_loaded(store, account, parent_dirs, true) + .await?; + + let leaf_key = Self::join_prefix(parent_dirs); + let subtree = self.subtrees.entry(leaf_key).or_insert_with(BTreeMap::new); + subtree.insert((*filename).into(), leaf); + + Ok(()) + } + + /// Remove a path from the tree. No-op if the path does not exist. + pub async fn remove( + &mut self, + store: &dyn ObjectStore, + account: &str, + path: &str, + ) -> Result<(), GitError> { + let components = Self::split_path(path)?; + let (filename, parent_dirs) = components + .split_last() + .ok_or_else(|| GitError::Other("empty path".into()))?; + + if parent_dirs.is_empty() { + self.root.remove(filename.as_bytes().as_bstr()); + return Ok(()); + } + + // Missing ancestor → nothing to remove (keep no-op semantics, do not + // create directories). + if !self + .ensure_path_loaded(store, account, parent_dirs, false) + .await? + { + return Ok(()); + } + + let prefix = Self::join_prefix(parent_dirs); + if let Some(subtree) = self.subtrees.get_mut(&prefix) { + subtree.remove(filename.as_bytes().as_bstr()); + } + Ok(()) + } + + /// Splice an existing subtree (referenced by its OID) into the editor at the + /// given path. The path's intermediate ancestors are created as needed. + /// + /// Any in-memory editor state under `path` is discarded — subsequent + /// `write()` calls will reference `subtree_oid` directly without rebuilding + /// the subtree. 
This is the API `restore` uses to swap a whole project + /// directory to a historical version without enumerating every file. + /// + /// Note: if you later call `upsert`/`remove` *inside* the spliced subtree + /// (e.g. `upsert_subtree("a/b", oid); upsert("a/b/x.txt", ...)`), the + /// in-memory state for "a/b" is rebuilt from those edits alone — the + /// contents of `subtree_oid` are not merged in. Splice, then edit, is a + /// destructive pattern. + pub async fn upsert_subtree( + &mut self, + store: &dyn ObjectStore, + account: &str, + path: &str, + subtree_oid: ObjectId, + ) -> Result<(), GitError> { + let components = Self::split_path(path)?; + let (dirname, parent_dirs) = components + .split_last() + .ok_or_else(|| GitError::Other("empty path".into()))?; + + // Ensure each ancestor directory's subtree is loaded (so sibling + // entries are preserved), creating missing ancestors as needed. + self.ensure_path_loaded(store, account, parent_dirs, true) + .await?; + + // Insert the leaf Tree entry pointing at the precomputed subtree. + let leaf_entry = tree::Entry { + mode: EntryKind::Tree.into(), + filename: (*dirname).into(), + oid: subtree_oid, + }; + let leaf_parent: &mut TreeEntries = if parent_dirs.is_empty() { + &mut self.root + } else { + let key = Self::join_prefix(parent_dirs); + self.subtrees.entry(key).or_insert_with(BTreeMap::new) + }; + leaf_parent.insert((*dirname).into(), leaf_entry); + + // Drop any stale in-memory state at or beneath `path` so write_subtree + // doesn't recurse — it will reuse `subtree_oid` directly. 
+ let prefix = Self::join_prefix(&components); + let prefix_slash: Vec = { + let mut v = Vec::with_capacity(prefix.len() + 1); + v.extend_from_slice(prefix.as_slice()); + v.push(b'/'); + v + }; + let to_remove: Vec = self + .subtrees + .keys() + .filter(|k| { + k.as_slice() == prefix.as_slice() + || k.as_slice().starts_with(&prefix_slash) + }) + .cloned() + .collect(); + for k in to_remove { + self.subtrees.remove(&k); + } + + Ok(()) + } + + /// Load an existing tree from ObjectStore as the editing base. + /// + /// Only the root tree is loaded eagerly; subtrees are loaded lazily on + /// first `upsert`/`remove`/`upsert_subtree` that touches them. Untouched + /// subtrees are never read into memory and are reused as-is during + /// `write` (Fast Path 2). + pub async fn from_tree( + store: &dyn ObjectStore, + account: &str, + tree_oid: ObjectId, + ) -> Result { + let mut editor = Self::empty(); + editor.root = load_tree_entries(store, account, &tree_oid).await?; + Ok(editor) + } + + /// Write all in-memory trees to ObjectStore, returning the root tree oid. + /// Writes bottom-up: leaf subtrees first, then their parents. + /// Empty subtrees are pruned. + pub async fn write( + &mut self, + store: &dyn ObjectStore, + account: &str, + ) -> Result { + self.write_subtree(store, account, &BString::default()).await + } + + fn write_subtree<'a>( + &'a mut self, + store: &'a dyn ObjectStore, + account: &'a str, + prefix: &'a BString, + ) -> std::pin::Pin> + Send + 'a>> + { + Box::pin(async move { + // Snapshot entry keys so we can mutate self.subtrees during recursion. + let entry_specs: Vec<(BString, tree::Entry)> = { + let entries = if prefix.is_empty() { + &self.root + } else { + self.subtrees.get(prefix).ok_or_else(|| { + GitError::Other(format!("subtree not found: {prefix}")) + })? 
+ }; + entries.iter().map(|(k, v)| (k.clone(), v.clone())).collect() + }; + + let mut result_entries: Vec = Vec::with_capacity(entry_specs.len()); + for (name, entry) in entry_specs { + if entry.mode.is_tree() { + let child_prefix = if prefix.is_empty() { + name.clone() + } else { + let mut p = prefix.clone(); + p.push(b'/'); + p.extend_from_slice(&name); + p + }; + + match self.subtrees.get(&child_prefix) { + Some(child_entries) if child_entries.is_empty() => { + // Prune empty subtree. + continue; + } + Some(_) => { + // Subtree has in-memory edits — recurse to write them. + let child_oid = self.write_subtree(store, account, &child_prefix).await?; + result_entries.push(tree::Entry { + mode: EntryKind::Tree.into(), + filename: name, + oid: child_oid, + }); + } + None => { + // No in-memory state: use the entry's existing OID as-is + // (e.g. placed by upsert_subtree or from_tree for untouched + // subtrees). This is the Fast Path 2 optimisation. + result_entries.push(entry); + } + } + } else { + result_entries.push(entry); + } + } + + result_entries.sort(); + let tree = Tree { entries: result_entries }; + let mut buf = Vec::new(); + tree.write_to(&mut buf) + .map_err(|e| GitError::Other(format!("tree serialization: {e}")))?; + + let oid = write_object(store, account, gix_object::Kind::Tree, &buf).await?; + Ok(oid) + }) + } +} + +/// Convert BTreeMap entries to a `gix_object::Tree` with Git sort order. +#[cfg(test)] +fn entries_to_tree(entries: &TreeEntries) -> Tree { + let mut sorted: Vec = entries.values().cloned().collect(); + sorted.sort(); + Tree { entries: sorted } +} + +/// Read and parse a tree object from ObjectStore. 
+///
+/// The stored object is read in full, its header is skipped using the length
+/// reported by `parse_object_header`, and the remainder is decoded as a tree.
+/// Undecodable payloads are reported as `GitError::CorruptedObject`.
+async fn load_tree(
+    store: &dyn ObjectStore,
+    account: &str,
+    oid: &ObjectId,
+) -> Result<Tree, GitError> {
+    let raw = read_object(store, account, oid).await?;
+    let (_, _, header_len) = parse_object_header(&raw)?;
+    let tree_ref = TreeRef::from_bytes(&raw[header_len..])
+        .map_err(|e| GitError::CorruptedObject(format!("invalid tree: {e}")))?;
+    Ok(Tree::from(tree_ref))
+}
+
+/// Load a tree object and convert it into a `TreeEntries` map keyed by filename.
+async fn load_tree_entries(
+    store: &dyn ObjectStore,
+    account: &str,
+    oid: &ObjectId,
+) -> Result<TreeEntries, GitError> {
+    let tree = load_tree(store, account, oid).await?;
+    Ok(tree
+        .entries
+        .into_iter()
+        .map(|entry| (entry.filename.clone(), entry))
+        .collect())
+}
+
+/// Recursively flatten a tree into (path, blob_oid) pairs.
+///
+/// If `path_filter` is Some, only include blob paths whose prefix matches
+/// any of the filter prefixes. Subtrees are descended into only when relevant.
+/// The result is sorted by path.
+///
+/// NOTE(review): matching is a plain string prefix test, so a filter of
+/// `docs` also selects `docs2/x.md` and `docs.md`. Confirm callers pass
+/// filters for which that is intended (or that end in `/`).
+pub async fn flatten(
+    store: &dyn ObjectStore,
+    account: &str,
+    tree_oid: ObjectId,
+    // NOTE(review): the element type was lost in extraction; `String` is
+    // assumed from how the filters are used — confirm against callers.
+    path_filter: &Option<Vec<String>>,
+) -> Result<Vec<(String, ObjectId)>, GitError> {
+    let mut result = Vec::new();
+    // Explicit work stack instead of recursion: (path prefix, tree oid).
+    let mut stack: Vec<(String, ObjectId)> = vec![(String::new(), tree_oid)];
+
+    while let Some((prefix, oid)) = stack.pop() {
+        let tree = load_tree(store, account, &oid).await?;
+        for entry in tree.entries {
+            let path = if prefix.is_empty() {
+                entry.filename.to_string()
+            } else {
+                format!("{}/{}", prefix, entry.filename)
+            };
+
+            if entry.mode.is_tree() {
+                // Descend when the directory lies under a filter, or when a
+                // filter points somewhere beneath this directory.
+                let should_descend = match path_filter {
+                    None => true,
+                    Some(filters) => filters
+                        .iter()
+                        .any(|f| path.starts_with(f) || f.starts_with(&path)),
+                };
+                if should_descend {
+                    stack.push((path, entry.oid));
+                }
+            } else {
+                let include = match path_filter {
+                    None => true,
+                    Some(filters) => filters.iter().any(|f| path.starts_with(f)),
+                };
+                if include {
+                    result.push((path, entry.oid));
+                }
+            }
+        }
+    }
+
+    // Stack traversal visits directories in no particular order; normalize.
+    result.sort_by(|a, b| a.0.cmp(&b.0));
+    Ok(result)
+}
+
+/// Look up a
single path in a tree, returning the entry's oid and mode.
+/// Returns `Ok(None)` if the path doesn't exist.
+///
+/// Uses a throwaway cache, so nothing is shared between calls; prefer
+/// `lookup_cached` when resolving many paths against the same root.
+pub async fn lookup(
+    store: &dyn ObjectStore,
+    account: &str,
+    tree_oid: ObjectId,
+    path: &str,
+) -> Result<Option<(ObjectId, tree::EntryMode)>, GitError> {
+    let mut cache = TreeLookupCache::new();
+    lookup_cached(store, account, tree_oid, path, &mut cache).await
+}
+
+/// In-memory cache of decoded tree objects keyed by their OID. Intended for use
+/// across many `lookup_cached` calls that share the same root (e.g. the commit
+/// hot loop, where K candidate paths each walk depth-D ancestor trees that
+/// otherwise get re-fetched + re-zlib-decoded K×D times).
+///
+/// Entries are `Arc`-shared so a clone is cheap; the cache is single-writer
+/// (the caller's `&mut`) so no internal locking is needed.
+pub struct TreeLookupCache {
+    // Decoded entries per tree oid; never evicted for the cache's lifetime.
+    by_oid: HashMap<ObjectId, std::sync::Arc<TreeEntries>>,
+}
+
+impl TreeLookupCache {
+    /// Create an empty cache.
+    pub fn new() -> Self {
+        Self {
+            by_oid: HashMap::new(),
+        }
+    }
+
+    /// Pre-seed the cache with an already-decoded tree's entries. Useful when
+    /// the caller has the root entries on hand (e.g. from `TreeEditor::from_tree`)
+    /// and wants the very first `lookup_cached` to skip the redundant fetch.
+    /// An existing entry for `oid` is replaced.
+    pub fn seed(&mut self, oid: ObjectId, entries: TreeEntries) {
+        self.by_oid.insert(oid, std::sync::Arc::new(entries));
+    }
+}
+
+impl Default for TreeLookupCache {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+/// Same as [`lookup`], but reuses an external [`TreeLookupCache`] across calls
+/// so each tree object is fetched + decoded at most once. The cache is keyed
+/// on the *content-addressed* tree OID, so it stays correct across calls with
+/// different starting roots.
+pub async fn lookup_cached( + store: &dyn ObjectStore, + account: &str, + tree_oid: ObjectId, + path: &str, + cache: &mut TreeLookupCache, +) -> Result, GitError> { + if path.is_empty() { + return Err(GitError::Other("empty path".into())); + } + let components: Vec<&str> = path.split('/').collect(); + let mut current_oid = tree_oid; + + for (i, component) in components.iter().enumerate() { + if component.is_empty() { + return Err(GitError::Other("empty path component".into())); + } + let entries = match cache.by_oid.get(¤t_oid) { + Some(e) => e.clone(), + None => { + let loaded = std::sync::Arc::new( + load_tree_entries(store, account, ¤t_oid).await?, + ); + cache.by_oid.insert(current_oid, loaded.clone()); + loaded + } + }; + let filename = component.as_bytes(); + let is_last = i == components.len() - 1; + + match entries.get(filename.as_bstr()) { + Some(entry) => { + if is_last { + return Ok(Some((entry.oid, entry.mode))); + } else if entry.mode.is_tree() { + current_oid = entry.oid; + } else { + return Ok(None); + } + } + None => return Ok(None), + } + } + + Ok(None) +} + +#[cfg(test)] +mod tests { + use super::*; + + fn dummy_oid() -> ObjectId { + ObjectId::null(gix_hash::Kind::Sha1) + } + + #[test] + fn test_empty_editor() { + let editor = TreeEditor::empty(); + assert!(editor.root.is_empty()); + assert!(editor.subtrees.is_empty()); + } + + #[tokio::test] + async fn test_upsert_single_file() { + let (_d, store) = make_store(); + let mut editor = TreeEditor::empty(); + let oid = dummy_oid(); + + editor.upsert(&store, "acc", "file.txt", oid).await.unwrap(); + + assert_eq!(editor.root.len(), 1); + let entry = editor.root.get("file.txt".as_bytes().as_bstr()).unwrap(); + assert_eq!(entry.mode, EntryKind::Blob.into()); + assert_eq!(entry.oid, oid); + assert_eq!(entry.filename, "file.txt"); + } + + #[tokio::test] + async fn test_upsert_nested_path() { + let (_d, store) = make_store(); + let mut editor = TreeEditor::empty(); + let oid = dummy_oid(); + + editor + 
.upsert(&store, "acc", "dir/subdir/file.txt", oid) + .await + .unwrap(); + + // Root has dir + assert_eq!(editor.root.len(), 1); + let dir_entry = editor.root.get("dir".as_bytes().as_bstr()).unwrap(); + assert_eq!(dir_entry.mode, EntryKind::Tree.into()); + + // Subtrees has dir + let dir_subtree = editor.subtrees.get("dir".as_bytes().as_bstr()).unwrap(); + assert_eq!(dir_subtree.len(), 1); + let subdir_entry = dir_subtree.get("subdir".as_bytes().as_bstr()).unwrap(); + assert_eq!(subdir_entry.mode, EntryKind::Tree.into()); + + // Subdir subtree + let subdir_subtree = editor.subtrees.get("dir/subdir".as_bytes().as_bstr()).unwrap(); + assert_eq!(subdir_subtree.len(), 1); + let file_entry = subdir_subtree.get("file.txt".as_bytes().as_bstr()).unwrap(); + assert_eq!(file_entry.mode, EntryKind::Blob.into()); + assert_eq!(file_entry.oid, oid); + } + + #[tokio::test] + async fn test_upsert_overwrite() { + let (_d, store) = make_store(); + let mut editor = TreeEditor::empty(); + let oid1 = dummy_oid(); + let oid2 = ObjectId::from_hex(b"abcdef1234567890abcdef1234567890abcdef12").unwrap(); + + editor.upsert(&store, "acc", "file.txt", oid1).await.unwrap(); + editor.upsert(&store, "acc", "file.txt", oid2).await.unwrap(); + + let entry = editor.root.get("file.txt".as_bytes().as_bstr()).unwrap(); + assert_eq!(entry.oid, oid2); + } + + #[tokio::test] + async fn test_upsert_empty_component_rejected() { + let (_d, store) = make_store(); + let mut editor = TreeEditor::empty(); + let oid = dummy_oid(); + + assert!(editor.upsert(&store, "acc", "", oid).await.is_err()); + assert!(editor.upsert(&store, "acc", "file//txt", oid).await.is_err()); + assert!(editor.upsert(&store, "acc", "/file.txt", oid).await.is_err()); + assert!(editor.upsert(&store, "acc", "file.txt/", oid).await.is_err()); + } + + #[tokio::test] + async fn test_remove_existing() { + let (_d, store) = make_store(); + let mut editor = TreeEditor::empty(); + let oid = dummy_oid(); + + editor.upsert(&store, "acc", 
"dir/file.txt", oid).await.unwrap();
+        assert_eq!(editor.root.len(), 1);
+
+        editor.remove(&store, "acc", "dir/file.txt").await.unwrap();
+
+        let dir_subtree = editor.subtrees.get("dir".as_bytes().as_bstr()).unwrap();
+        assert!(dir_subtree.is_empty());
+    }
+
+    #[tokio::test]
+    async fn test_remove_nonexistent_is_noop() {
+        let (_d, store) = make_store();
+        let mut editor = TreeEditor::empty();
+        editor.remove(&store, "acc", "nonexistent.txt").await.unwrap();
+        editor.remove(&store, "acc", "dir/nonexistent.txt").await.unwrap();
+    }
+
+    #[tokio::test]
+    async fn test_upsert_top_level_file() {
+        let (_d, store) = make_store();
+        let mut editor = TreeEditor::empty();
+        let oid = dummy_oid();
+
+        editor.upsert(&store, "acc", "top-level.txt", oid).await.unwrap();
+
+        assert_eq!(editor.root.len(), 1);
+        let entry = editor.root.get("top-level.txt".as_bytes().as_bstr()).unwrap();
+        assert_eq!(entry.mode, EntryKind::Blob.into());
+        assert_eq!(entry.filename, "top-level.txt");
+        assert_eq!(entry.oid, oid);
+    }
+
+    #[tokio::test]
+    async fn test_remove_top_level_file() {
+        let (_d, store) = make_store();
+        let mut editor = TreeEditor::empty();
+        let oid = dummy_oid();
+
+        editor.upsert(&store, "acc", "single.txt", oid).await.unwrap();
+        assert_eq!(editor.root.len(), 1);
+
+        editor.remove(&store, "acc", "single.txt").await.unwrap();
+        assert_eq!(editor.root.len(), 0);
+    }
+
+    // --- Test helpers ---
+
+    fn make_store() -> (tempfile::TempDir, crate::git::backends::local::LocalObjectStore) {
+        let dir = tempfile::tempdir().unwrap();
+        let store = crate::git::backends::local::LocalObjectStore::new(dir.path());
+        (dir, store)
+    }
+
+    fn serialize_tree(tree: &Tree) -> Vec<u8> {
+        let mut buf = Vec::new();
+        tree.write_to(&mut buf).unwrap();
+        buf
+    }
+
+    fn oid_hex(hex: &[u8; 40]) -> ObjectId {
+        ObjectId::from_hex(hex).unwrap()
+    }
+
+    // --- from_tree ---
+
+    #[tokio::test]
+    async fn test_from_tree_empty() {
+        let (_d, store) = make_store();
+        let empty_tree = Tree { entries: 
Vec::new() }; + let oid = write_object(&store, "acc", gix_object::Kind::Tree, &serialize_tree(&empty_tree)) + .await + .unwrap(); + + let editor = TreeEditor::from_tree(&store, "acc", oid).await.unwrap(); + assert!(editor.root.is_empty()); + assert!(editor.subtrees.is_empty()); + } + + #[tokio::test] + async fn test_from_tree_with_entries() { + let (_d, store) = make_store(); + + let blob_a = oid_hex(b"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"); + let blob_b = oid_hex(b"bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"); + + let resources_tree = Tree { + entries: vec![tree::Entry { + mode: EntryKind::Blob.into(), + filename: "b.md".into(), + oid: blob_b, + }], + }; + let resources_oid = write_object(&store, "acc", gix_object::Kind::Tree, &serialize_tree(&resources_tree)) + .await + .unwrap(); + + let root_tree = Tree { + entries: vec![ + tree::Entry { + mode: EntryKind::Blob.into(), + filename: "a.md".into(), + oid: blob_a, + }, + tree::Entry { + mode: EntryKind::Tree.into(), + filename: "resources".into(), + oid: resources_oid, + }, + ], + }; + let root_oid = write_object(&store, "acc", gix_object::Kind::Tree, &serialize_tree(&root_tree)) + .await + .unwrap(); + + let editor = TreeEditor::from_tree(&store, "acc", root_oid).await.unwrap(); + assert_eq!(editor.root.len(), 2); + assert!(editor.root.contains_key("a.md".as_bytes().as_bstr())); + assert!(editor.root.contains_key("resources".as_bytes().as_bstr())); + + // Lazy loading: from_tree only loads root; subtrees are not read yet. 
+ assert!(editor.subtrees.is_empty()); + } + + // --- write --- + + #[tokio::test] + async fn test_write_empty_tree() { + let (_d, store) = make_store(); + let mut editor = TreeEditor::empty(); + let oid = editor.write(&store, "acc").await.unwrap(); + assert_eq!(oid, ObjectId::empty_tree(gix_hash::Kind::Sha1)); + } + + #[tokio::test] + async fn test_write_single_blob() { + let (_d, store) = make_store(); + let blob_oid = oid_hex(b"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"); + let mut editor = TreeEditor::empty(); + editor.upsert(&store, "acc", "README.md", blob_oid).await.unwrap(); + let root_oid = editor.write(&store, "acc").await.unwrap(); + + let tree = load_tree(&store, "acc", &root_oid).await.unwrap(); + assert_eq!(tree.entries.len(), 1); + assert_eq!(tree.entries[0].filename, "README.md"); + assert_eq!(tree.entries[0].oid, blob_oid); + } + + #[tokio::test] + async fn test_write_nested_structure() { + let (_d, store) = make_store(); + let oid1 = oid_hex(b"1111111111111111111111111111111111111111"); + let oid2 = oid_hex(b"2222222222222222222222222222222222222222"); + + let mut editor = TreeEditor::empty(); + editor.upsert(&store, "acc", "README.md", oid1).await.unwrap(); + editor.upsert(&store, "acc", "resources/docs/a.md", oid2).await.unwrap(); + let root_oid = editor.write(&store, "acc").await.unwrap(); + + let root_tree = load_tree(&store, "acc", &root_oid).await.unwrap(); + assert_eq!(root_tree.entries.len(), 2); + + let resources_entry = root_tree + .entries + .iter() + .find(|e| e.filename == "resources") + .unwrap(); + assert!(resources_entry.mode.is_tree()); + + let res_tree = load_tree(&store, "acc", &resources_entry.oid).await.unwrap(); + assert_eq!(res_tree.entries.len(), 1); + + let docs_entry = &res_tree.entries[0]; + assert!(docs_entry.mode.is_tree()); + let docs_tree = load_tree(&store, "acc", &docs_entry.oid).await.unwrap(); + assert_eq!(docs_tree.entries.len(), 1); + assert_eq!(docs_tree.entries[0].oid, oid2); + } + + #[tokio::test] + async fn 
test_round_trip_from_tree_upsert_write() { + let (_d, store) = make_store(); + let oid1 = oid_hex(b"1111111111111111111111111111111111111111"); + let oid2 = oid_hex(b"2222222222222222222222222222222222222222"); + + let mut editor = TreeEditor::empty(); + editor.upsert(&store, "acc", "a.md", oid1).await.unwrap(); + let first_oid = editor.write(&store, "acc").await.unwrap(); + + let mut editor2 = TreeEditor::from_tree(&store, "acc", first_oid).await.unwrap(); + editor2.upsert(&store, "acc", "b.md", oid2).await.unwrap(); + let second_oid = editor2.write(&store, "acc").await.unwrap(); + + let tree = load_tree(&store, "acc", &second_oid).await.unwrap(); + assert_eq!(tree.entries.len(), 2); + } + + #[tokio::test] + async fn test_file_to_dir_transition_overwrites_blob() { + // Prev tree: `foo` is a file (Blob). Then a path appears *under* it + // (`foo/bar.md`), i.e. `foo` is now a directory. Upsert must replace + // the stale Blob entry with a Tree and place the child inside it. + let (_d, store) = make_store(); + let blob_foo = oid_hex(b"1111111111111111111111111111111111111111"); + let blob_bar = oid_hex(b"2222222222222222222222222222222222222222"); + + let mut editor = TreeEditor::empty(); + editor.upsert(&store, "acc", "foo", blob_foo).await.unwrap(); + let first_oid = editor.write(&store, "acc").await.unwrap(); + + let mut editor2 = TreeEditor::from_tree(&store, "acc", first_oid).await.unwrap(); + editor2 + .upsert(&store, "acc", "foo/bar.md", blob_bar) + .await + .unwrap(); + let second_oid = editor2.write(&store, "acc").await.unwrap(); + + let root = load_tree(&store, "acc", &second_oid).await.unwrap(); + assert_eq!(root.entries.len(), 1); + let foo_entry = &root.entries[0]; + assert_eq!(foo_entry.filename, "foo"); + assert!(foo_entry.mode.is_tree(), "foo must now be a directory"); + + let foo_tree = load_tree(&store, "acc", &foo_entry.oid).await.unwrap(); + assert_eq!(foo_tree.entries.len(), 1); + assert_eq!(foo_tree.entries[0].filename, "bar.md"); + 
assert_eq!(foo_tree.entries[0].oid, blob_bar); + } + + #[tokio::test] + async fn test_remove_under_file_is_noop() { + // Prev tree: `foo` is a file. Removing `foo/bar.md` (treating `foo` as + // a dir) must be a silent no-op — not an error — and leave `foo` intact. + let (_d, store) = make_store(); + let blob_foo = oid_hex(b"1111111111111111111111111111111111111111"); + + let mut editor = TreeEditor::empty(); + editor.upsert(&store, "acc", "foo", blob_foo).await.unwrap(); + let first_oid = editor.write(&store, "acc").await.unwrap(); + + let mut editor2 = TreeEditor::from_tree(&store, "acc", first_oid).await.unwrap(); + editor2.remove(&store, "acc", "foo/bar.md").await.unwrap(); + let second_oid = editor2.write(&store, "acc").await.unwrap(); + + let root = load_tree(&store, "acc", &second_oid).await.unwrap(); + assert_eq!(root.entries.len(), 1); + assert_eq!(root.entries[0].filename, "foo"); + assert!(root.entries[0].mode == EntryKind::Blob.into()); + assert_eq!(root.entries[0].oid, blob_foo); + } + + // --- flatten --- + + #[tokio::test] + async fn test_flatten_empty_tree() { + let (_d, store) = make_store(); + let empty_tree = Tree { entries: Vec::new() }; + let oid = write_object(&store, "acc", gix_object::Kind::Tree, &serialize_tree(&empty_tree)) + .await + .unwrap(); + let result = flatten(&store, "acc", oid, &None).await.unwrap(); + assert!(result.is_empty()); + } + + #[tokio::test] + async fn test_flatten_nested_tree() { + let (_d, store) = make_store(); + let oid1 = oid_hex(b"1111111111111111111111111111111111111111"); + let oid2 = oid_hex(b"2222222222222222222222222222222222222222"); + + let mut editor = TreeEditor::empty(); + editor.upsert(&store, "acc", "README.md", oid1).await.unwrap(); + editor.upsert(&store, "acc", "resources/docs/a.md", oid2).await.unwrap(); + let root_oid = editor.write(&store, "acc").await.unwrap(); + + let result = flatten(&store, "acc", root_oid, &None).await.unwrap(); + assert_eq!(result.len(), 2); + assert_eq!(result[0].0, 
"README.md"); + assert_eq!(result[0].1, oid1); + assert_eq!(result[1].0, "resources/docs/a.md"); + assert_eq!(result[1].1, oid2); + } + + #[tokio::test] + async fn test_flatten_with_path_filter() { + let (_d, store) = make_store(); + let oid1 = oid_hex(b"1111111111111111111111111111111111111111"); + let oid2 = oid_hex(b"2222222222222222222222222222222222222222"); + + let mut editor = TreeEditor::empty(); + editor.upsert(&store, "acc", "a.md", oid1).await.unwrap(); + editor.upsert(&store, "acc", "resources/b.md", oid2).await.unwrap(); + let root_oid = editor.write(&store, "acc").await.unwrap(); + + let filter = Some(vec!["resources".to_string()]); + let result = flatten(&store, "acc", root_oid, &filter).await.unwrap(); + assert_eq!(result.len(), 1); + assert_eq!(result[0].0, "resources/b.md"); + assert_eq!(result[0].1, oid2); + } + + // --- lookup --- + + #[tokio::test] + async fn test_lookup_blob_in_root() { + let (_d, store) = make_store(); + let blob_oid = oid_hex(b"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"); + let mut editor = TreeEditor::empty(); + editor.upsert(&store, "acc", "README.md", blob_oid).await.unwrap(); + let root_oid = editor.write(&store, "acc").await.unwrap(); + + let (found, mode) = lookup(&store, "acc", root_oid, "README.md") + .await + .unwrap() + .unwrap(); + assert_eq!(found, blob_oid); + assert!(mode.is_blob()); + } + + #[tokio::test] + async fn test_lookup_nested_blob() { + let (_d, store) = make_store(); + let blob_oid = oid_hex(b"bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"); + let mut editor = TreeEditor::empty(); + editor.upsert(&store, "acc", "resources/a.md", blob_oid).await.unwrap(); + let root_oid = editor.write(&store, "acc").await.unwrap(); + + let (found, _) = lookup(&store, "acc", root_oid, "resources/a.md") + .await + .unwrap() + .unwrap(); + assert_eq!(found, blob_oid); + } + + #[tokio::test] + async fn test_lookup_not_found() { + let (_d, store) = make_store(); + let empty_tree = Tree { entries: Vec::new() }; + let root_oid = 
write_object(&store, "acc", gix_object::Kind::Tree, &serialize_tree(&empty_tree)) + .await + .unwrap(); + let result = lookup(&store, "acc", root_oid, "nonexistent.md").await.unwrap(); + assert!(result.is_none()); + } + + #[tokio::test] + async fn test_lookup_tree_entry() { + let (_d, store) = make_store(); + let blob_oid = oid_hex(b"cccccccccccccccccccccccccccccccccccccccc"); + let mut editor = TreeEditor::empty(); + editor.upsert(&store, "acc", "resources/a.md", blob_oid).await.unwrap(); + let root_oid = editor.write(&store, "acc").await.unwrap(); + + let (found, mode) = lookup(&store, "acc", root_oid, "resources") + .await + .unwrap() + .unwrap(); + assert!(mode.is_tree()); + // Verify by loading and confirming it has 1 entry + let subtree = load_tree(&store, "acc", &found).await.unwrap(); + assert_eq!(subtree.entries.len(), 1); + assert_eq!(subtree.entries[0].filename, "a.md"); + } + + // --- Sort order --- + + #[test] + fn test_git_sort_order_preserved() { + // Git sorts trees as if their name had a trailing '/'. + // So blob "foo.c" comes before tree "foo" (which sorts as "foo/"). 
+ let oid = dummy_oid(); + let mut entries = BTreeMap::new(); + entries.insert( + "foo.c".into(), + tree::Entry { + mode: EntryKind::Blob.into(), + filename: "foo.c".into(), + oid, + }, + ); + entries.insert( + "foo".into(), + tree::Entry { + mode: EntryKind::Tree.into(), + filename: "foo".into(), + oid, + }, + ); + + let git_tree = entries_to_tree(&entries); + assert_eq!(git_tree.entries[0].filename, "foo.c"); + assert_eq!(git_tree.entries[1].filename, "foo"); + assert!(git_tree.entries[0].mode.is_blob()); + assert!(git_tree.entries[1].mode.is_tree()); + } + + // --- Upsert subtree --- + + #[tokio::test] + async fn test_upsert_subtree_root_level() { + let (_d, store) = make_store(); + let mut editor = TreeEditor::empty(); + let tree_oid = ObjectId::empty_tree(gix_hash::Kind::Sha1); + + editor.upsert_subtree(&store, "acc", "subdir", tree_oid).await.unwrap(); + + assert_eq!(editor.root.len(), 1); + let entry = editor.root.get("subdir".as_bytes().as_bstr()).unwrap(); + assert!(entry.mode.is_tree()); + assert_eq!(entry.oid, tree_oid); + + // write() should reuse the OID directly (no recursion into self.subtrees) + let root_oid = editor.write(&store, "acc").await.unwrap(); + let root = load_tree(&store, "acc", &root_oid).await.unwrap(); + assert_eq!(root.entries.len(), 1); + assert_eq!(root.entries[0].filename, "subdir"); + assert_eq!(root.entries[0].oid, tree_oid); + } + + #[tokio::test] + async fn test_upsert_subtree_nested() { + let (_d, store) = make_store(); + let mut editor = TreeEditor::empty(); + let tree_oid = ObjectId::empty_tree(gix_hash::Kind::Sha1); + + editor.upsert_subtree(&store, "acc", "a/b/c", tree_oid).await.unwrap(); + + assert_eq!(editor.root.len(), 1); + assert!(editor.root.get("a".as_bytes().as_bstr()).unwrap().mode.is_tree()); + assert!(editor.root.get("a".as_bytes().as_bstr()).unwrap().oid.is_null()); + + let a_sub = editor.subtrees.get("a".as_bytes().as_bstr()).unwrap(); + assert_eq!(a_sub.len(), 1); + 
assert!(a_sub.get("b".as_bytes().as_bstr()).unwrap().mode.is_tree()); + + let ab_sub = editor.subtrees.get("a/b".as_bytes().as_bstr()).unwrap(); + assert_eq!(ab_sub.len(), 1); + assert!(ab_sub.get("c".as_bytes().as_bstr()).unwrap().mode.is_tree()); + assert_eq!(ab_sub.get("c".as_bytes().as_bstr()).unwrap().oid, tree_oid); + + // No in-memory state for "a/b/c" — written directly. + assert!(editor.subtrees.get("a/b/c".as_bytes().as_bstr()).is_none()); + + let root_oid = editor.write(&store, "acc").await.unwrap(); + let root = load_tree(&store, "acc", &root_oid).await.unwrap(); + assert_eq!(root.entries.len(), 1); + let a_oid = root.entries[0].oid; + let a_tree = load_tree(&store, "acc", &a_oid).await.unwrap(); + assert_eq!(a_tree.entries.len(), 1); + assert_eq!(a_tree.entries[0].filename, "b"); + } + + #[tokio::test] + async fn test_upsert_subtree_clears_existing_state() { + let (_d, store) = make_store(); + let oid1 = oid_hex(b"1111111111111111111111111111111111111111"); + let oid2 = oid_hex(b"2222222222222222222222222222222222222222"); + + // Build editor with a/b/x.txt and a/b/y.txt + let mut editor = TreeEditor::empty(); + editor.upsert(&store, "acc", "a/b/x.txt", oid1).await.unwrap(); + editor.upsert(&store, "acc", "a/b/y.txt", oid2).await.unwrap(); + assert!(editor.subtrees.contains_key("a/b".as_bytes().as_bstr())); + + // Replace a/b with an empty subtree + let empty_tree = ObjectId::empty_tree(gix_hash::Kind::Sha1); + editor.upsert_subtree(&store, "acc", "a/b", empty_tree).await.unwrap(); + + // Stale "a/b" subtree should be gone + assert!(editor.subtrees.get("a/b".as_bytes().as_bstr()).is_none()); + + let root_oid = editor.write(&store, "acc").await.unwrap(); + let root = load_tree(&store, "acc", &root_oid).await.unwrap(); + let b_entry = root.entries.iter().find(|e| e.filename == "a").unwrap(); + let a_tree = load_tree(&store, "acc", &b_entry.oid).await.unwrap(); + assert_eq!(a_tree.entries.len(), 1); + assert_eq!(a_tree.entries[0].filename, "b"); + 
assert_eq!(a_tree.entries[0].oid, empty_tree); + } + + #[tokio::test] + async fn test_upsert_subtree_then_upsert_inside() { + let (_d, store) = make_store(); + let oid = oid_hex(b"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"); + + let mut editor = TreeEditor::empty(); + editor.upsert_subtree(&store, "acc", "a/b", ObjectId::empty_tree(gix_hash::Kind::Sha1)).await.unwrap(); + + // Upsert inside the spliced subtree creates new in-memory state from scratch. + editor.upsert(&store, "acc", "a/b/c.txt", oid).await.unwrap(); + + let root_oid = editor.write(&store, "acc").await.unwrap(); + let root = load_tree(&store, "acc", &root_oid).await.unwrap(); + let a_entry = root.entries.iter().find(|e| e.filename == "a").unwrap(); + let a_tree = load_tree(&store, "acc", &a_entry.oid).await.unwrap(); + let b_entry = a_tree.entries.iter().find(|e| e.filename == "b").unwrap(); + let b_tree = load_tree(&store, "acc", &b_entry.oid).await.unwrap(); + assert_eq!(b_tree.entries.len(), 1); + assert_eq!(b_tree.entries[0].filename, "c.txt"); + assert_eq!(b_tree.entries[0].oid, oid); + } + + // --- Integration --- + + #[tokio::test] + async fn test_commit_flow_round_trip() { + let (_d, store) = make_store(); + let oid_readme = oid_hex(b"1111111111111111111111111111111111111111"); + let oid_doc = oid_hex(b"2222222222222222222222222222222222222222"); + let oid_skill = oid_hex(b"3333333333333333333333333333333333333333"); + + let mut editor = TreeEditor::empty(); + editor.upsert(&store, "acc", "README.md", oid_readme).await.unwrap(); + editor.upsert(&store, "acc", "resources/docs/a.md", oid_doc).await.unwrap(); + editor.upsert(&store, "acc", "agent/skills/b.py", oid_skill).await.unwrap(); + let commit1_oid = editor.write(&store, "acc").await.unwrap(); + + let all_files = flatten(&store, "acc", commit1_oid, &None).await.unwrap(); + assert_eq!(all_files.len(), 3); + + assert_eq!( + lookup(&store, "acc", commit1_oid, "README.md").await.unwrap().unwrap().0, + oid_readme + ); + assert_eq!( + 
lookup(&store, "acc", commit1_oid, "resources/docs/a.md").await.unwrap().unwrap().0, + oid_doc + ); + assert_eq!( + lookup(&store, "acc", commit1_oid, "agent/skills/b.py").await.unwrap().unwrap().0, + oid_skill + ); + + let resources_only = flatten( + &store, + "acc", + commit1_oid, + &Some(vec!["resources".to_string()]), + ) + .await + .unwrap(); + assert_eq!(resources_only.len(), 1); + assert_eq!(resources_only[0].0, "resources/docs/a.md"); + + let mut editor2 = TreeEditor::from_tree(&store, "acc", commit1_oid).await.unwrap(); + let oid_new = oid_hex(b"4444444444444444444444444444444444444444"); + editor2.upsert(&store, "acc", "resources/docs/a.md", oid_new).await.unwrap(); + editor2.remove(&store, "acc", "agent/skills/b.py").await.unwrap(); + editor2.upsert(&store, "acc", "agent/skills/c.py", oid_new).await.unwrap(); + let commit2_oid = editor2.write(&store, "acc").await.unwrap(); + + let all_files2 = flatten(&store, "acc", commit2_oid, &None).await.unwrap(); + assert_eq!(all_files2.len(), 3); + + assert_eq!( + lookup(&store, "acc", commit2_oid, "resources/docs/a.md").await.unwrap().unwrap().0, + oid_new + ); + assert!(lookup(&store, "acc", commit2_oid, "agent/skills/b.py").await.unwrap().is_none()); + assert_eq!( + lookup(&store, "acc", commit2_oid, "agent/skills/c.py").await.unwrap().unwrap().0, + oid_new + ); + + // Original tree unchanged + assert_eq!( + lookup(&store, "acc", commit1_oid, "resources/docs/a.md").await.unwrap().unwrap().0, + oid_doc + ); + assert_eq!( + lookup(&store, "acc", commit1_oid, "agent/skills/b.py").await.unwrap().unwrap().0, + oid_skill + ); + } + + // --- Fast Path 2 --- + + /// ObjectStore wrapper recording every `get`/`put` oid, used to prove the + /// lazy-loading commit path never touches untouched subtrees. 
+    struct SpyObjectStore {
+        inner: crate::git::backends::local::LocalObjectStore,
+        gets: std::sync::Mutex<Vec<ObjectId>>, // oids passed to `get`, in call order
+        puts: std::sync::Mutex<Vec<ObjectId>>, // oids passed to `put`, in call order
+    }
+
+    impl SpyObjectStore {
+        fn new(inner: crate::git::backends::local::LocalObjectStore) -> Self {
+            Self {
+                inner,
+                gets: std::sync::Mutex::new(Vec::new()),
+                puts: std::sync::Mutex::new(Vec::new()),
+            }
+        }
+        fn reset(&self) {
+            self.gets.lock().unwrap().clear();
+            self.puts.lock().unwrap().clear();
+        }
+        fn was_read(&self, oid: &ObjectId) -> bool {
+            self.gets.lock().unwrap().iter().any(|o| o == oid)
+        }
+    }
+
+    #[async_trait::async_trait]
+    impl ObjectStore for SpyObjectStore {
+        async fn put(
+            &self,
+            account: &str,
+            oid: &ObjectId,
+            zlib_body: bytes::Bytes,
+        ) -> Result<(), crate::git::error::ObjectStoreError> {
+            self.puts.lock().unwrap().push(*oid);
+            self.inner.put(account, oid, zlib_body).await
+        }
+        async fn get(
+            &self,
+            account: &str,
+            oid: &ObjectId,
+        ) -> Result<bytes::Bytes, crate::git::error::ObjectStoreError> { // NOTE(review): type args restored to mirror `put` — confirm against the trait
+            self.gets.lock().unwrap().push(*oid);
+            self.inner.get(account, oid).await
+        }
+        async fn exists(
+            &self,
+            account: &str,
+            oid: &ObjectId,
+        ) -> Result<bool, crate::git::error::ObjectStoreError> { // NOTE(review): type args restored — confirm against the trait
+            self.inner.exists(account, oid).await
+        }
+    }
+
+    #[tokio::test]
+    async fn test_fast_path_2_untouched_subtree_not_read() {
+        let dir = tempfile::tempdir().unwrap();
+        let store = SpyObjectStore::new(
+            crate::git::backends::local::LocalObjectStore::new(dir.path()),
+        );
+        let oid_a = oid_hex(b"1111111111111111111111111111111111111111");
+        let oid_b = oid_hex(b"2222222222222222222222222222222222222222");
+
+        // Build a root with two subtrees: resources/ and agent/.
+        let mut editor = TreeEditor::empty();
+        editor.upsert(&store, "acc", "resources/a.md", oid_a).await.unwrap();
+        editor.upsert(&store, "acc", "agent/b.py", oid_b).await.unwrap();
+        let root_oid = editor.write(&store, "acc").await.unwrap();
+
+        // Record the untouched subtree's oid (agent/).
+ let root = load_tree(&store, "acc", &root_oid).await.unwrap(); + let agent_oid = root.entries.iter().find(|e| e.filename == "agent").unwrap().oid; + + // Reset spy, then edit only inside resources/. + store.reset(); + let mut editor2 = TreeEditor::from_tree(&store, "acc", root_oid).await.unwrap(); + let oid_c = oid_hex(b"3333333333333333333333333333333333333333"); + editor2.upsert(&store, "acc", "resources/c.md", oid_c).await.unwrap(); + let new_root_oid = editor2.write(&store, "acc").await.unwrap(); + + // Fast Path 2: the untouched agent/ subtree was never read... + assert!(!store.was_read(&agent_oid), "untouched subtree should not be read"); + + // ...and the new root reuses its OID as-is (not rewritten). + let new_root = load_tree(&store, "acc", &new_root_oid).await.unwrap(); + let new_agent_oid = new_root.entries.iter().find(|e| e.filename == "agent").unwrap().oid; + assert_eq!(new_agent_oid, agent_oid, "untouched subtree OID should be reused"); + } + + #[tokio::test] + async fn test_upsert_subtree_preserves_siblings() { + let (_d, store) = make_store(); + let oid_keep = oid_hex(b"1111111111111111111111111111111111111111"); + let oid_old = oid_hex(b"2222222222222222222222222222222222222222"); + let oid_new = oid_hex(b"3333333333333333333333333333333333333333"); + + // Base tree: proj/keep.txt and proj/sub/old.txt. + let mut base = TreeEditor::empty(); + base.upsert(&store, "acc", "proj/keep.txt", oid_keep).await.unwrap(); + base.upsert(&store, "acc", "proj/sub/old.txt", oid_old).await.unwrap(); + let root_oid = base.write(&store, "acc").await.unwrap(); + + // Replacement subtree containing new.txt. + let mut repl = TreeEditor::empty(); + repl.upsert(&store, "acc", "new.txt", oid_new).await.unwrap(); + let repl_oid = repl.write(&store, "acc").await.unwrap(); + + // Splice proj/sub with the replacement subtree. 
+ let mut editor = TreeEditor::from_tree(&store, "acc", root_oid).await.unwrap(); + editor.upsert_subtree(&store, "acc", "proj/sub", repl_oid).await.unwrap(); + let new_root_oid = editor.write(&store, "acc").await.unwrap(); + + // Sibling proj/keep.txt is preserved. + let keep = lookup(&store, "acc", new_root_oid, "proj/keep.txt").await.unwrap(); + assert_eq!(keep.unwrap().0, oid_keep); + + // proj/sub now points at the replacement subtree (containing new.txt). + let (sub_oid, mode) = lookup(&store, "acc", new_root_oid, "proj/sub").await.unwrap().unwrap(); + assert!(mode.is_tree()); + assert_eq!(sub_oid, repl_oid); + let new_file = lookup(&store, "acc", new_root_oid, "proj/sub/new.txt").await.unwrap(); + assert_eq!(new_file.unwrap().0, oid_new); + // Old file is gone. + assert!(lookup(&store, "acc", new_root_oid, "proj/sub/old.txt").await.unwrap().is_none()); + } +} diff --git a/crates/ragfs/src/git/types.rs b/crates/ragfs/src/git/types.rs new file mode 100644 index 0000000000..eccd554ab5 --- /dev/null +++ b/crates/ragfs/src/git/types.rs @@ -0,0 +1,198 @@ +//! DTOs for the Git service API. + +use gix_hash::ObjectId; + +#[derive(Debug, Clone)] +pub struct CommitRequest { + pub account: String, + pub branch: String, // e.g. "main" — NOT the full "refs/heads/main" + pub message: String, + /// Explicit candidate paths (account-relative, e.g. "resources/a.md"). + /// `None` means "enumerate the whole account tree". + pub paths: Option>, + pub author_name: String, + pub author_email: String, +} + +#[derive(Debug, Clone)] +pub enum CommitResponse { + Created { commit_oid: ObjectId, changed: usize }, + /// No path produced an editor change; ref untouched. `commit_oid` is the + /// existing HEAD (or `ObjectId::null` if the branch did not exist). + Noop { commit_oid: ObjectId }, +} + +/// Per-path stat cache entry. Not persisted yet (Fast Path 1 is deferred), +/// but the type lives here so later work can fill in the index. 
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct IndexEntry {
+    pub size: u64,
+    pub mtime_ns: i128,
+    pub oid: ObjectId,
+}
+
+#[derive(Debug, Clone)]
+pub struct ShowRequest {
+    pub account: String,
+    /// One of: 40-hex commit OID, short branch name ("main"),
+    /// or full ref path ("refs/heads/main", "refs/tags/v1").
+    pub target_ref: String,
+    /// If `Some(path)`, return that path's blob bytes from the commit's tree.
+    /// If `None`, return the commit's metadata.
+    /// `path` is account-relative tree path, e.g. "resources/a.md".
+    pub path: Option<String>,
+}
+
+#[derive(Debug, Clone)]
+pub enum ShowResponse {
+    Commit {
+        oid: ObjectId,
+        tree: ObjectId,
+        parents: Vec<ObjectId>,
+        author: Actor,
+        committer: Actor,
+        message: String,
+    },
+    Blob {
+        oid: ObjectId,
+        size: u64,
+        /// Zero-copy slice over the decompressed object buffer (header + payload).
+        /// Cloning is `Arc::clone` — cheap; cloning a `Vec` of the same size is not.
+        /// The few bytes of the loose-object header upstream of the payload remain
+        /// alive in the backing buffer until the last `Bytes` handle is dropped;
+        /// negligible compared to the payload itself.
+        bytes: bytes::Bytes,
+    },
+}
+
+/// Owned, Python-friendly projection of `gix_actor::SignatureRef`.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct Actor {
+    pub name: String,
+    pub email: String,
+    /// Seconds since UNIX epoch.
+    pub time_seconds: i64,
+    /// Timezone offset in seconds (e.g. +08:00 → 28800).
+    pub tz_offset_seconds: i32,
+}
+
+/// Input for `GitService::restore`.
+#[derive(Debug, Clone)]
+pub struct RestoreRequest {
+    /// Account this restore applies to.
+    pub account: String,
+    /// Branch whose HEAD is the parent of the new commit. Defaults to "main"
+    /// in callers; this DTO requires the caller to pass it explicitly to
+    /// avoid invisible defaults at this layer.
+    pub branch: String,
+    /// Optional account-relative subtree path to restore, e.g. "resources/proj_a".
+    /// `None` restores the whole account tree.
+ pub project_dir: Option, + /// What to restore from. Same resolution rules as `ShowRequest::target_ref`: + /// 40-hex commit OID / short branch name / full `refs/heads/xxx`. + pub source_commit: String, + /// If `true`, compute and return the diff but write nothing — no VFS + /// writes, no new objects in the object store, no ref update. + pub dry_run: bool, + /// Commit message for the new commit. If `None`, a default is generated: + /// `"restore {project_dir} from {source_oid_short}"`. + pub message: Option, + pub author_name: String, + pub author_email: String, +} + +/// Structured diff between two subtrees, computed by `restore`. +/// +/// All paths in this struct are **relative to `project_dir`** — they are NOT +/// prefixed. Callers (e.g. a future Python wrapper) prefix them when needed +/// for display. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct RestoreDiff { + /// Paths whose content in `source_subtree` should be written into the VFS + /// (creates or overwrites). Carries the blob oid to fetch from the + /// object store. + pub to_write: Vec<(String, ObjectId)>, + /// Paths present in `head_subtree` but absent in `source_subtree`. Must + /// be removed from the VFS. + pub to_delete: Vec, + /// Paths whose oid is identical in both subtrees. Listed only for + /// reporting; restore does not touch them. + pub unchanged: Vec, +} + +/// Output of `GitService::restore`. +#[derive(Debug, Clone)] +pub enum RestoreResponse { + /// A new commit was created and the branch ref now points at it. + Applied { + /// The new commit's OID — branch HEAD now points here. + new_commit_oid: ObjectId, + /// The source commit (after `resolve_ref`) we restored from. + source_commit: ObjectId, + /// Previous HEAD oid (parent of `new_commit_oid`). + parent_commit: ObjectId, + /// Number of files written through the VFS. + written: usize, + /// Number of files deleted through the VFS. + deleted: usize, + /// Number of files left untouched because source/head agreed. 
+ unchanged: usize, + /// Account-relative paths (prefixed with `project_dir`) that were + /// written to the VFS. Lets callers trigger downstream side effects + /// (vector index rebuild, watcher notifications) without re-walking + /// the tree. + written_paths: Vec, + /// Account-relative paths that were removed from the VFS. + deleted_paths: Vec, + }, + /// Source subtree byte-equal to head subtree — nothing to do. No new + /// commit was created; the branch ref is unchanged. + Noop { + /// Current HEAD oid (unchanged). + head: ObjectId, + /// Source commit oid (after `resolve_ref`). + source: ObjectId, + }, + /// `dry_run = true` request — returns the computed diff without + /// performing any writes. + DryRun { + /// The computed diff (paths are relative to `project_dir`). + diff: RestoreDiff, + /// Current HEAD oid (would-be parent if applied). + head: ObjectId, + /// Source commit oid (after `resolve_ref`). + source: ObjectId, + }, +} + +/// Restore reached the ref-swap step (`new_commit_oid` is now branch HEAD) +/// but at least one per-path write or delete on the VFS failed. The caller +/// must treat this as "HEAD advanced, working tree partial": +/// `written_paths` / `deleted_paths` list paths that *did* reach the VFS +/// and therefore still need reindex; `failed_writes` / `failed_deletes` +/// list the per-path failures that need follow-up. +/// +/// Boxed inside `GitError::RestoreWritebackPartial` to keep the enum size +/// bounded — the two path lists can be large on big restores. +#[derive(Debug, Clone)] +pub struct RestoreWritebackPartial { + pub new_commit_oid: ObjectId, + pub source_commit: ObjectId, + pub parent_commit: ObjectId, + /// Files that *did* reach the VFS (subset of the original plan). + pub written: usize, + /// Files that *were* deleted from the VFS (or were idempotently already gone). + pub deleted: usize, + /// Files left untouched because source/head agreed. 
+ pub unchanged: usize, + /// Account-relative paths whose blob bytes reached the VFS. + pub written_paths: Vec, + /// Account-relative paths that were removed from (or already absent from) the VFS. + pub deleted_paths: Vec, + /// `(account-relative path, error message)` for writes that failed + /// after the ref already advanced. + pub failed_writes: Vec<(String, String)>, + /// `(account-relative path, error message)` for deletes that failed + /// with a non-`NotFound` error after the ref already advanced. + pub failed_deletes: Vec<(String, String)>, +} diff --git a/crates/ragfs/src/git/util.rs b/crates/ragfs/src/git/util.rs new file mode 100644 index 0000000000..cee57cd949 --- /dev/null +++ b/crates/ragfs/src/git/util.rs @@ -0,0 +1,278 @@ +//! Utility functions for Git module + +use bytes::Bytes; +use std::io::{Read, Write}; + +use crate::git::error::RefStoreError; + +/// Validate a ref name follows Git naming conventions. +/// +/// Checks for: +/// - Empty name +/// - Contains ".." +/// - Starts or ends with "/" +/// - Contains invalid characters +pub fn validate_ref_name(ref_name: &str) -> Result<(), RefStoreError> { + if ref_name.is_empty() { + return Err(RefStoreError::InvalidName( + "ref name cannot be empty".to_string(), + )); + } + if ref_name.contains("..") { + return Err(RefStoreError::InvalidName( + "ref name cannot contain '..'".to_string(), + )); + } + if ref_name.starts_with('/') || ref_name.ends_with('/') { + return Err(RefStoreError::InvalidName( + "ref name cannot start or end with '/'".to_string(), + )); + } + if ref_name.contains(' ') + || ref_name.contains('\x00') + || ref_name.contains('~') + || ref_name.contains('^') + || ref_name.contains(':') + || ref_name.contains('?') + || ref_name.contains('[') + || ref_name.contains('*') + { + return Err(RefStoreError::InvalidName( + "ref name contains invalid characters".to_string(), + )); + } + Ok(()) +} + +/// Compress data using zlib (for Git loose object storage). 
+pub fn zlib_compress(data: &[u8]) -> Result, std::io::Error> { + let mut encoder = flate2::write::ZlibEncoder::new(Vec::new(), flate2::Compression::default()); + encoder.write_all(data)?; + encoder.finish() +} + +/// Decompress zlib-compressed data (for reading Git loose objects). +/// +/// Pre-allocates a generous output buffer to amortize the `Vec` doubling cost +/// that hurts large blobs (a 100 MiB payload otherwise triggers ~27 reallocs). +/// The hint is 4× the compressed length (a rough expansion estimate, not a +/// bound), capped at 64 MiB so a very large compressed input doesn't reserve +/// an absurd amount of memory up front. Exceeding the hint still works — `Vec` will grow. +pub fn zlib_decompress(data: &[u8]) -> Result, std::io::Error> { + const HINT_CAP: usize = 64 * 1024 * 1024; + let hint = data.len().saturating_mul(4).min(HINT_CAP); + let mut decoded = Vec::with_capacity(hint); + let mut decoder = flate2::read::ZlibDecoder::new(data); + decoder.read_to_end(&mut decoded)?; + Ok(decoded) +} + +/// Parse a Git loose object header, returning (kind, size, header_end_offset). +pub fn parse_object_header(data: &[u8]) -> Result<(gix_object::Kind, u64, usize), crate::git::error::ObjectStoreError> { + gix_object::decode::loose_header(data).map_err(|e| { + crate::git::error::ObjectStoreError::Backend(format!("invalid object header: {e}")) + }) +} + +/// Read and decompress a Git object from ObjectStore, returning the full +/// uncompressed bytes (including header). +pub async fn read_object( + store: &dyn crate::git::object_store::ObjectStore, + account: &str, + oid: &gix_hash::ObjectId, +) -> Result { + let compressed = store.get(account, oid).await?; + let decompressed = zlib_decompress(&compressed) + .map_err(|e| crate::git::error::ObjectStoreError::Zlib(e.to_string()))?; + Ok(Bytes::from(decompressed)) +} + +/// Serialize, compress, and write a Git object to ObjectStore. +/// Returns the object's ObjectId.
+pub async fn write_object( + store: &dyn crate::git::object_store::ObjectStore, + account: &str, + kind: gix_object::Kind, + data: &[u8], +) -> Result { + let header = gix_object::encode::loose_header(kind, data.len() as u64); + let oid = gix_object::compute_hash(gix_hash::Kind::Sha1, kind, data); + let mut full = Vec::with_capacity(header.len() + data.len()); + full.extend_from_slice(&header); + full.extend_from_slice(data); + let compressed = zlib_compress(&full)?; + store.put(account, &oid, Bytes::from(compressed)).await?; + Ok(oid) +} + +/// Same as [`write_object`], but runs an `exists` precheck before compressing +/// and putting (Fast Path 3). If the object is already present, the zlib +/// compression and `put` are skipped and the oid is returned directly. `put` +/// is itself idempotent, so this precheck is purely a performance optimization +/// (saves S3 body upload / local zlib compression for duplicate blobs). +pub async fn write_object_if_absent( + store: &dyn crate::git::object_store::ObjectStore, + account: &str, + kind: gix_object::Kind, + data: &[u8], +) -> Result { + let oid = gix_object::compute_hash(gix_hash::Kind::Sha1, kind, data); + if store.exists(account, &oid).await? 
{ + return Ok(oid); + } + let header = gix_object::encode::loose_header(kind, data.len() as u64); + let mut full = Vec::with_capacity(header.len() + data.len()); + full.extend_from_slice(&header); + full.extend_from_slice(data); + let compressed = zlib_compress(&full)?; + store.put(account, &oid, Bytes::from(compressed)).await?; + Ok(oid) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_validate_ref_name() { + assert!(validate_ref_name("refs/heads/main").is_ok()); + assert!(validate_ref_name("refs/tags/v1.0").is_ok()); + assert!(validate_ref_name("HEAD").is_ok()); + + assert!(validate_ref_name("").is_err()); + assert!(validate_ref_name("..").is_err()); + assert!(validate_ref_name("refs/../heads").is_err()); + assert!(validate_ref_name("/refs/heads").is_err()); + assert!(validate_ref_name("refs/heads/ ").is_err()); + assert!(validate_ref_name("refs~head").is_err()); + assert!(validate_ref_name("refs^head").is_err()); + assert!(validate_ref_name("refs:head").is_err()); + assert!(validate_ref_name("refs?head").is_err()); + assert!(validate_ref_name("refs[head]").is_err()); + assert!(validate_ref_name("refs*head").is_err()); + } + + #[test] + fn test_zlib_round_trip() { + let original = b"tree 15\0hello world!!!"; + let compressed = zlib_compress(original).unwrap(); + let decompressed = zlib_decompress(&compressed).unwrap(); + assert_eq!(decompressed, original); + } + + #[test] + fn test_parse_object_header_tree() { + let data = b"tree 15\0entries data"; + let (kind, size, offset) = parse_object_header(data).unwrap(); + assert_eq!(kind, gix_object::Kind::Tree); + assert_eq!(size, 15); + assert_eq!(offset, 8); + } + + #[test] + fn test_parse_object_header_blob() { + let data = b"blob 5\0hello"; + let (kind, size, offset) = parse_object_header(data).unwrap(); + assert_eq!(kind, gix_object::Kind::Blob); + assert_eq!(size, 5); + assert_eq!(offset, 7); + } + + #[tokio::test] + async fn test_write_read_object_round_trip() { + use tempfile::tempdir; + use 
crate::git::backends::local::LocalObjectStore; + + let temp_dir = tempdir().unwrap(); + let store = LocalObjectStore::new(temp_dir.path()); + + let data = b"hello tree bytes"; + let kind = gix_object::Kind::Blob; + + // Write the object + let oid = write_object(&store, "test-account", kind, data).await.unwrap(); + + // Read the object back + let raw = read_object(&store, "test-account", &oid).await.unwrap(); + + // Parse and validate header + let (parsed_kind, size, offset) = parse_object_header(&raw).unwrap(); + assert_eq!(parsed_kind, kind); + assert_eq!(size, data.len() as u64); + + // Validate body + assert_eq!(&raw[offset..], data); + + // Validate OID matches expected + let expected_oid = gix_object::compute_hash(gix_hash::Kind::Sha1, kind, data); + assert_eq!(oid, expected_oid); + } + + #[tokio::test] + async fn test_write_object_if_absent_skips_put_on_second_call() { + use std::sync::atomic::{AtomicUsize, Ordering}; + use std::sync::Arc; + use tempfile::tempdir; + use crate::git::backends::local::LocalObjectStore; + use crate::git::object_store::ObjectStore; + use crate::git::error::ObjectStoreError; + use gix_hash::ObjectId; + + struct CountingStore { + inner: LocalObjectStore, + puts: AtomicUsize, + exists_calls: AtomicUsize, + } + + #[async_trait::async_trait] + impl ObjectStore for CountingStore { + async fn put( + &self, + account: &str, + oid: &ObjectId, + zlib_body: Bytes, + ) -> Result<(), ObjectStoreError> { + self.puts.fetch_add(1, Ordering::SeqCst); + self.inner.put(account, oid, zlib_body).await + } + async fn get(&self, account: &str, oid: &ObjectId) -> Result { + self.inner.get(account, oid).await + } + async fn exists(&self, account: &str, oid: &ObjectId) -> Result { + self.exists_calls.fetch_add(1, Ordering::SeqCst); + self.inner.exists(account, oid).await + } + } + + let temp_dir = tempdir().unwrap(); + let store = Arc::new(CountingStore { + inner: LocalObjectStore::new(temp_dir.path()), + puts: AtomicUsize::new(0), + exists_calls: 
AtomicUsize::new(0), + }); + + let data = b"duplicate blob content"; + let kind = gix_object::Kind::Blob; + + let oid1 = write_object_if_absent(store.as_ref(), "acct", kind, data) + .await + .unwrap(); + assert_eq!(store.puts.load(Ordering::SeqCst), 1); + + // Second call with identical data: should hit exists and skip put. + let oid2 = write_object_if_absent(store.as_ref(), "acct", kind, data) + .await + .unwrap(); + assert_eq!(oid1, oid2); + assert_eq!(store.puts.load(Ordering::SeqCst), 1, "put must not be called again"); + assert_eq!(store.exists_calls.load(Ordering::SeqCst), 2); + + // Object is readable and oid matches compute_hash. + let raw = read_object(store.as_ref(), "acct", &oid1).await.unwrap(); + let (parsed_kind, _size, offset) = parse_object_header(&raw).unwrap(); + assert_eq!(parsed_kind, kind); + assert_eq!(&raw[offset..], data); + let expected = gix_object::compute_hash(gix_hash::Kind::Sha1, kind, data); + assert_eq!(oid1, expected); + } +} diff --git a/crates/ragfs/src/lib.rs b/crates/ragfs/src/lib.rs index 47106ddc6b..f4582e2981 100644 --- a/crates/ragfs/src/lib.rs +++ b/crates/ragfs/src/lib.rs @@ -34,6 +34,7 @@ pub mod cache; pub mod core; pub mod crypto; pub mod multibackend; +pub mod git; pub mod plugins; pub mod shape; diff --git a/crates/ragfs/tests/git_s3_integration.rs b/crates/ragfs/tests/git_s3_integration.rs new file mode 100644 index 0000000000..e3816a0bd2 --- /dev/null +++ b/crates/ragfs/tests/git_s3_integration.rs @@ -0,0 +1,587 @@ +//! Integration tests for the S3/TOS git storage backend (audit plan §A2). +//! +//! These tests exercise [`S3ObjectStore`] / [`S3RefStore`] against a *real* +//! S3-compatible backend (TOS / MinIO / LocalStack). They are gated behind the +//! `s3` feature **and** the presence of a usable `[git]` + `[git.s3]` section in +//! the OpenViking config file (`ov.conf`), mirroring the skip strategy used by +//! the Python `test_fs_binding_s3.py` suite. +//! +//! Config resolution (first hit wins): +//! 1. 
`OV_GIT_S3_CONF` env var (explicit path to an `ov.conf` JSON file) +//! 2. `OPENVIKING_CONFIG_FILE` env var +//! 3. `~/.openviking/ov.conf` +//! 4. `/etc/openviking/ov.conf` +//! +//! The relevant section (JSON) looks like: +//! ```json +//! { +//! "git": { +//! "enabled": true, +//! "backend": "s3", +//! "s3": { +//! "bucket": "", +//! "region": "cn-beijing", +//! "endpoint": "https://tos-s3-cn-beijing.volces.com", +//! "access_key": "", +//! "secret_key": "", +//! "prefix": ".ovgit", +//! "use_path_style": false, +//! "cas_mode": "native" +//! } +//! } +//! } +//! ``` +//! +//! When no usable config is found, each test prints a notice and returns +//! successfully (treated as skipped) so default `cargo test` runs stay green. +//! +//! Run against TOS: +//! ```bash +//! OV_GIT_S3_CONF=/path/to/ov.conf \ +//! cargo test -p ragfs --features s3 --test git_s3_integration -- --nocapture +//! ``` +//! Tests namespace every key under `{prefix}/_it/{uuid}` and use a random +//! account per test, so concurrent runs never collide and never touch real data. + +#![cfg(feature = "s3")] + +use std::path::PathBuf; +use std::sync::Arc; + +use bytes::Bytes; +use gix_hash::ObjectId; + +use ragfs::core::filesystem::FileSystem; +use ragfs::git::backends::local::LocalObjectStore; +use ragfs::git::backends::s3::{CasMode, S3Config, S3ObjectStore, S3RefStore}; +use ragfs::git::error::RefStoreError; +use ragfs::git::object_store::ObjectStore; +use ragfs::git::ref_store::RefStore; +use ragfs::git::service::GitService; +use ragfs::git::types::{ + CommitRequest, CommitResponse, RestoreRequest, RestoreResponse, ShowRequest, ShowResponse, +}; +use ragfs::git::util::zlib_compress; +use ragfs::plugins::localfs::LocalFileSystem; + +/// Resolve the `ov.conf` path using the same chain documented above. 
+fn resolve_conf_path() -> Option { + let candidates = [ + std::env::var("OV_GIT_S3_CONF").ok(), + std::env::var("OPENVIKING_CONFIG_FILE").ok(), + std::env::var("HOME") + .ok() + .map(|h| format!("{h}/.openviking/ov.conf")), + Some("/etc/openviking/ov.conf".to_string()), + ]; + candidates + .into_iter() + .flatten() + .map(PathBuf::from) + .find(|p| p.exists()) +} + +/// Load and build an [`S3Config`] from the resolved `ov.conf`, namespacing the +/// `prefix` under a unique `_it/{uuid}` segment for test isolation. +/// +/// Returns `None` (treated as "skip") when no config file exists, when git is +/// not enabled, when the backend is not `s3`, or when the `[git.s3]` section is +/// missing required fields (`bucket`/`region`). +fn load_s3_config() -> Option { + let path = resolve_conf_path()?; + let raw = std::fs::read_to_string(&path).ok()?; + let root: serde_json::Value = serde_json::from_str(&raw).ok()?; + + // `git` may live at the top level (OpenVikingConfig.git) or, defensively, + // under `storage.git`. 
+ let git = root + .get("git") + .or_else(|| root.get("storage").and_then(|s| s.get("git")))?; + + if !git.get("enabled").and_then(|v| v.as_bool()).unwrap_or(false) { + return None; + } + if git.get("backend").and_then(|v| v.as_str()) != Some("s3") { + return None; + } + + let s3 = git.get("s3")?; + let bucket = s3.get("bucket").and_then(|v| v.as_str())?.to_string(); + let region = s3.get("region").and_then(|v| v.as_str())?.to_string(); + if bucket.is_empty() || region.is_empty() { + return None; + } + + let str_opt = |key: &str| { + s3.get(key) + .and_then(|v| v.as_str()) + .filter(|s| !s.is_empty()) + .map(|s| s.to_string()) + }; + + let base_prefix = str_opt("prefix").unwrap_or_else(|| ".ovgit".to_string()); + let prefix = format!( + "{}/_it/{}", + base_prefix.trim_end_matches('/'), + uuid::Uuid::new_v4().simple() + ); + + let endpoint = str_opt("endpoint"); + let access_key_id = str_opt("access_key"); + let secret_access_key = str_opt("secret_key"); + let use_path_style = s3 + .get("use_path_style") + .and_then(|v| v.as_bool()) + .unwrap_or(true); + let cas_mode = match s3.get("cas_mode").and_then(|v| v.as_str()) { + Some("redis_lock") => CasMode::RedisLock, + _ => CasMode::Native, + }; + + Some(S3Config { + bucket, + prefix, + region, + endpoint, + access_key_id, + secret_access_key, + use_path_style, + cas_mode, + }) +} + +/// Build a random account id so concurrent tests never share namespaces. +fn random_account() -> String { + format!("acct-{}", uuid::Uuid::new_v4().simple()) +} + +/// Build a valid zlib-compressed loose object body together with its object id. 
+fn loose_object(data: &[u8]) -> (ObjectId, Bytes) { + let kind = gix_object::Kind::Blob; + let header = gix_object::encode::loose_header(kind, data.len() as u64); + let oid = gix_object::compute_hash(gix_hash::Kind::Sha1, kind, data); + let mut full = Vec::with_capacity(header.len() + data.len()); + full.extend_from_slice(&header); + full.extend_from_slice(data); + let compressed = zlib_compress(&full).expect("zlib compress"); + (oid, Bytes::from(compressed)) +} + +/// Compute a deterministic [`ObjectId`] from arbitrary bytes (used as ref values). +fn oid_of(data: &[u8]) -> ObjectId { + gix_object::compute_hash(gix_hash::Kind::Sha1, gix_object::Kind::Blob, data) +} + +/// Evaluate to the loaded [`S3Config`], or print a skip notice and `return` from the enclosing test when no S3 config is available. +macro_rules! cfg_or_skip { + ($test:expr) => {{ + match load_s3_config() { + Some(cfg) => cfg, + None => { + eprintln!( + "[skip] {}: no usable [git.s3] config (set OV_GIT_S3_CONF to an ov.conf \ + with git.enabled=true, backend=\"s3\")", + $test + ); + return; + } + } + }}; +} + +/// §A2.1 — ObjectStore round-trip + idempotency + not-found behavior. +#[tokio::test] +async fn s3_object_store_round_trip() { + let cfg = cfg_or_skip!("s3_object_store_round_trip"); + let account = random_account(); + let store = S3ObjectStore::from_config(cfg) + .await + .expect("build S3ObjectStore"); + + let (oid, body) = loose_object(b"hello viking s3 round-trip"); + + // First write succeeds. + store.put(&account, &oid, body.clone()).await.expect("put #1"); + // Second write of identical content is idempotent (must not error). + store.put(&account, &oid, body.clone()).await.expect("put #2 idempotent"); + + // get returns the exact zlib bytes we stored. + let fetched = store.get(&account, &oid).await.expect("get"); + assert_eq!(fetched, body, "stored and fetched bytes must match exactly"); + + // exists is true for a written object.
+ assert!(store.exists(&account, &oid).await.expect("exists")); + + // A never-written oid: get -> NotFound, exists -> false. + let (missing_oid, _) = loose_object(b"this object was never written to s3"); + match store.get(&account, &missing_oid).await { + Err(ragfs::git::error::ObjectStoreError::NotFound(o)) => assert_eq!(o, missing_oid), + other => panic!("expected NotFound, got {other:?}"), + } + assert!(!store.exists(&account, &missing_oid).await.expect("exists missing")); +} + +/// §A2.2 — RefStore CAS create: first create wins, second create conflicts. +#[tokio::test] +async fn s3_ref_store_cas_create() { + let cfg = cfg_or_skip!("s3_ref_store_cas_create"); + let account = random_account(); + let store = S3RefStore::from_config(cfg) + .await + .expect("build S3RefStore"); + + let ref_name = "refs/heads/main"; + let a = oid_of(b"commit-A"); + let b = oid_of(b"commit-B"); + + // create-if-absent succeeds. + store + .cas_update(&account, ref_name, None, a) + .await + .expect("first create"); + assert_eq!(store.read(&account, ref_name).await.expect("read"), a); + + // A second create-if-absent must conflict (ref already exists). + match store.cas_update(&account, ref_name, None, b).await { + Err(RefStoreError::Conflict { expected, actual }) => { + assert_eq!(expected, None); + assert_eq!(actual, Some(a)); + } + other => panic!("expected Conflict, got {other:?}"), + } + // Value must be unchanged. + assert_eq!(store.read(&account, ref_name).await.expect("read after conflict"), a); +} + +/// §A2.3 — RefStore CAS advance: expected==current moves the ref forward. 
+#[tokio::test] +async fn s3_ref_store_cas_advance() { + let cfg = cfg_or_skip!("s3_ref_store_cas_advance"); + let account = random_account(); + let store = S3RefStore::from_config(cfg) + .await + .expect("build S3RefStore"); + + let ref_name = "refs/heads/main"; + let a = oid_of(b"commit-A"); + let b = oid_of(b"commit-B"); + + store.cas_update(&account, ref_name, None, a).await.expect("create A"); + store + .cas_update(&account, ref_name, Some(a), b) + .await + .expect("advance A -> B"); + assert_eq!(store.read(&account, ref_name).await.expect("read"), b); +} + +/// §A2.4 — RefStore CAS conflict: a stale `expected` is rejected with the +/// actual current value reported. +#[tokio::test] +async fn s3_ref_store_cas_conflict() { + let cfg = cfg_or_skip!("s3_ref_store_cas_conflict"); + let account = random_account(); + let store = S3RefStore::from_config(cfg) + .await + .expect("build S3RefStore"); + + let ref_name = "refs/heads/main"; + let a = oid_of(b"commit-A"); + let stale = oid_of(b"commit-STALE"); + let c = oid_of(b"commit-C"); + + store.cas_update(&account, ref_name, None, a).await.expect("create A"); + + match store.cas_update(&account, ref_name, Some(stale), c).await { + Err(RefStoreError::Conflict { expected, actual }) => { + assert_eq!(expected, Some(stale)); + assert_eq!(actual, Some(a)); + } + other => panic!("expected Conflict, got {other:?}"), + } + assert_eq!(store.read(&account, ref_name).await.expect("read"), a); +} + +/// §A2.5 — RefStore list: all refs under a prefix are returned (exercises the +/// `list_objects_v2` pagination path). 
+#[tokio::test] +async fn s3_ref_store_list() { + let cfg = cfg_or_skip!("s3_ref_store_list"); + let account = random_account(); + let store = S3RefStore::from_config(cfg) + .await + .expect("build S3RefStore"); + + let entries = [ + ("refs/heads/main", oid_of(b"main")), + ("refs/heads/dev", oid_of(b"dev")), + ("refs/heads/release", oid_of(b"release")), + ]; + for (name, oid) in &entries { + store.cas_update(&account, name, None, *oid).await.expect("create ref"); + } + + let mut listed = store.list(&account, "refs/heads").await.expect("list"); + listed.sort(); + + let mut expected: Vec<(String, ObjectId)> = + entries.iter().map(|(n, o)| (n.to_string(), *o)).collect(); + expected.sort(); + + assert_eq!(listed, expected); +} + +/// §A2.6 — Backend equivalence: the same object is byte-identical and equally +/// visible whether stored via the local or the S3 backend. +#[tokio::test] +async fn backend_equivalence_local_vs_s3() { + let cfg = cfg_or_skip!("backend_equivalence_local_vs_s3"); + let account = random_account(); + + let s3 = S3ObjectStore::from_config(cfg) + .await + .expect("build S3ObjectStore"); + let tmp = tempfile::tempdir().expect("tempdir"); + let local = LocalObjectStore::new(tmp.path()); + + let (oid, body) = loose_object(b"backend equivalence payload \x00\x01\x02 binary-ish"); + + let s3: Arc = Arc::new(s3); + let local: Arc = Arc::new(local); + + for store in [&s3, &local] { + store.put(&account, &oid, body.clone()).await.expect("put"); + assert!(store.exists(&account, &oid).await.expect("exists")); + } + + let from_s3 = s3.get(&account, &oid).await.expect("s3 get"); + let from_local = local.get(&account, &oid).await.expect("local get"); + + assert_eq!(from_s3, body, "s3 bytes must match input"); + assert_eq!(from_local, body, "local bytes must match input"); + assert_eq!(from_s3, from_local, "s3 and local stored bytes must be identical"); +} + +// ───────────────────────────────────────────────────────────────────────── +// §A3 — End-to-end 
GitService over the S3/TOS backend. +// +// These mirror the local-backend service tests in `src/git/service.rs`, but +// wire `GitService` to `S3ObjectStore` + `S3RefStore` (loaded from `ov.conf`) +// and a real `LocalFileSystem` working tree rooted at a temp dir. The working +// tree layout matches production: each account lives under `/local/{account}` +// relative to the mount base. +// ───────────────────────────────────────────────────────────────────────── + +/// Build a `GitService` backed by S3 stores plus a fresh `LocalFileSystem` +/// working tree. Returns the service, the kept-alive temp dir, and the +/// absolute account root path (`{tmp}/local/{account}`) for direct file IO. +fn make_s3_service( + cfg: S3Config, + account: &str, +) -> impl std::future::Future + '_ { + async move { + let object_store: Arc = Arc::new( + S3ObjectStore::from_config(cfg.clone()) + .await + .expect("build S3ObjectStore"), + ); + let ref_store: Arc = Arc::new( + S3RefStore::from_config(cfg) + .await + .expect("build S3RefStore"), + ); + + let work_dir = tempfile::tempdir().expect("tempdir"); + let acct_root = work_dir.path().join("local").join(account); + std::fs::create_dir_all(&acct_root).expect("create account root"); + let vfs: Arc = + Arc::new(LocalFileSystem::new(work_dir.path().to_str().unwrap()).expect("localfs")); + + let svc = GitService::new(vfs, object_store, ref_store); + (svc, work_dir, acct_root) + } +} + +/// Build a `CommitRequest` with fixed test author info. +fn commit_req(account: &str, branch: &str, message: &str, paths: Option>) -> CommitRequest { + CommitRequest { + account: account.to_string(), + branch: branch.to_string(), + message: message.to_string(), + paths, + author_name: "tester".to_string(), + author_email: "tester@example.com".to_string(), + } +} + +/// §A3.1 — commit → show(path) returns the exact blob bytes that were written. 
+#[tokio::test] +async fn s3_e2e_commit_then_show_blob_round_trip() { + let cfg = cfg_or_skip!("s3_e2e_commit_then_show_blob_round_trip"); + let account = random_account(); + let (svc, _work_dir, acct_root) = make_s3_service(cfg, &account).await; + + // Binary-ish payload to guard against any string-vs-bytes regression. + let body: &[u8] = b"hello viking s3 e2e \x00\x01\x02\nline2\n"; + std::fs::create_dir_all(acct_root.join("resources")).expect("mkdir resources"); + std::fs::write(acct_root.join("resources/a.md"), body).expect("write a.md"); + + match svc + .commit(commit_req(&account, "main", "first", None)) + .await + .expect("commit") + { + CommitResponse::Created { .. } => {} + other => panic!("expected Created, got {other:?}"), + } + + let resp = svc + .show(ShowRequest { + account: account.clone(), + target_ref: "main".into(), + path: Some("resources/a.md".into()), + }) + .await + .expect("show"); + + match resp { + ShowResponse::Blob { bytes, size, .. } => { + assert_eq!(bytes.as_ref(), body, "blob bytes must match written content"); + assert_eq!(size, body.len() as u64); + } + other => panic!("expected Blob, got {other:?}"), + } +} + +/// §A3.2 — restore to an older commit rolls back the working tree and creates +/// a *new* commit whose parent is the current HEAD (forward-only history). +#[tokio::test] +async fn s3_e2e_restore_rolls_back_and_advances_head() { + let cfg = cfg_or_skip!("s3_e2e_restore_rolls_back_and_advances_head"); + let account = random_account(); + let (svc, _work_dir, acct_root) = make_s3_service(cfg, &account).await; + + let proj = acct_root.join("resources/proj_a"); + std::fs::create_dir_all(&proj).expect("mkdir proj_a"); + + // Source commit: a.md=v1, b.md=v1. + std::fs::write(proj.join("a.md"), b"A v1").unwrap(); + std::fs::write(proj.join("b.md"), b"B v1").unwrap(); + let source_oid = match svc + .commit(commit_req(&account, "main", "source", None)) + .await + .expect("commit source") + { + CommitResponse::Created { commit_oid, .. 
} => commit_oid, + other => panic!("expected Created, got {other:?}"), + }; + + // HEAD commit: rewrite a.md, delete b.md, add c.md. + std::fs::write(proj.join("a.md"), b"A v2").unwrap(); + std::fs::remove_file(proj.join("b.md")).unwrap(); + std::fs::write(proj.join("c.md"), b"C new").unwrap(); + let head_oid = match svc + .commit(commit_req( + &account, + "main", + "head", + Some(vec![ + "resources/proj_a/a.md".to_string(), + "resources/proj_a/b.md".to_string(), + "resources/proj_a/c.md".to_string(), + ]), + )) + .await + .expect("commit head") + { + CommitResponse::Created { commit_oid, .. } => commit_oid, + other => panic!("expected Created, got {other:?}"), + }; + + // Restore the project subtree back to the source commit. + let resp = svc + .restore(RestoreRequest { + account: account.clone(), + branch: "main".into(), + project_dir: Some("resources/proj_a".into()), + source_commit: source_oid.to_hex().to_string(), + dry_run: false, + message: Some("rewind proj_a".into()), + author_name: "tester".into(), + author_email: "tester@example.com".into(), + }) + .await + .expect("restore"); + + match resp { + RestoreResponse::Applied { + source_commit, + parent_commit, + written, + deleted, + .. + } => { + assert_eq!(source_commit, source_oid); + assert_eq!(parent_commit, head_oid, "new commit's parent MUST be HEAD, not source"); + assert_eq!(written, 2, "a.md rewrite + b.md recreate"); + assert_eq!(deleted, 1, "c.md removed"); + } + other => panic!("expected Applied, got {other:?}"), + } + + // Working tree rolled back to the source snapshot. + assert_eq!(std::fs::read(proj.join("a.md")).unwrap(), b"A v1", "a.md rolled back"); + assert_eq!(std::fs::read(proj.join("b.md")).unwrap(), b"B v1", "b.md restored"); + assert!(!proj.join("c.md").exists(), "c.md must be deleted"); + + // HEAD advanced to a brand-new commit (forward-only), parented on the old HEAD. 
+ let new_head = svc + .show(ShowRequest { + account: account.clone(), + target_ref: "main".into(), + path: None, + }) + .await + .expect("show head"); + match new_head { + ShowResponse::Commit { oid, parents, .. } => { + assert_ne!(oid, head_oid, "restore must create a new commit"); + assert_ne!(oid, source_oid, "HEAD is a new commit, not the source"); + assert_eq!(parents, vec![head_oid], "new commit parent must be prior HEAD"); + } + other => panic!("expected Commit, got {other:?}"), + } +} + +/// §A3.3 — idempotency: committing again with no working-tree change is a Noop +/// and leaves the branch ref pointing at the same commit. +#[tokio::test] +async fn s3_e2e_commit_noop_when_unchanged() { + let cfg = cfg_or_skip!("s3_e2e_commit_noop_when_unchanged"); + let account = random_account(); + let (svc, _work_dir, acct_root) = make_s3_service(cfg, &account).await; + + std::fs::create_dir_all(acct_root.join("resources")).unwrap(); + std::fs::write(acct_root.join("resources/a.md"), b"stable content").unwrap(); + + let first_oid = match svc + .commit(commit_req(&account, "main", "first", None)) + .await + .expect("commit first") + { + CommitResponse::Created { commit_oid, .. } => commit_oid, + other => panic!("expected Created, got {other:?}"), + }; + + // Second commit with identical tree state — must be a Noop pointing at the + // existing HEAD. 
+ match svc + .commit(commit_req(&account, "main", "second", None)) + .await + .expect("commit second") + { + CommitResponse::Noop { commit_oid } => { + assert_eq!(commit_oid, first_oid, "Noop must report the unchanged HEAD oid"); + } + other => panic!("expected Noop, got {other:?}"), + } +} diff --git a/docs/design/git-version-control-design.md b/docs/design/git-version-control-design.md new file mode 100644 index 0000000000..041a066ed8 --- /dev/null +++ b/docs/design/git-version-control-design.md @@ -0,0 +1,1257 @@ +# OpenViking 多版本管理技术方案 — 基于 Gitoxide 的 in-process Git 集成 + +> 💡 **一句话摘要**:在现有 OpenViking 的 RAGFS Rust 实现中嵌入一套基于 `gitoxide` 的 in-process Git 服务,以 **账号(account\_id)粒度** 提供 `commit / restore / show` 三个版本管理原语;通过 PyO3 binding 直接被 `VikingFS` Python 层调用,全程零 HTTP、零额外进程,Git 对象/Ref 后端复用现有 `localfs`/`s3fs` 客户端,实现"本地或远程"对称配置。 + +# 1. 背景与目标 + +## 1.1 业务背景 + +OpenViking 现有存储架构是一套以 `viking://` URI 为入口的双层抽象:上层 `VikingFS`(Python)负责 URI 规范化、L0/L1 摘要、向量同步、租户隔离;下层 RAGFS(Rust + PyO3 binding)提供 `FileSystem` trait 与 `MountableFS` radix-trie 路由,实际数据落到 `localfs`、`s3fs`、`memfs` 等插件后端。 + +在持续运行过程中,用户/Agent 对 `viking://resources/`、`viking://agent/skills/` 等命名空间的写入是连续且不可逆的——出错后无法回滚,跨多个文件的"逻辑事务"难以原子化捕获,实验性改动需要手动备份。这些场景的本质需求都是一套**面向账号的多版本快照机制**,语义与 Git 的 commit/restore/show 高度同构。 + +## 1.2 设计目标 + +- **显式版本化**:用户/Agent 通过 API 显式触发 commit/restore/show,不引入隐式 hook,避免影响现有写链路的延迟与一致性语义 +- **账号粒度仓库**:每个 `account_id` 一个逻辑 Git 仓库,跨 scope (resources/agent/user/session) 共享同一棵 root tree,支持跨 scope 的原子快照 +- **多后端对称**:Git objects / refs 的实际存储类型与 resources 目录一致,可在配置中切换本地(local)或远程(s3),运维心智零增量 +- **零进程膨胀**:Git 服务以 in-process binding 形式嵌入现有 RAGFS,共享 Tokio runtime 与配置加载链路,不引入新 HTTP server +- **对现有代码侵入最小**:不修改 `content_write.py`、`viking_fs.write/rm/mv` 等核心写链路,仅在 `VikingFS` 上增加 3 个新方法 +- **定向恢复 (restore)**:支持以 **(project\_dir, commit\_id)** 为输入,将指定 project 目录恢复到目标 commit 的快照状态,并以 HEAD 为父节点*正向生成一个新 commit*。非目标 project 目录保持当前最新状态不动。 + +## 1.3 非目标 (Out of Scope) + +- 不实现自动 commit hook (首版纯主动 API 触发) +- 
不实现分支 merge / rebase / cherry-pick / push/pull (首版只覆盖快照 + 回滚 + 查看) +- 不暴露 Git 数据到 `viking://` 用户命名空间 (避免被用户误删/误改) +- 不支持向量索引数据的版本化 (向量索引由 watcher 异步重建, restore 后需触发重建;L0/L1 派生文件已纳入版本管理) +- 不支持 ref 回退式 checkout:本方案不提供 "把 main / HEAD 指针直接移动到旧 commit" 的能力。所有恢复操作都通过正向新增 commit 实现,保证 HEAD 单调前进、commit 链完整可审计。如需查看旧版本,使用 **show** 接口的只读路径。 + +*** + +# 2. 核心设计决策 + +| 决策 | 设计含义 | 替代方案被淘汰的原因 | +| ------------------------------ | --------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------ | +| **单 Repo per account\_id** | 同一账号下的 `resources/`、`agent/`、`user/`、`session/` 全部在一棵 root tree 之下;一次 commit 可覆盖任意 scope 的子集 | per-resource repo 会产生 N×账号数量的索引数据,跨 resource 的"事务性快照"需要协调多 repo,复杂度高 | +| **纯 API 触发,不接 hook** | `content_write.py` / `viking_fs.write/rm/mv` 完全不动;Git 仅通过 `VikingFS.commit/restore/show` 三个新方法被显式调用 | hook 模式会让每次小写入都触发 Git 写入,放大延迟、放大冲突窗口、放大 ref CAS 失败率;首版优先简单 | +| **Git 存储后端与 resources 同构** | 定义 `ObjectStore` / `RefStore` trait,提供 local 与 s3 两种实现,直接复用 `plugins::localfs::LocalFileSystem` 和 `plugins::s3fs::S3Client` | 独立实现 Git 存储后端会重复造轮子;走 `MountableFS` 又会让 Git 数据进入用户命名空间 | +| **嵌入为 crates/ragfs 子模块** | 新增 `crates/ragfs/src/git/` 模块,与 `core/`、`plugins/`、`server/` 平级;PyO3 binding 在 `RAGFSBindingClient` 上加 3 个方法 | 独立 crate 会引入额外配置、额外 runtime、额外鉴权;`ServicePlugin` 又无法表达 commit 这种非文件操作的语义 | +| **暴露方式 = PyO3 binding,非 HTTP** | 三个新方法挂在现有 `RAGFSBindingClient` 上,通过 `AsyncAGFSClient.run` 由 `VikingFS` 调用,与 `ls/read/write` 一致 | HTTP server 路径在 OpenViking 当前架构中已是 legacy,生产路径是 in-process binding | + +*** + +# 3. 
整体架构 + +## 3.1 分层与依赖关系 + +```mermaid +flowchart TB + subgraph Py[Python 层] + VFS["VikingFS +commit / checkout / show"] + Async["AsyncAGFSClient +(asyncio.to_thread)"] + end + + subgraph BindCrate["crates/ragfs-python (PyO3 cdylib)"] + RBC["RAGFSBindingClient ++ git_commit / git_checkout / git_show"] + end + + subgraph CoreCrate[crates/ragfs] + MFS["core/MountableFS +(现有 VFS 路由)"] + Plugins["plugins/ +localfs · s3fs · memfs ..."] + GitMod["git/ (新增) +GitService · ObjectStore · RefStore"] + end + + subgraph Backends[底层存储后端] + Local["LocalFileSystem +(本地磁盘)"] + S3["S3Client +(TOS / S3 / OSS)"] + end + + VFS --> Async + Async --> RBC + RBC --> MFS + RBC --> GitMod + GitMod -- "checkout 写回阶段" --> MFS + MFS --> Plugins + Plugins --> Local + Plugins --> S3 + GitMod -- "直接持有 struct +(不经过 MountableFS)" --> Local + GitMod -- "直接持有 struct +(不经过 MountableFS)" --> S3 +``` + +## 3.2 数据流(三个核心命令) + +```mermaid +sequenceDiagram + participant U as User/Agent + participant V as VikingFS (Python) + participant B as RAGFSBindingClient + participant G as GitService + participant M as MountableFS + participant O as ObjectStore + participant R as RefStore + + Note over U,R: --- commit --- + U->>V: commit(message, paths) + V->>B: git_commit(account, branch, ...) 
+ B->>G: GitService::commit(req) + G->>M: tree_directory / read (枚举+读 blob) + M-->>G: 文件内容流 + G->>O: put(blob_oid, zlib) + G->>O: put(tree_oid, zlib) + G->>O: put(commit_oid, zlib) + G->>R: cas_update(refs/heads/main, parent, new) + R-->>G: ok / conflict + G-->>B: CommitResponse + B-->>V: dict + V-->>U: commit_oid + + Note over U,R: --- show (只读) --- + U->>V: show(ref, path) + V->>B: git_show(account, ref, path) + B->>G: GitService::show + G->>R: read(refs/heads/main) + G->>O: get(commit_oid) → tree_oid + G->>O: get(tree_oid) → blob_oid + G->>O: get(blob_oid) + G-->>B: blob bytes + B-->>V: bytes + V-->>U: bytes + + Note over U,R: --- restore (写回 VFS + 正向新 commit) --- + U->>V: restore(project_dir, source_commit) + V->>B: git_restore(account, branch, project_dir, source_commit, dry_run) + B->>G: GitService::restore + G->>R: read(refs/heads/main) → head_oid + G->>O: 加载 source / HEAD commit → 截取 project_dir 子树 → 递归列 (path, blob_oid) 并 diff + G->>M: write / remove (走完整 VFS 语义) + G->>O: put(tree_oid / commit_oid, parent = HEAD) + G->>R: cas_update(refs/heads/main, head_oid, new) + R-->>G: ok / conflict + G-->>B: 差异报告 + B-->>V: dict + V-->>U: written / deleted / unchanged +``` + +## 3.3 关键设计原则 + +> 💡 **Git 数据不进 viking 命名空间** +> +> Git 模块的存储后端直接访问底层存储(本地 `tokio::fs` / S3 SDK client),**不**通过 `MountableFS` 路由。Git 数据存到 `git/{account}/objects/...`,用户在 `viking://` 下看不到、也改不到。 + +> 💡 **LocalObjectStore 和 S3ObjectStore 直接调用 `tokio::fs` 和 `Arc<aws_sdk_s3::Client>`,不复用 LocalFileSystem/S3Client** +> +> LocalFileSystem/S3Client 是面向"用户文件树"的抽象,而 Git 后端是面向"内容寻址对象库"的存储,两者的语义需求不重叠。强行复用会导致更复杂的胶水代码。 + +*** + +# 4. 
Repo 边界与 Tree 布局 + +## 4.1 Tree 镜像 VikingFS 命名空间 + +由于 `viking_fs._uri_to_path` 已经定义了 `viking://X → /local/{account_id}/X` 的映射规则,我们让 Git 的 root tree 完全镜像 `/local/{account_id}/` 下的子目录结构。这样 tree path 与 viking URI 后缀一一对应,语义直观、无歧义。 + +## 4.2 路径剪枝(自动排除) + +剪枝规则集中实现在 `crates/ragfs/src/git/enumerate.rs::prune_path`,在 `commit` 入口对 `paths=Some(...)` 与 `paths=None`(全量枚举)两条路径都生效: + +| 类别 | 规则 | 理由 | +| ------------- | ------------------------------------------------------- | --------------------------------------------------------------- | +| 内部 scope / 目录 | 第一段命中 `_system` / `tasks` / `temp` / `queue` / `upload` | 与 `VikingFS._INTERNAL_NAMES` / `INTERNAL_SCOPES` 一致,均为运行时锁/系统状态 | +| 运行时锁文件 | 任意段以 `.path.ovlock` 开头 | VFS 内部锁,不应纳入版本 | +| 向量缓存目录 | 任意非叶段等于 `embedding_cache` | embedding 缓存为派生数据 | +| 向量索引文件 | 叶子文件以 `.faiss` 或 `.index` 结尾 | 纯计算产物,体积大且可重建 | + +L0/L1 派生文件(`.abstract.md`、`.overview.md`、`.relations.json`)未命中任一剪枝规则,**会**纳入主线 commit。restore 时随源文件一起回滚,无需重新生成;Python 层在 `restore` 完成后按 `(written_paths, deleted_paths)` 精确触发 L0/L1/DETAIL 向量异步重建(`.relations.json` 不触发向量任务)。 + +## 4.3 单库多命名空间的优势 + +1. **原子跨 scope 快照**:一次 commit 可同时覆盖 `resources/docs` 和 `agent/skills`,对应"Agent 一次任务的所有产出"这种逻辑事务 +2. **定向回滚**:restore 时可指定 `paths=["resources/docs/auth.md"]`,只回滚单个文件 +3. **索引数据线性**:objects/refs 数量随账号线性,不随 resource 数量指数膨胀 +4. **权限边界清晰**:account\_id 已经是天然的隔离单位,Git 仓库边界与现有权限模型完全对齐 + +*** + +# 5. 
物理布局 + +## 5.1 Crate 目录结构 + +Git 模块作为 `crates/ragfs` 的子模块,与 `core/`、`plugins/`、`server/` 平级。新增文件全部位于 `crates/ragfs/src/git/` 下,Python binding 仅在 `crates/ragfs-python/src/lib.rs` 上追加方法,无新 crate。 + +``` +crates/ragfs/src/ +├── core/ # 既有(不动) +├── plugins/ # 既有(不动) +├── server/ # 既有(不动) +└── git/ # 新增 + ├── mod.rs # 模块入口 + 重导出 + ├── service.rs # GitService(commit/restore/show 主流程,均在此文件) + ├── object_store.rs # ObjectStore trait + ├── ref_store.rs # RefStore trait + ├── tree_builder.rs # TreeEditor + flatten/lookup 工具 + ├── commit.rs # write_commit / Actor / 时间戳 + ├── enumerate.rs # 从 MountableFS 枚举 + prune_path 剪枝 + ├── util.rs # zlib 压缩/解压、ref 名校验、loose object 读写 + ├── types.rs # 请求/响应 DTO + ├── error.rs # GitError / ObjectStoreError / RefStoreError(thiserror) + ├── config.rs # GitConfig(serde) + └── backends/ + ├── mod.rs + ├── local.rs # LocalObjectStore / LocalRefStore(直接使用 tokio::fs) + └── s3.rs # S3ObjectStore / S3RefStore(直接使用 aws_sdk_s3 + If-Match) + +crates/ragfs-python/src/ +└── lib.rs # 追加 git_commit / git_restore / git_show 方法 + +openviking/openviking/storage/ +└── viking_fs.py # 追加 commit / restore / show / log + URI↔tree-path 工具 +``` + +## 5.2 依赖增量 + +仅引入 gitoxide 中实现 commit/restore/show MVP 所需的最小子 crate 集合,通过 `crates/ragfs/Cargo.toml` 增量声明: + +```toml +[dependencies] +# === Git (gitoxide) === +gix-hash = "0.14" # ObjectId / Hash 抽象 +gix-object = "0.42" # Blob/Tree/Commit 编解码 + tree::Editor +gix-actor = "0.31" # 作者/提交者签名(name ts tz) +gix-date = "0.8" # 时间戳格式化 + +# === Zlib 压缩 === +flate2 = "1" # loose object zlib 编解码 + +# === S3 后端 === +aws-sdk-s3 = ... # S3 API client(直接依赖,不复用 plugins/s3fs 内部封装) +aws-config = ... 
+ +[dev-dependencies] +tempfile = "3" +``` + +> 💡 **说明:** 不引入 `gitoxide` 顶层 crate,只挑选 commit/restore/show MVP 必需的子 crate;不引入 `gix-pack`(MVP 只用 loose object 格式)、不引入 `gix-protocol`(无 push/pull 需求)、不引入 `gix-worktree`(restore 通过 VFS 完成)。 +> +> - 实际实现使用 `flate2` 直接做 zlib 编解码,而非 `gix-features`,以减少 gitoxide 依赖面。 +> - ref 名校验由 `crates/ragfs/src/git/util.rs` 中自实现的 `validate_ref_name` 完成,未引入 `gix-validate`。 +> - 并发模型测试(`loom`)与 fuzz 测试(`proptest`)在 MVP 阶段未引入,以单测 + 集成测试覆盖。 + +*** + +# 6. 核心 Trait 设计 + +## 6.1 ObjectStore + +`ObjectStore` 是 Git 内容寻址存储的抽象,提供 blob/tree/commit 三类对象的存取。所有写入按 SHA-1 内容寻址,天然幂等(同样的字节 → 同样的 oid)。trait 必须 `Send + Sync + 'static`,以便在 Tokio 多线程运行时中跨任务共享。 + +```rust +// crates/ragfs/src/git/object_store.rs +use async_trait::async_trait; +use bytes::Bytes; +use gix_hash::ObjectId; + +/// 内容寻址的 Git 对象存储抽象 +/// put 必须幂等;get 不存在返回 NotFound;exists 不读取内容 +#[async_trait] +pub trait ObjectStore: Send + Sync + 'static { + /// 写入一个已 zlib 压缩的 loose object + /// oid 必须等于 SHA-1(未压缩 header + payload) + async fn put( + &self, + account: &str, + oid: &ObjectId, + zlib_body: Bytes, + ) -> Result<(), ObjectStoreError>; + + /// 读取并 zlib 解压(返回 header + payload 的原始字节) + async fn get( + &self, + account: &str, + oid: &ObjectId, + ) -> Result<Bytes, ObjectStoreError>; + + /// 仅检查存在性(HEAD/stat 优化,跳过内容传输) + async fn exists( + &self, + account: &str, + oid: &ObjectId, + ) -> Result<bool, ObjectStoreError>; +} + +#[derive(Debug, thiserror::Error)] +pub enum ObjectStoreError { + #[error("object not found: {0}")] + NotFound(ObjectId), + #[error("backend io: {0}")] + Io(#[from] std::io::Error), + #[error("zlib decode: {0}")] + Zlib(String), + #[error("oid mismatch: expected {expected}, got {actual}")] + OidMismatch { expected: ObjectId, actual: ObjectId }, + #[error("backend error: {0}")] + Backend(String), +} +``` + +> ℹ️ **说明:** 物理路径布局由各实现自行决定(local 走 fanout 目录,s3 走 key prefix),trait 层不暴露物理路径,只暴露逻辑寻址。 + +## 6.2 RefStore + +`RefStore` 是分支/标签的命名引用存储,核心是 **CAS(Compare-And-Swap)** 更新原语 — 这是 Git 一致性的基石。CAS 保证"两个并发 commit 先到先得,后到的看到 
`Conflict` 并需要重试或 rebase",避免静默覆盖。 + +```rust +// crates/ragfs/src/git/ref_store.rs +use async_trait::async_trait; +use gix_hash::ObjectId; + +#[async_trait] +pub trait RefStore: Send + Sync + 'static { + /// 读取 ref 的当前值;不存在返回 NotFound + async fn read( + &self, + account: &str, + ref_name: &str, + ) -> Result<ObjectId, RefStoreError>; + + /// Compare-And-Swap 更新:仅当当前值 == expected 时才写入 new + /// expected = None 表示"仅当 ref 不存在时创建" + async fn cas_update( + &self, + account: &str, + ref_name: &str, + expected: Option<ObjectId>, + new: ObjectId, + ) -> Result<(), RefStoreError>; + + /// 列出 account 下的所有 refs(用于 log / branch 列表) + async fn list( + &self, + account: &str, + prefix: &str, + ) -> Result<Vec<(String, ObjectId)>, RefStoreError>; +} + +#[derive(Debug, thiserror::Error)] +pub enum RefStoreError { + #[error("ref not found: {0}")] + NotFound(String), + #[error("CAS conflict: expected {expected:?}, actual {actual:?}")] + Conflict { + expected: Option<ObjectId>, + actual: Option<ObjectId>, + }, + #[error("invalid ref name: {0}")] + InvalidName(String), + #[error("backend io: {0}")] + Io(#[from] std::io::Error), + #[error("backend: {0}")] + Backend(String), +} +``` + +> ⚠️ **注意:** ref 名必须经 `crate::git::util::validate_ref_name(...)` 校验,拒绝 `..`、空字符、特殊保留字等,避免路径穿越和注入(实现位于 `git/util.rs`,未引入 `gix-validate`)。 + +## 6.3 命名约定 + +| 类别 | 路径模板 | 说明 | +| ----------- | --------------------------------------- | -------------------------------------------------------- | +| Object | `{root}/{account}/objects/{aa}/{bb...}` | Git 标准 fanout(前 2 hex 为目录,后 38 hex 为文件名),便于分布式存储 list 优化 | +| Ref (heads) | `{root}/{account}/refs/heads/{branch}` | 文件内容 = 40 hex 字符 + `\n` | +| HEAD | `{root}/{account}/HEAD` | 内容 = `ref: refs/heads/main\n` | +| Packed-refs | (不实现) | MVP 全部 loose,后续如 ref 数量爆炸再补 pack | + +*** + +# 7. 
后端实现 + +## 7.1 LocalObjectStore / LocalRefStore + +**LocalObjectStore** 直接调用 `tokio::fs`(不经 MountableFS、也不复用 `LocalFileSystem`),把 Git 对象写入本地磁盘的 `{base_dir}/{account}/objects/{aa}/{bb...}`。**LocalRefStore** 使用进程内的 `DashMap<(account, ref_name), Arc>>` 串行化同 ref 的 CAS,叠加 `tempfile + rename(2)` 的原子重命名,覆盖同进程并发场景。 + +> **当前实现限制:** MVP 仅做了进程内 Mutex,**未叠加** `flock` 跨进程锁。生产部署若存在同 host 多进程同时写同一账号的场景,需要在后续版本补 `flock`。 + +```rust +// crates/ragfs/src/git/backends/local.rs (节选) +pub struct LocalObjectStore { + base_dir: PathBuf, // e.g. /data/openviking/git +} + +#[async_trait] +impl ObjectStore for LocalObjectStore { + async fn put(&self, account: &str, oid: &ObjectId, body: Bytes) -> Result<()> { + let hex = oid.to_hex().to_string(); + let path = self.base_dir + .join(account).join("objects") + .join(&hex[..2]).join(&hex[2..]); + // 内容寻址 → 已存在则跳过(幂等) + if tokio::fs::try_exists(&path).await? { return Ok(()); } + tokio::fs::create_dir_all(path.parent().unwrap()).await?; + // 写临时文件 + rename 保证原子性 + let tmp = path.with_extension("tmp"); + tokio::fs::write(&tmp, &body).await?; + tokio::fs::rename(&tmp, &path).await?; + Ok(()) + } + // get / exists 略 +} + +pub struct LocalRefStore { + base_dir: PathBuf, + // 进程内串行化 CAS,key = (account, ref_name) + locks: dashmap::DashMap<(String, String), Arc>>, +} + +#[async_trait] +impl RefStore for LocalRefStore { + async fn cas_update( + &self, + account: &str, + name: &str, + expected: Option, + new: ObjectId, + ) -> Result<()> { + validate_ref_name(name)?; // util.rs 自实现 + let lock = self.locks + .entry((account.into(), name.into())) + .or_default().clone(); + let _guard = lock.lock().await; + let path = self.ref_path(account, name); + let actual = read_ref_opt(&path).await?; + if actual != expected { + return Err(RefStoreError::Conflict { expected, actual }); + } + let tmp = path.with_extension("tmp"); + tokio::fs::write(&tmp, format!("{}\n", new.to_hex())).await?; + // rename 保证 crash-consistency + tokio::fs::rename(&tmp, &path).await?; + 
Ok(()) + } +} +``` + +## 7.2 S3ObjectStore / S3RefStore + +**S3ObjectStore** 直接持有一个 `Arc`(MVP 不复用 `plugins::s3fs::S3Client`,以解耦 git 模块与 plugin 体系),将 object 存为 `{prefix}/{account}/objects/{aa}/{bb...}`。由于内容寻址,`put` 用 `If-None-Match: *` 头实现幂等"仅首次写入"。**S3RefStore** 用 `If-Match: "{etag}"` 实现 CAS,先 `GET` 拿当前值与 ETag,再用 `PUT` 条件写。 + +> **CAS 模式:** `CasMode::Native` 已实现并默认启用;`CasMode::RedisLock` 仅作为枚举占位,**实际尚未实现**,调用会直接返回 `RefStoreError::Backend("RedisLock CAS mode not yet implemented")`。 + +```rust +// crates/ragfs/src/git/backends/s3.rs (节选) +pub struct S3RefStore { + client: Arc, + bucket: String, + prefix: String, + cas_mode: CasMode, // Native | RedisLock(占位,未实现) +} + +#[async_trait] +impl RefStore for S3RefStore { + async fn cas_update( + &self, + account: &str, + name: &str, + expected: Option, + new: ObjectId, + ) -> Result<()> { + validate_ref_name(name)?; + match self.cas_mode { + CasMode::Native => { + // 1. GET 当前 body 与 ETag + let current = self.read_ref_opt(account, name).await?; + let (current_oid, current_etag) = match current { + Some((oid, etag)) => (Some(oid), etag), + None => (None, None), + }; + if current_oid != expected { + return Err(RefStoreError::Conflict { + expected, actual: current_oid, + }); + } + // 2. 
条件 PUT + let body = format!("{}\n", new.to_hex()); + let put = self.client.put_object() + .bucket(&self.bucket).key(&self.ref_key(account, name)) + .body(body.into_bytes().into()); + let put = match (current_etag, expected) { + (Some(etag), Some(_)) => put.if_match(etag), + (None, None) => put.if_none_match("*"), + _ => return Err(RefStoreError::Conflict { + expected, actual: current_oid, + }), + }; + // 412 → Conflict;其他 → Backend + map_precondition_failed(put.send().await, expected, current_oid) + } + CasMode::RedisLock => Err(RefStoreError::Backend( + "RedisLock CAS mode not yet implemented".into(), + )), + } + } +} +``` + +> ⚠️ **S3 CAS 兼容性提示:** AWS S3 自 2024 年起支持 `If-Match` / `If-None-Match` 条件写;TOS / OSS 实现情况需在选型时验证。若某后端不支持原生 CAS,需退化为"分布式锁 + GET-then-PUT"模式;`RedisLock` 模式已在配置/枚举中预留,但实现待补。 + +*** + +# 8. GitService 主流程 + +## 8.1 commit 完整实现 + +commit 主流程:**枚举 → 读 blob → 构建 tree → 构建 commit → CAS 更新 ref**。所有 ObjectStore 写入按账号粒度幂等(同 oid 多次 put 安全),tree 写入由 `TreeEditor` 自底向上完成。tree 未变 → 不创建空 commit(no-op 优化)。绝大多数 commit 场景下,被调用方声明为 "改动" 的文件里仍有大量未真正修改,需要通过三级 fast path 层层过滤,保证只有真正变化的字节才进入 streaming hash 与 blob 写入。 + +| 层级 | 触发条件 | 节省的开销 | 实现位置 | +| ------------------------------ | --------------------------------------- | -------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| **Fast Path 1**: Stat 索引复用 oid | 文件 (size, mtime\_ns) 与 prev\_index 完全一致 | 跳过 `vfs.read` + sha1 hash | **已实现**(`IndexStore` trait + `LocalIndexStore`/`S3IndexStore`,`CommitIndex` 与 `parent_oid` 绑定,索引 miss/decode 错误/parent 不匹配 → 静默回退 slow path;通过 `git.tuning.commit_index_enabled` 关闭) | +| **Fast Path 2**: Tree 子树原样保留 | 子树下所有路径都没 upsert/remove | 跳过子树重 hash + 新 tree object 写入 | **已实现**(`TreeEditor::from_tree` 惰性加载 root,未被 upsert/remove/upsert\_subtree 触及的子树连读取+zlib 解压都省掉,由 `write_subtree` 的 `None` 分支原样复用其 OID) | +| **Fast Path 
3**: Blob CAS 去重 | 算出的 oid 在 object\_store 已存在 | 跳过 zlib 压缩 + put\_blob (本地写盘 / S3 PUT) | **已实现**(slow path 写 blob 前在 service 层调用 `object_store.exists` 预检,命中则跳过 zlib 压缩与 `put`;范围严格限定 blob,tree/commit 不预检;通过 `git.tuning.blob_exists_precheck_enabled` 关闭) | + +```rust +// crates/ragfs/src/git/service.rs ::commit (节选) +pub async fn commit(&self, req: CommitRequest) -> Result { + let CommitRequest { + account, branch, message, paths, author_name, author_email, + } = req; + + // 1. 解析当前 HEAD,加载 prev_tree(若 ref 不存在则空 tree) + let prev_head = self.resolve_ref(&account, &branch).await.ok(); + let prev_tree = match prev_head { + Some(oid) => self.load_commit(&account, &oid).await?.tree, + None => empty_tree_oid(), + }; + let mut editor = TreeEditor::from_tree( + &self.object_store, &account, prev_tree, + ).await?; + let mut changed = 0usize; + + // 2. 候选路径:paths=Some → 经 prune_path 过滤后的清单;paths=None → enumerate::collect_all 全量 + let candidates = match &paths { + Some(ps) => ps.iter().filter_map(prune_path).collect(), + None => enumerate::collect_all(&self.vfs, &account).await?, + }; + + for path in candidates { + match self.vfs.stat(&account_path(&account, &path)).await { + Ok(_) => { + // 读全量 + streaming hash + 写 blob(无 Fast Path 1/3) + let bytes = self.vfs.read(&account_path(&account, &path)).await?; + let oid = sha1_blob_streaming(&bytes); + self.write_object(&account, &oid, &bytes).await?; // 幂等 + editor.upsert(&path, oid)?; + changed += 1; + } + Err(e) if is_not_found(&e) => { + // 文件被删 → 从 tree 中移除 + editor.remove(&path)?; + changed += 1; + } + Err(e) => return Err(e.into()), + } + } + + // 3. 无任何变化 → noop + if changed == 0 { + return Ok(CommitResponse::Noop { + commit_oid: prev_head.unwrap_or_default(), + }); + } + + // 4. 
写 tree + commit + let new_tree = editor.write(&self.object_store, &account).await?; + let commit_oid = write_commit(&self.object_store, &account, CommitObject { + tree: new_tree, + parents: prev_head.into_iter().collect(), + author: Actor::now(&author_name, &author_email), + committer: Actor::now(&author_name, &author_email), + message: message.into(), + }).await?; + + // 5. CAS 更新 ref;失败 → ConcurrentCommit 直接上抛 + // 注意:当前实现中 commit() 内部不做 retry,由调用方决定如何处理冲突。 + self.ref_store.cas_update( + &account, &format!("refs/heads/{}", branch), + prev_head, commit_oid, + ).await?; + + Ok(CommitResponse::Created { commit_oid, changed }) +} +``` + +> **关于 retry:** 当前实现中 `commit()` 内部 **不包含 CAS 重试循环**(代码中明确注释 `// There is intentionally no retry loop inside commit().`)。冲突直接以 `ConcurrentCommit` 上抛,由 Python 层或上游业务决定重试策略;这与 §11.3 旧版描述的"内部最多重试 3 次"不一致,以本节为准。 + +## 8.2 restore 完整实现 + +restore 主流程:**解析目标 commit → 提取该 commit 中 project\_dir 子树 → 与当前 HEAD 中同路径子树 diff → 通过 MountableFS.write/rm 回写 → 删除回写后空目录 → 以当前 HEAD 为 parent 生成新 commit → CAS 更新 ref → 把受影响路径返回给调用方**。`dry_run` 模式只计算差异不写,用于预检。 + +向量索引重建在 service 层**不直接触发**,而是通过把 `written_paths` / `deleted_paths` 放到 `RestoreResponse::Applied` 中返回,Python 层(`VikingFS.restore`)再调度 `ReindexExecutor`。 + +关键差异:与 git checkout 不同,本接口**不移动分支指针到旧 commit**,而是把"旧内容"作为新 commit 的工作树内容,正向写入。新 commit 的 parent 是当前 HEAD,不是目标 commit,这保证了:(1) HEAD 单调前进;(2) 非 `project_dir` 路径自动保留 HEAD 的最新内容,无需特殊处理;(3) restore 本身可以被再次 restore (因为它就是一个普通 commit)。 + +```rust +pub async fn restore(&self, req: RestoreRequest) -> Result { + let RestoreRequest { + account, branch, project_dir, source_commit, dry_run, message, + author_name, author_email, + } = req; + + // 0. 校验 project_dir(非空、不含 ..、不命中 prune 规则) + validate_project_dir(&project_dir)?; + + // 1. 
解析两端 commit + let source_oid = self.resolve_ref(&account, &source_commit).await?; + let source = self.load_commit(&account, &source_oid).await?; + let head_oid = self.resolve_ref(&account, &branch).await?; // 必须已有 HEAD + let head = self.load_commit(&account, &head_oid).await?; + + // 2. 在两棵 tree 中分别"截取" project_dir 子树 + // source 没有该子目录 → 视为空树(等价于把整个目录删掉) + let source_subtree = tree_builder::subtree(...).await? + .unwrap_or(empty_tree_oid()); + let head_subtree = tree_builder::subtree(...).await? + .unwrap_or(empty_tree_oid()); + + // 3. 子树之间 diff,得到三类操作(只限 project_dir 范围内) + let target_entries = flatten(&self.object_store, &account, source_subtree).await?; + let current_entries = flatten(&self.object_store, &account, head_subtree).await?; + let diff = compute_subtree_diff(&target_entries, &current_entries); + + if dry_run { + return Ok(RestoreResponse::DryRun { diff, source_oid, head_oid }); + } + if diff.is_empty() { + return Ok(RestoreResponse::Noop { head: head_oid, source: source_oid }); + } + + // 4. 并发回写 VFS:路径要带上 project_dir 前缀 + // 走完整 viking_fs.write/rm,触发现有 lock、加密 + let prefixed = |p: &str| format!("{}/{}", project_dir.trim_end_matches('/'), p); + let written_paths: Vec<String> = stream::iter(diff.to_write) + .map(|(path, blob_oid)| async move { + let body = self.read_blob(&account, &blob_oid).await?; + self.vfs.write(&account_path(&account, &prefixed(&path)), body).await?; + Ok::<_, GitError>(prefixed(&path)) + }) + .buffer_unordered(32) // 当前实现硬编码 32,未读取 git.tuning.restore_concurrency + .try_collect().await?; + let deleted_paths: Vec<String> = stream::iter(diff.to_delete) + .map(|path| { + let p = prefixed(&path); + async move { + // 幂等删除:NotFound 视为成功,允许 restore 在已被并发删除的路径上完成 + match self.vfs.rm(&account_path(&account, &p)).await { + Ok(()) => Ok(p), + Err(e) if is_not_found(&e) => Ok(p), + Err(e) => Err(e.into()), + } + } + }) + .buffer_unordered(32) + .try_collect().await?; + + // 6b. 
删除空目录:沿被删路径的祖先链向上 rmdir,直到第一个非空目录或 project_dir 边界 + // (gix tree 不存空目录,而 VFS 写到本地后会留下空 dir,造成 ls 不一致) + self.prune_empty_dirs(&account, &project_dir, &deleted_paths).await?; + + // 5. 在 head.tree 之上做增量编辑: + // 把 project_dir 子树整体替换为 source_subtree。 + // 非 project_dir 路径原样保留 head 中的 tree_oid。 + let mut editor = TreeEditor::from_tree(&self.object_store, &account, head.tree).await?; + editor.upsert_subtree(&project_dir, source_subtree)?; + let new_tree = editor.write(&self.object_store, &account).await?; + + // 6. 构造新 commit:parent = 当前 HEAD(不是 source_oid!) + let new_commit_oid = write_commit(&self.object_store, &account, CommitObject { + tree: new_tree, + parents: vec![head_oid], // ← 关键:HEAD 单向前进 + author: Actor::now(&author_name, &author_email), + committer: Actor::now(&author_name, &author_email), + message: message.unwrap_or_else(|| format!( + "restore {} from {}", project_dir, &source_oid.to_hex()[..12], + )), + }).await?; + + // 7. CAS 更新 ref:expect=head_oid, new=new_commit_oid + // 若期间有别的 commit 进入 → ConcurrentCommit,调用方按提示重试 + self.ref_store.cas_update( + &account, &format!("refs/heads/{}", branch), + Some(head_oid), new_commit_oid, + ).await?; + + Ok(RestoreResponse::Applied { + new_commit_oid, + source_commit: source_oid, + parent_commit: head_oid, + // 计数 + 受影响路径(供上层精确触发向量重建) + written: written_paths.len(), + deleted: deleted_paths.len(), + unchanged: diff.unchanged.len(), + written_paths, + deleted_paths, + }) +} +``` + +> 当前实现相对早期设计的差异 +> +> - **空目录清理(步骤 6b)**:删除完文件后会沿祖先链 rmdir 至 `project_dir` 或第一个非空目录,避免 VFS 残留空目录。 +> - **幂等删除**:`vfs.rm` 返回 NotFound 视为成功,使 restore 可以在已被并发清理的路径上继续推进。 +> - **written\_paths / deleted\_paths**:`Applied` 响应除了 `written/deleted` 计数外,还返回**全量受影响路径(已加 project\_dir 前缀)**;Python 层按 marker / 源文件 / `.relations.json` 分类,精确触发 L0/L1/DETAIL 向量更新,不再依赖广义的 `_trigger_vector_rebuild(paths)`。 +> - **没有 commit\_index 刷新**:对应 §8.1 的 Fast Path 1 未实现,restore 末尾也无须刷新 index。 +> - **回写并发度**:当前硬编码 `buffer_unordered(32)`,**尚未**读取 
`git.tuning.restore_concurrency` 配置项。 +> +> ✅ **推荐:** 生产环境调用前先以 `dry_run=true` 跑一遍取得差异列表,再让用户确认,避免误覆盖未提交的本地变更。 + +## 8.3 show 完整实现 + +show 是**纯读路径**,无任何 VFS 写入或 ref 变更,易于实现与验证。支持两种模式:`path=None` 返回 commit 元信息(用于 log 列表);`path=Some(p)` 返回该 path 的 blob 字节(零拷贝 `Bytes` 切片)。 + +```rust +pub async fn show(&self, req: ShowRequest) -> Result { + let ShowRequest { account, target_ref, path } = req; + + // 1. ref 解析:依次尝试 + // a. 40-hex commit_oid → 直接解析 + // b. 4..=39 hex 的缩写 oid → 沿 HEAD 父链回溯,找到唯一前缀匹配的 commit + // (歧义返回 AmbiguousOid;无匹配返回 OidPrefixNotFound) + // c. branch 名(如 "main") → 加前缀 refs/heads/{branch} + // d. 全路径 refs/heads/xxx → 透传 + let commit_oid = self.resolve_ref(&account, &target_ref).await?; + let commit = self.load_commit(&account, &commit_oid).await?; + + match path { + // 模式 A:返回 commit 元信息(log 用) + None => Ok(ShowResponse::Commit { + oid: commit_oid, + tree: commit.tree, + parents: commit.parents, + author: commit.author.into(), + committer: commit.committer.into(), + message: commit.message.to_string(), + }), + + // 模式 B:返回该 path 的 blob 字节 + Some(p) => { + // 按 / 拆分,在 tree 上逐层递归; + // - path 命中目录 → PathIsDirectory(p) + // - path 完全无对应 → PathNotFound(p) + let blob_oid = tree_builder::lookup( + &self.object_store, &account, commit.tree, &p, + ).await?; + + let blob_full = self.load_blob(&account, &blob_oid).await?; + // 去掉 "blob {len}\0" header,使用 Bytes::slice 零拷贝返回 payload + let payload = strip_object_header(blob_full)?; + Ok(ShowResponse::Blob { + oid: blob_oid, + size: payload.len() as u64, + bytes: payload, + }) + } + } +} +``` + +*** + +# 9. 
Python Binding 与 VikingFS 集成 + +## 9.1 PyO3 binding 新增方法 + +在现有 `RAGFSBindingClient`(`crates/ragfs-python/src/lib.rs`)上追加三个 `#[pymethods]`。模式与 `ls/read/write` 一致:用 `py_detach_blocking` 释放 GIL,在 Tokio runtime 内调 `GitService`,返回结果序列化为 `PyDict`。 + +```rust +// crates/ragfs-python/src/lib.rs (追加) +#[pymethods] +impl RAGFSBindingClient { + /// 提交一次快照 + /// kwargs: account, branch, message, paths(Option>), + /// author_name, author_email + /// returns: {"commit_oid": str, "result": "created" | "noop"} + fn git_commit(&self, py: Python<'_>, kwargs: &PyDict) -> PyResult { + let req = parse_commit_request(kwargs)?; + let svc = self.git_service()?; // FeatureDisabled 时返回 PyErr + py_detach_blocking(py, || { + self.runtime.block_on(svc.commit(req)) + .map_err(map_git_error) + }).map(|r| commit_response_to_pydict(py, r)) + } + + /// 定向恢复某个 project 目录,正向生成新 commit + /// kwargs: account, branch(默认 "main"), project_dir, source_commit, + /// dry_run(bool=false), message(Option), + /// author_name, author_email + /// returns: + /// Applied: {"new_commit_oid": str, "source_commit": str, "parent_commit": str, + /// "written": int, "deleted": int, "unchanged": int} + /// Noop: {"noop": true, "head": str, "source": str} + /// DryRun: {"dry_run": true, "diff": {...}, "head": str, "source": str} + fn git_restore(&self, py: Python<'_>, kwargs: &PyDict) -> PyResult { + let req = parse_restore_request(kwargs)?; + let svc = self.git_service()?; + py_detach_blocking(py, || self.runtime.block_on(svc.restore(req)) + .map_err(map_git_error)) + .map(|r| restore_response_to_pydict(py, r)) + } + + /// 读取 ref / commit / blob + /// kwargs: account, target_ref, path(Option) + /// returns: + /// path=None: {"oid","tree","parents","author","committer","message"} + /// path=str: {"oid","size","bytes": PyBytes} + fn git_show(&self, py: Python<'_>, kwargs: &PyDict) -> PyResult { + let req = parse_show_request(kwargs)?; + let svc = self.git_service()?; + py_detach_blocking(py, || { + 
self.runtime.block_on(svc.show(req)) + .map_err(map_git_error) + }).map(|r| show_response_to_pydict(py, r)) + } +} + +/// GitError → Python 异常映射(在 openviking 侧定义对应异常类) +fn map_git_error(e: GitError) -> PyErr { + match e { + GitError::FeatureDisabled => PyRuntimeError::new_err("git feature disabled"), + GitError::ConcurrentCommit => PyValueError::new_err("concurrent commit conflict"), + GitError::PathNotFound(p) => PyFileNotFoundError::new_err(p), + GitError::RefNotFound(r) => PyFileNotFoundError::new_err(r), + other => PyRuntimeError::new_err(other.to_string()), + } +} +``` + +## 9.2 Python 侧 VikingFS 新增方法 + +在 `openviking/openviking/storage/viking_fs.py` 的 `VikingFS` 类上追加 4 个公开方法。Python 调用方使用 `viking://` URI,内部经 `_uri_to_tree_path` 转换为账号内 tree 路径后再传给 binding。 + +```python +# openviking/storage/viking_fs.py (追加) +class VikingFS: + # 已有: read / write / rm / ls / mv / mkdir ... + + async def commit( + self, + *, + message: str, + paths: list[str] | None = None, # viking://... URIs + branch: str = "main", + author_name: str | None = None, + author_email: str | None = None, + ) -> dict: + """提交一次跨 scope 快照。返回 {commit_oid, result}.""" + account = self._current_account() + tree_paths = [self._uri_to_tree_path(p) for p in (paths or [])] + return await self._async_client.run( + "git_commit", + account=account, + branch=branch, + message=message, + paths=tree_paths or None, + author_name=author_name or self._default_author_name(), + author_email=author_email or self._default_author_email(), + ) + + async def restore( + self, + *, + project_dir: str, # viking://resources/proj_a/ 或 "resources/proj_a" + source_commit: str, # 40-hex / branch / tag + branch: str = "main", + dry_run: bool = False, + message: str | None = None, + author_name: str | None = None, + author_email: str | None = None, + ) -> dict: + """将 project_dir 恢复到 source_commit 状态,生成一个新 commit。 + + 语义等价于 git restore --source= --worktree --staged + / && git commit。HEAD 单调前进,不会回退。 + """ + account = 
self._current_account() + tree_dir = self._uri_to_tree_path(project_dir).rstrip("/") + result = await self._async_client.run( + "git_restore", + account=account, branch=branch, + project_dir=tree_dir, source_commit=source_commit, + dry_run=dry_run, message=message, + author_name=author_name or self._default_author_name(), + author_email=author_email or self._default_author_email(), + ) + if dry_run or result.get("noop"): + return result + + # 增量向量更新:只对受影响的源文件,逐个 vectors_only 重算。 + # L0/L1 派生文件已随源文件一起从 git 回写到 VFS,不需要重新生成。 + from openviking.service.reindex_executor import ReindexExecutor + executor = ReindexExecutor() + ctx = self._current_request_context() + for affected_path in result.get("affected_files", []): + affected_uri = self._tree_path_to_uri(affected_path) + if self._is_derived_file(affected_uri): + continue + asyncio.create_task(executor.execute( + uri=affected_uri, mode="vectors_only", + wait=False, ctx=ctx, + )) + return result + + async def show( + self, + target_ref: str, + *, + path: str | None = None, + ) -> dict | bytes: + """path=None → commit 元信息;path=str → blob 字节。""" + account = self._current_account() + tree_path = self._uri_to_tree_path(path) if path else None + resp = await self._async_client.run( + "git_show", + account=account, + target_ref=target_ref, + path=tree_path, + ) + if "bytes" in resp: + return resp["bytes"] + return resp + + async def log( + self, + *, + branch: str = "main", + limit: int = 20, + ) -> list[dict]: + """便捷封装:沿 parent 链反向遍历 commit。""" + account = self._current_account() + head = await self._async_client.run( + "git_show", account=account, target_ref=branch, path=None, + ) + result, current = [head], head.get("parents", []) + while current and len(result) < limit: + parent_oid = current[0] + commit = await self._async_client.run( + "git_show", account=account, target_ref=parent_oid, path=None, + ) + result.append(commit) + current = commit.get("parents", []) + return result + + # --- 工具方法 --- + def 
_uri_to_tree_path(self, uri: str) -> str: + """viking://resources/a.md → 'resources/a.md' + (去掉 viking:// 前缀,保留 scope 段作为 tree 一级目录)""" + parsed = VikingURI.parse(uri) + if parsed.scope in INTERNAL_SCOPES: + raise ValueError(f"internal scope not versioned: {parsed.scope}") + return f"{parsed.scope}/{parsed.relative_path}" + + async def _trigger_vector_rebuild( + self, account: str, paths: list[str] + ) -> None: + """restore 后异步触发向量索引重建。 + 实现可对接现有的 watcher / 任务队列;失败不影响 restore 结果。""" + try: + await self._vector_service.rebuild(account, paths) + except Exception: + logger.exception("vector rebuild failed for %s", account) +``` + +*** + +# 10. 配置规范 + +## 10.1 与 resources 对称的配置布局 + +配置位于现有 RAGFS 配置文件的 `[git]` 段,布局与 `[plugins.localfs_resources]` / `[plugins.s3fs_resources]` 完全对称,便于运维心智复用。`enabled = false` 时 binding 方法返回 `FeatureDisabled`,不影响现有 VFS。 + +```toml +# ragfs.toml 新增 [git] 段 +[git] +enabled = true +backend = "local" # "local" | "s3" +default_branch = "main" +author_name = "openviking-bot" # commit 默认作者 +author_email = "openviking-bot@system.local" + +# 本地后端 +[git.local] +base_dir = "/data/openviking/git" # objects/refs 存储根 + +# 远程后端(与 plugins.s3fs_resources 配置同构) +[git.s3] +bucket = "openviking-prod" +prefix = ".ovgit" # 全部 key = {prefix}/{account}/... 
+region = "us-east-1" +endpoint = "https://s3.amazonaws.com" +access_key_env = "OV_S3_AK" # 从环境变量读 +secret_key_env = "OV_S3_SK" +cas_mode = "native" # "native"(If-Match):当前唯一支持的模式 +use_path_style = true # path-style addressing(MinIO/LocalStack/TOS 默认开) + +# 高级调优(字段已在 config 中定义,但 MVP 部分尚未生效) +[git.tuning] +upload_concurrency = 64 # ⚠️ 当前未读取:commit blob 上传现为串行 +restore_concurrency = 32 # ⚠️ 当前未读取:restore 回写并发度硬编码 32 +ref_cas_max_retry = 3 # ⚠️ 当前未读取:commit 内部不做重试(见 §11.3) +ref_cas_backoff_ms = 50 # ⚠️ 当前未读取 +commit_index_enabled = true # Fast Path 1 总开关(默认 true);关闭后强制走 slow path,适合测试 / mtime 不可靠环境 +``` + +## 10.2 切换本地↔远程 + +| 维度 | local → s3 改动 | +| ---------- | ----------------------------------------------------- | +| 配置文件 | `backend = "local"` → `backend = "s3"`;填 `[git.s3]` 块 | +| Service 代码 | 无 | +| Python 调用方 | 无 | +| 数据迁移 | 一次性脚本:本地 `{base_dir}` 全量上传至 S3 key prefix(保持目录结构) | + +> 💡 从本地切到远程的全部成本 = 修改 `backend = "local"` → `backend = "s3"` + 填 `[git.s3]` 块。Service 代码、Python 调用方完全无感。这与 resources 目录"`plugins.localfs_resources` ↔ `plugins.s3fs_resources`"的切换体验完全对称。 + +*** + +# 11. 
并发与一致性 + +## 11.1 写并发模型 + +| 层次 | 并发原语 | 说明 | +| --------- | ------------------------ | -------------------------------- | +| Blob 上传 | 当前串行(目标态:`buffer_unordered(64)`,对应尚未生效的 `git.tuning.upload_concurrency`) | 内容寻址,天然幂等;同 oid 多次 put 安全 | +| Tree 写入 | 串行(`Editor::write` 自底向上) | 同 oid 幂等,但顺序必须自底向上 | +| Commit 写入 | 串行,最后一步 | 同 oid 幂等 | +| Ref 更新 | CAS | 本地: 进程锁 + rename(2);S3: If-Match | + +## 11.2 并发冲突处理 + +```mermaid +flowchart LR + A[Client A: commit] --> A1[Read ref += parent_a] + B[Client B: commit] --> B1[Read ref += parent_b] + A1 --> A2[Build tree A] + B1 --> B2[Build tree B] + A2 --> A3[CAS update +expected=parent_a] + B2 --> B3[CAS update +expected=parent_b] + A3 --> OK1[OK new=commit_a] + B3 --> FAIL[CasConflict +current=commit_a] + FAIL --> RETRY{重试?} + RETRY -- yes --> B1 + RETRY -- no --> ERR[返回 409 +给上层] +``` + +## 11.3 重试策略 + +- **幂等部分(blob/tree/commit 写)**: 同 oid 多次 put 安全;后端层面通过 `If-None-Match: *`(S3)与 `try_exists`(local)短路重复写;service 层不额外做 retry。 +- **CAS 冲突**: **当前实现**中 `GitService::commit/restore` 内部**不做自动重试**,直接以 `GitError::ConcurrentCommit` 上抛给 Python 层,由调用方决定是否 re-read parent、重建 tree 并重新提交。`git.tuning.ref_cas_max_retry` / `ref_cas_backoff_ms` 配置项已在 `GitTuningConfig` 中定义,但**尚未在代码中读取**,后续接入。 +- **跨账号**: 不同 account\_id 的 ref 路径不同,天然无冲突,可完全并行 + +*** + +# 12. 
安全与隔离 + +## 12.1 账号隔离 + +- Git 数据路径全部以 `{account_id}` 为顶层前缀,与现有 `/local/{account_id}/` 隔离模型完全一致 +- `GitService` 所有方法的第一个参数都是 `account_id`,binding 层从 `RequestContext.account_id` 注入,不允许跨账号访问 +- Path 解析时必须经过 `validate_account_id`(白名单字符集 + 长度),防止 `../` 注入 + +## 12.2 加密 + +> 💡 **重要:** 现有 `viking_fs.write` 在写入前会调 `_encrypt_content`。**commit 时不应再次加密**——blob 内容 = 当前 VFS 已加密内容,Git 是对密文做版本管理。restore 写回时走 `viking_fs.write`,会再次"加密"——这里需要绕过(或保持密文不变):restore 路径走 `MountableFS.write` 而非 `viking_fs.write`,避免双重加密;或为 `viking_fs.write` 增加 `raw=True` 参数,restore 调用时传入。 + +## 12.3 资源限制 + +| 维度 | 限制 | 措施 | 当前状态 | +| ------------ | -------------------------------- | ------------------------------------ | ------------------------------------------------ | +| 单 blob 大小 | ≤ 100MB | commit 前 stat 检查,超限报错 | **未实现**(`GitError::BlobTooLarge` 已定义,但无运行时检查) | +| 单 commit 文件数 | ≤ 50000 | enumerate 阶段提前拒绝 | **未实现**(`GitError::TooManyFiles` 已定义,但无运行时检查) | +| 账号 Git 容量 | 由 quota 系统单独管控 | 放在 `[git.quota]`,首版默认 10GB | **未实现**(配置块未引入) | +| restore 并发 | 同子树串行,同一 account\_id 全量 restore 互斥 | `VikingFS.restore` 用 `LockContext` 树锁包裹 writeback:scoped restore 锁 `project_dir`,全量 restore 锁账号根;防止 VFS 写竞态 | **已实现**(写回阶段加锁;后台 reindex 在锁释放后调度,冲突映射为 `ResourceBusyError`) | +| 账号 ID 校验 | 白名单字符集 + 长度,防 `../` 注入 | `validate_account_id` 在 binding 入口拒绝 | **未实现**(`GitError::InvalidAccountId` 已定义,但无校验代码) | + +*** + +# 13. 
错误处理 + +## 13.1 错误分类 + +顶层错误类型为 `GitError`(`crates/ragfs/src/git/error.rs`),按可恢复性与归属分组: + +| 类别 | Variant | 说明 | +| --------- | ------------------------------------------------------------------------------------------- | ------------------------------------------- | +| 后端透传 | `ObjectStore(ObjectStoreError)`、`RefStore(RefStoreError)`、`Vfs(...)` | 来自存储后端 / VFS | +| 路径校验 | `PathNotFound(String)`、`PathIsDirectory(String)`、`InvalidProjectDir(String)` | show / restore 入参校验 | +| Tree 内容缺失 | `SubtreeNotFoundInCommit { commit, path }` | restore 时 source 中缺失目标子树(已通过空树语义包装,正常路径不抛出) | +| Ref 解析 | `OidPrefixNotFound(String)`、`AmbiguousOid { prefix, candidates }` | 缩写 OID 解析失败 | +| 并发 | `ConcurrentCommit` | CAS 失败(由 ref\_store conflict 上抛) | +| 资源限制 | `BlobTooLarge { path, size, max }`、`TooManyFiles { count, max }`、`InvalidAccountId(String)` | 已定义但**当前无运行时检查**,详见 §12.3 | +| 其他 | `FeatureDisabled`、`CorruptedObject(...)`、`Other(String)` | binding 关闭、对象腐烂、兜底 | + +## 13.2 Python 异常映射 + +| Rust Error | Python Exception | 语义 | +| -------------------------------------------------------------------- | --------------------------- | ----------- | +| `FeatureDisabled` | `AGFSNotSupportedError` | git 模块未启用 | +| `ConcurrentCommit` | `GitConcurrentCommitError` | 需要上层重试或人工介入 | +| `PathNotFound` / `OidPrefixNotFound` / 后端 NotFound | `AGFSNotFoundError` | 404 语义 | +| `PathIsDirectory` / `BlobTooLarge` / `TooManyFiles` / `AmbiguousOid` | `AGFSInvalidOperationError` | 入参或资源错误 | +| `InvalidAccountId` / `InvalidProjectDir` | `AGFSInvalidPathError` | 路径/账号 ID 非法 | +| `CorruptedObject` / `Other` / `Vfs` | `AGFSInternalError` | 底层异常 | + +*** + +# 14. 
可观测性 + +> **当前状态:** §14.1/§14.2 中描述的 tracing span / metrics **MVP 尚未接入**;§14.3 健康检查仅暴露 `git_enabled` / `git_backend` 两个字段。下面保留为目标态,未带状态标注的均为待实现项。 + +## 14.1 Tracing/日志关键字段(目标态) + +- **span 名**: `git.commit`, `git.restore`, `git.show` +- **tag**: `account_id`, `branch`, `parent_oid`, `commit_oid`, `backend` +- **event**: `git.blob.put`(`oid`, `size`), `git.tree.write`, `git.ref.cas`(`expected`, `new`, `result`), `git.cas.conflict` + +## 14.2 Metrics(目标态) + +| 指标 | 类型 | 维度 | +| ---------------------------------- | --------- | ---------------------------- | +| `git_commit_total` | counter | account\_id, branch, result | +| `git_commit_duration_seconds` | histogram | backend | +| `git_commit_files` | histogram | — | +| `git_commit_bytes` | histogram | backend | +| `git_cas_conflict_total` | counter | account\_id, branch | +| `git_object_store_latency_seconds` | histogram | op (put/get/exists), backend | +| `git_ref_store_latency_seconds` | histogram | op (read/cas), backend | + +## 14.3 健康检查 + +- **当前实现**: `RAGFSBindingClient.health()` 在原有字段上追加 `git_enabled: bool` 和 `git_backend: Option`(关闭时为 None)。 +- **目标态(未实现)**: 增加 `git` 子结构,返回 `{"backend", "writable", "last_commit_age_sec"}`;每分钟后台心跳对 `refs/heads/main` 做一次 read,失败则标记 degraded。 + +*** + +# 15. 测试策略 + +## 15.1 测试层次 + +| 层级 | 范围 | +| ---------------- | ------------------------------------------------------------------------------------------------------------------------ | +| **单元测试 (Rust)** | ObjectStore 各操作的幂等性;RefStore CAS 在并发下的正确性(MVP 用普通并发测,未引入 loom);tree\_builder 的 upsert/remove/write;错误映射 | +| **集成测试 (Rust)** | LocalObjectStore 跑全套场景(MVP 暂未引入 MemObjectStore);commit → show 路径 → bytes 一致;commit → restore → 文件一致;并发 commit 的 CAS 冲突处理 | +| **端到端 (Python)** | VikingFS.commit → restore 全流程;跨 scope 原子快照;派生文件被正确纳入 commit 并随 restore 回滚;向量索引在 restore 后被精确重建;多账号并发隔离 | + +## 15.2 关键测试用例清单 + +1. **幂等性**: 同一 commit\_req 调用两次,第二次应快速返回(blob exists 跳过 + ref 未变 → no-op 或 same oid) +2. 
**跨 scope 原子性**: 一次 commit 同时改 `resources/a.md` 和 `agent/skills/b.py`,restore 父 commit 后两者都应回滚 +3. **派生文件纳入**: 创建 `resources/x.md` 与 `resources/x.md.abstract.md`,commit 后 `show` 两者均可见;restore 父 commit 后两者都应回滚;向量索引文件不被 commit +4. **CAS 冲突**: 两个并发 commit,后到的必须看到 `ConcurrentCommit` 错误而非默默覆盖 +5. **dry\_run 不写**: restore dry\_run 后再 ls,VFS 状态不变 +6. **账号隔离**: A 账号的 commit\_oid 在 B 账号下 show 必须返回 not found +7. **后端等价性**: LocalObjectStore 与 S3ObjectStore (LocalStack/MinIO) 跑同一组用例输出一致 +8. **大文件**: 单 blob 80MB 可正确 commit / show / restore +9. **双重加密**: restore 写回后 VFS read 内容与原始明文一致 + +*** + +# 16. 实施计划 (MVP) + +| 阶段 | 工作内容 | 交付物 | 预估 | +| ----- | --------------------------------------------------------------------------------------------------------------------- | ---------------------- | ---- | +| D1-D2 | 新建 `crates/ragfs/src/git/`,定义 trait + LocalObjectStore/LocalRefStore + S3ObjectStore/S3RefStore (含 If-Match CAS) + 单测 | 裸 Git 存储跑通 put/get/CAS | 2d | +| D3 | 接入 `gix_object::tree::Editor`,实现 `GitService::commit` | commit 流程单测绿 | 1d | +| D4 | 实现 `GitService::show` (纯读路径,易验证) | commit + show 闭环 | 1d | +| D5 | 实现 `GitService::restore`,dry\_run 优先,验证幂等 | commit + restore 闭环 | 1d | +| D6 | PyO3 binding: `RAGFSBindingClient` 三个新方法 + 错误映射 | Python 端可调 | 1d | +| D7 | `VikingFS.commit/restore/show/log` + URI ↔ tree path 转换 | Python 端到端 | 1d | +| D9 | tracing/metrics 接入 + health check | 可观测性完备 | 0.5d | +| D10 | 文档 + 灰度发布 | 上线 Phase 1 | 0.5d | + +> 💡 **总工期**: \~10 人日 (MVP, 单人); 双后端等价测试与 S3 CAS 兼容性验证可能引入额外 2-3 天。 + +*** + +# 17. 
当前实现进度与未实现项 + +下面汇总文档中已写出但当前**尚未实现**的部分,供后续阶段补齐: + +### Rust 侧 + +- **commit / restore 内部 CAS 重试循环** —— 文档 §11.3 旧版描述。当前 `commit()` 明确不做 retry,`ConcurrentCommit` 直接上抛。 +- **git.tuning.\* 配置项接入** —— 文档 §10.1。`upload_concurrency` / `restore_concurrency` / `ref_cas_max_retry` / `ref_cas_backoff_ms` 已在 `GitTuningConfig` 中定义并解析,但代码尚未读取(restore 回写并发度硬编码 32,commit blob 上传为串行)。 +- **S3 RedisLock CAS 模式** —— 文档 §7.2。`CasMode::RedisLock` 仅作为枚举占位,实际调用返回 "not yet implemented" 错误。 +- **资源限制实际生效** —— 文档 §12.3。`BlobTooLarge` / `TooManyFiles` / `[git.quota]` 配置块均未实现(仅错误 variant 已定义)。同账号 restore 写竞态防护已通过 `VikingFS.restore` 的 `LockContext` 树锁实现(见 §12.3)。 +- **账号 ID 校验** —— 文档 §12.1。`validate_account_id` 未实现,`GitError::InvalidAccountId` 仅占位。 +- **本地 ref 跨进程锁** —— 文档 §7.1。`LocalRefStore` 仅有进程内 `DashMap`,未叠加 `flock`。 +- **观测性 (tracing / metrics)** —— 文档 §14.1 / §14.2。span / event / 各类 counter / histogram 均未接入。 +- **健康检查增强** —— 文档 §14.3。当前仅 `git_enabled` / `git_backend`,未接入 `writable` / `last_commit_age_sec` / 心跳。 +- **GC / pack file / branch & tag 管理 / diff API** —— 文档 §19。属后续 Phase。 +- **loom 并发模型测试 + proptest fuzz** —— 文档 §5.2。MVP 未引入。 +- **忽略规则(类似 `.gitignore`)** —— 在版本管理中忽略某些特定文件 URI 的功能尚未实现。 + +### Python 侧 + +- **VikingFS.commit / restore / show / log 已实现**(`openviking/storage/viking_fs.py`),`_uri_to_tree_path` / `_tree_path_to_uri` / `_classify_restore_path` / `_schedule_vector_rebuild` / `_run_vector_rebuild` 均已实现,精确按 marker / source-file / `.relations.json` 调度 `ReindexExecutor`。 +- **VikingFS.\_trigger\_vector\_rebuild(account, paths)(早期设计)** 已被更精确的 `_schedule_vector_rebuild(written, deleted)` 替代,**不会**再实现旧 API。 + +*** + +# 18. 
风险与缓解 + +| 风险 | 影响 | 缓解 | +| -------------------------------------- | -- | ----------------------------------------------------------------------------------- | +| S3/TOS CAS 兼容性差异 | 高 | POC 阶段验证目标后端的 If-Match 条件写支持;不支持时该后端不可用于 git ref 存储 | +| 大账号 commit 时 enumerate 慢 | 中 | `paths` 参数限定 scope;后续引入增量 diff(基于 mtime + parent tree) | +| 双重加密导致 restore 后内容损坏 | 高 | restore 路径绕过 `viking_fs.write` 加密,直接走 `MountableFS`;集成测试覆盖 | +| L0/L1 派生文件纳入版本历史,模型异步重建导致 commit 间差异增加 | 中 | 用户主动控制 commit 时机,不自动触发;L0/L1 文件通常较小(< 10KB),存储成本可控;如需降频可配置 commit 时忽略 mtime-only 变更 | +| 同一账号多 Agent 高并发 commit | 中 | 目标态:CAS 冲突自动重试 3 次(当前未实现,`ConcurrentCommit` 直接上抛由调用方重试,见 §11.3);长期可引入"基于队列的串行化提交器" | +| Git 数据无 GC,长期膨胀 | 中 | 首版不做 GC,运维侧定期 dump + 压缩;后续接入 reachability-based GC | +| loose object 数量爆炸,本地 inode 紧张 | 低 | Phase 4 引入 pack file;Git fanout 已经缓解一半 | + +*** + +# 19. 后续演进方向 + +1. **Pack file 支持**: 引入 `gix-pack`,对历史 commit 做 delta 压缩,降低存储成本 80%+ +2. **Auto-commit hook**: 在 `content_write.ContentWriteCoordinator` 末尾追加可选 hook,实现"每次写自动 commit"模式(Phase 2 重新评估) +3. **Branch / Tag 管理**: 暴露 `branch_create / branch_delete / tag` API +4. **Diff API**: `diff(ref_a, ref_b)` 返回结构化差异,供 UI 渲染 +5. **跨账号镜像**: 支持账号间的 commit 分享(类似 GitHub fork) +6. **向量索引版本化(可选)**: 若后续需要向量索引的快照回滚能力,可引入轻量 manifest 记录 index 版本与对应 commit\_oid 的映射,避免全量存储向量数据 +7. **外部 Git 工具兼容**: 输出标准 Git 仓库格式,允许通过 `git clone file://...` 检视 + +*** + +# 20. 
附录 + +## 20.1 术语表 + +| 术语 | 含义 | +| ------------ | ------------------------------------------------------------------------------------------------ | +| VFS | Virtual File System,本文特指 OpenViking 的 `MountableFS` + plugin 体系 | +| Loose Object | Git 的基础存储单元,zlib 压缩,按 SHA 寻址的单文件 | +| CAS | Compare-And-Swap,本文特指 ref 更新时"仅当当前值 = 期望值才写入" | +| Root Tree | commit 对象指向的最顶层 tree 对象,代表整个仓库快照 | +| Tree Editor | `gix_object::tree::Editor`,gitoxide 提供的内存中 tree 构建器,支持 upsert/remove/write | +| 派生文件 | `.abstract.md` / `.overview.md` / `.relations.json`,由 OpenViking 模型异步生成的 L0/L1 摘要文件,已纳入 Git 版本管理 | + +## 20.2 参考资料 + +- [GitoxideLabs/gitoxide](https://github.com/GitoxideLabs/gitoxide) +- [volcengine/OpenViking](https://github.com/volcengine/OpenViking) +- [OpenViking 存储架构文档](https://github.com/volcengine/OpenViking/blob/main/docs/zh/concepts/05-storage.md) +- [Git Pack Format (后续 Phase 参考)](https://git-scm.com/docs/gitformat-pack) + +> 💡 **文档完成**。如需对某一章节细化(如某后端实现细节、某测试用例代码、迁移脚本),请告知具体目标。 + diff --git a/docs/en/api/01-overview.md b/docs/en/api/01-overview.md index 26ff18b1f9..977d72a4c0 100644 --- a/docs/en/api/01-overview.md +++ b/docs/en/api/01-overview.md @@ -435,6 +435,15 @@ Below are all HTTP API endpoints provided by OpenViking, grouped by functional m | DELETE | `/api/v1/fs` | Delete resource | | POST | `/api/v1/fs/mv` | Move/rename resource | +### Snapshots (Multi-Version Management) + +| Method | Path | Description | +|--------|------|-------------| +| POST | `/api/v1/snapshot/commit` | Save the current workspace state as a new snapshot | +| GET | `/api/v1/snapshot/log` | Walk commit history newest-first | +| GET | `/api/v1/snapshot/show` | View commit metadata, or read a file from a commit | +| POST | `/api/v1/snapshot/restore` | Restore a directory or the whole account tree to a past snapshot (forward commit) | + ### Content | Method | Path | Description | @@ -569,6 +578,7 @@ Subsequent API documentation is organized by functional module as follows: | 
[Resources](02-resources.md) - Resource management API | Adding, importing, exporting resources and skills | | [Retrieval](06-retrieval.md) - Search API | Search, relations, context acquisition | | [File System](03-filesystem.md) - File system operations | Directory operations, content reading and writing | +| [Snapshots](11-snapshot.md) - Multi-version management | Snapshot commit, history walk, version restore | | [Sessions](05-sessions.md) - Session management | Session creation, message management, memory extraction | | [Skills](04-skills.md) - Skill management API | Skill management | | [System](07-system.md) - System and monitoring API | System status, monitoring, debug API | diff --git a/docs/en/api/11-snapshot.md b/docs/en/api/11-snapshot.md new file mode 100644 index 0000000000..bd8147b295 --- /dev/null +++ b/docs/en/api/11-snapshot.md @@ -0,0 +1,427 @@ +# Snapshots (Multi-Version Management) + +On top of VikingFS, OpenViking provides Git-based multi-version management, called **Snapshots**. It saves an account's entire resource tree as a series of immutable commits, letting you walk history, compare versions, and restore the workspace to any past state. + +Snapshots are powered by [gitoxide](https://github.com/Byron/gitoxide) embedded in the Rust RAGFS layer, maintaining one logical Git repository per `account_id`. This is fully transparent to callers — you never touch a `.ovgit` directory, the object store, or ref internals. + +The four core commands: + +| Command | Purpose | +|---------|---------| +| `commit` | Save the current workspace state as a new snapshot | +| `log` | Walk commit history starting from the newest | +| `show` | View a commit's metadata, or read a file's content from that commit | +| `restore` | Restore a directory (or the whole account tree) to a past snapshot | + +## Core Concepts + +- **Commit**: A snapshot is a commit, uniquely identified by a 40-hex SHA-1 `commit_oid`. 
Most commands also accept an abbreviated OID prefix or a branch name (e.g. `main`). +- **Branch**: The default branch is `main`. Unless you pass one explicitly, every command operates on `main`. +- **Forward-commit restore**: `restore` does **not** rewind or rewrite history. It reads the content at `source_commit`, writes the diff back into the workspace, and creates a **new commit on top of the current HEAD**. The new commit's parent is therefore the HEAD that existed before the restore — **not** `source_commit`. HEAD always advances monotonically and history is never lost. +- **Scope**: `commit` can be limited to specific URIs via `paths`; `restore` can be limited to a subtree via `project_dir`, leaving files outside it untouched. + +## Implementation + +- HTTP routes: [snapshot.py](file:///cloudide/workspace/OpenViking/openviking/server/routers/snapshot.py), prefix `/api/v1/snapshot`. +- SDK namespace: [snapshot_namespace.py](file:///cloudide/workspace/OpenViking/openviking/snapshot_namespace.py), exposed as `client.snapshot.*`. +- Underlying semantics: `commit` / `restore` / `show` / `log` in [viking_fs.py](file:///cloudide/workspace/OpenViking/openviking/storage/viking_fs.py). +- CLI: the `SnapshotCmd` in [main.rs](file:///cloudide/workspace/OpenViking/crates/ov_cli/src/main.rs), subcommands in [snapshot.rs](file:///cloudide/workspace/OpenViking/crates/ov_cli/src/commands/snapshot.rs). + +## API Reference + +### commit() + +Save the current workspace state as a new snapshot. + +**Parameters** + +| Parameter | Type | Required | Default | Description | +|-----------|------|----------|---------|-------------| +| message | str | Yes | - | Commit message | +| paths | List[str] | No | null | `viking://` URIs to scope the snapshot to; entries may be files or directories. Directories are expanded recursively with the snapshot pruning rules applied. `null` snapshots the whole account tree. An empty list `[]` is forwarded as an explicit empty path set (no-op). 
A path that exists in neither the VFS nor the previous snapshot logs a warning and is treated as a no-op deletion | +| branch | str | No | `main` | Branch to advance | +| author_name | str | No | null | Override the default author name (default `viking-bot`) | +| author_email | str | No | null | Override the default author email | + +**Python SDK (Embedded / HTTP)** + +```python +result = client.snapshot.commit( + message="v1 initial import", + paths=["viking://resources/my_md.md"], +) +print(result["commit_oid"]) +``` + +**HTTP API** + +``` +POST /api/v1/snapshot/commit +``` + +```bash +curl -X POST "http://localhost:1933/api/v1/snapshot/commit" \ + -H "Content-Type: application/json" \ + -H "X-API-Key: your-key" \ + -d '{ + "message": "v1 initial import", + "paths": ["viking://resources/my_md.md"] + }' +``` + +**CLI** + +```bash +ov snapshot commit -m "v1 initial import" --paths viking://resources/my_md.md -o json +``` + +**Response** + +When a new snapshot is created: + +```json +{ + "status": "ok", + "result": { + "result": "created", + "commit_oid": "3f2a1b9c4d5e6f70819293a4b5c6d7e8f9a0b1c2", + "changed": 3 + } +} +``` + +When the workspace is unchanged relative to the last commit, the result is `noop` and `commit_oid` is the current HEAD: + +```json +{ + "status": "ok", + "result": { + "result": "noop", + "commit_oid": "3f2a1b9c4d5e6f70819293a4b5c6d7e8f9a0b1c2" + } +} +``` + +--- + +### log() + +Starting from a branch's HEAD, walk history along the first parent (`parents[0]`) and return commits newest-first. + +**Parameters** + +| Parameter | Type | Required | Default | Description | +|-----------|------|----------|---------|-------------| +| branch | str | No | `main` | Branch to walk | +| limit | int | No | 20 | Max commits to return. 
The HTTP endpoint clamps this to 1–500 | + +**Python SDK (Embedded / HTTP)** + +```python +history = client.snapshot.log(limit=10) +for commit in history: + print(commit["oid"], commit["message"]) +``` + +**HTTP API** + +``` +GET /api/v1/snapshot/log?branch={branch}&limit={limit} +``` + +```bash +curl -X GET "http://localhost:1933/api/v1/snapshot/log?branch=main&limit=10" \ + -H "X-API-Key: your-key" +``` + +**CLI** + +```bash +ov snapshot log --limit 10 -o json +``` + +**Response** + +`result` is a list of commit metadata, each element having the same shape as the metadata returned by [show()](#show): + +```json +{ + "status": "ok", + "result": [ + { + "oid": "9a0b1c2d3e4f5061728394a5b6c7d8e9f0a1b2c3", + "tree": "11223344556677889900aabbccddeeff00112233", + "parents": ["3f2a1b9c4d5e6f70819293a4b5c6d7e8f9a0b1c2"], + "author": { + "name": "viking-bot", + "email": "bot@openviking.local", + "time_seconds": 1750300000, + "tz_offset_seconds": 28800 + }, + "committer": { + "name": "viking-bot", + "email": "bot@openviking.local", + "time_seconds": 1750300000, + "tz_offset_seconds": 28800 + }, + "message": "v2 modify delete add" + } + ] +} +``` + +> When the branch has no commits yet, the HTTP endpoint returns `404 NOT_FOUND`. + +--- + +### show() + +View a commit's metadata; if `path` is given, return that file's content from the commit instead. 
+ +**Parameters** + +| Parameter | Type | Required | Default | Description | +|-----------|------|----------|---------|-------------| +| target_ref | str | Yes | - | Commit OID (abbreviated prefix allowed), branch name, or tag | +| path | str | No | null | `viking://` URI of a single file; omit to return commit metadata | + +**Python SDK (Embedded / HTTP)** + +```python +# View commit metadata +meta = client.snapshot.show("3f2a1b9c") +print(meta["message"], meta["parents"]) + +# Read a file's content from the commit +blob = client.snapshot.show("3f2a1b9c", path="viking://resources/my_project/guide.md") +``` + +> Note: when reading a file (`path` given), the **Embedded (local) client** returns raw `bytes`, while the **HTTP client** returns a `{"oid": str, "size": int, "bytes": bytes}` dict. + +**HTTP API** + +``` +GET /api/v1/snapshot/show?target_ref={ref}[&path={uri}] +``` + +```bash +# Commit metadata (returns JSON) +curl -X GET "http://localhost:1933/api/v1/snapshot/show?target_ref=3f2a1b9c" \ + -H "X-API-Key: your-key" + +# File content (returns a binary stream) +curl -X GET "http://localhost:1933/api/v1/snapshot/show?target_ref=3f2a1b9c&path=viking://resources/my_project/guide.md" \ + -H "X-API-Key: your-key" +``` + +Without `path`, the response is commit metadata JSON. 
With `path`, the response is a raw byte stream (`Content-Type: application/octet-stream`) plus two headers: + +- `X-Snapshot-Oid`: the blob object's OID +- `X-Snapshot-Size`: the blob size in bytes + +**CLI** + +```bash +# Commit metadata +ov snapshot show 3f2a1b9c -o json + +# Read file content (defaults to stdout; use --out-file to write to a local file) +ov snapshot show 3f2a1b9c --path viking://resources/my_project/guide.md --out-file ./guide.md +``` + +**Response (commit metadata)** + +```json +{ + "status": "ok", + "result": { + "oid": "3f2a1b9c4d5e6f70819293a4b5c6d7e8f9a0b1c2", + "tree": "00112233445566778899aabbccddeeff00112233", + "parents": [], + "author": { + "name": "viking-bot", + "email": "bot@openviking.local", + "time_seconds": 1750299000, + "tz_offset_seconds": 28800 + }, + "committer": { + "name": "viking-bot", + "email": "bot@openviking.local", + "time_seconds": 1750299000, + "tz_offset_seconds": 28800 + }, + "message": "v1 initial import" + } +} +``` + +--- + +### restore() + +Restore a directory (or the whole account tree) to its state at `source_commit`. + +This is a **forward-commit restore**: it computes the diff between `source_commit` and the current HEAD, writes it back into the workspace, and creates a **new commit on top of the current HEAD**. The new commit's parent is the pre-restore HEAD (not `source_commit`), so history is never rewritten. Files outside `project_dir` are left untouched. 
+ +**Parameters** + +| Parameter | Type | Required | Default | Description | +|-----------|------|----------|---------|-------------| +| source_commit | str | Yes | - | What to restore from: commit OID (abbreviated prefix allowed), branch name, or tag | +| project_dir | str | No | null | `viking://` URI of the subtree to restore; omit to restore the whole account tree | +| branch | str | No | `main` | Branch to advance | +| dry_run | bool | No | false | Compute and return the diff only; write nothing | +| message | str | No | null | Message for the new commit; auto-generated when omitted | +| author_name | str | No | null | Override the default author name | +| author_email | str | No | null | Override the default author email | + +**Python SDK (Embedded / HTTP)** + +```python +result = client.snapshot.restore( + project_dir="viking://resources/my_project", + source_commit="3f2a1b9c", + message="restore to v1", +) +print(result["result"], result["new_commit_oid"]) + +# Preview which files would change first +plan = client.snapshot.restore( + project_dir="viking://resources/my_project", + source_commit="3f2a1b9c", + dry_run=True, +) +print(plan["diff"]) +``` + +**HTTP API** + +``` +POST /api/v1/snapshot/restore +``` + +```bash +curl -X POST "http://localhost:1933/api/v1/snapshot/restore" \ + -H "Content-Type: application/json" \ + -H "X-API-Key: your-key" \ + -d '{ + "project_dir": "viking://resources/my_project", + "source_commit": "3f2a1b9c", + "message": "restore to v1" + }' +``` + +**CLI** + +```bash +# Positional args are <source_commit> then <project_dir> +ov snapshot restore 3f2a1b9c viking://resources/my_project -m "restore to v1" -o json + +# Dry run +ov snapshot restore 3f2a1b9c viking://resources/my_project --dry-run -o json +``` + +**Response (applied)** + +On a successful write that produces a new commit, `result` is `applied`. 
Note `parent_commit` equals the old (pre-restore) HEAD, confirming the forward-commit semantics: + +```json +{ + "status": "ok", + "result": { + "result": "applied", + "new_commit_oid": "c3d4e5f60718293a4b5c6d7e8f9a0b1c2d3e4f50", + "source_commit": "3f2a1b9c4d5e6f70819293a4b5c6d7e8f9a0b1c2", + "parent_commit": "9a0b1c2d3e4f5061728394a5b6c7d8e9f0a1b2c3", + "written": 1, + "deleted": 1, + "unchanged": 1, + "written_paths": ["resources/my_project/guide.md"], + "deleted_paths": ["resources/my_project/changelog.md"], + "task_id": "snapshot_restore_reindex-..." + } +} +``` + +When the restore has vector side effects (files written/deleted), the response carries a `task_id` you can poll via `GET /api/v1/tasks/{task_id}` to track the background vector rebuild. + +**Response (noop)** + +When the source is byte-identical to the current state, the result is `noop` and no new commit is created: + +```json +{ + "status": "ok", + "result": { + "result": "noop", + "head": "9a0b1c2d3e4f5061728394a5b6c7d8e9f0a1b2c3", + "source": "3f2a1b9c4d5e6f70819293a4b5c6d7e8f9a0b1c2" + } +} +``` + +**Response (dry_run)** + +With `dry_run=true`, only the planned diff is returned and nothing is written. Diff paths are relative to `project_dir`: + +```json +{ + "status": "ok", + "result": { + "result": "dry_run", + "head": "9a0b1c2d3e4f5061728394a5b6c7d8e9f0a1b2c3", + "source": "3f2a1b9c4d5e6f70819293a4b5c6d7e8f9a0b1c2", + "diff": { + "to_write": [{"path": "guide.md", "oid": "..."}], + "to_delete": ["changelog.md"], + "unchanged": ["notes/todo.md"] + } + } +} +``` + +## A Typical Flow + +A complete "commit → modify → restore" flow (Python SDK): + +```python +import openviking as ov + +client = ov.OpenViking() +client.initialize() + +root = "viking://resources/my_project" + +# 1. Write initial content and commit v1 +client.write(f"{root}/guide.md", "# Guide\n\nv1 content\n", mode="create", wait=True) +v1 = client.snapshot.commit(message="v1 initial import") + +# 2. 
Modify and commit v2 +client.write(f"{root}/guide.md", "# Guide\n\nv2 content\n", mode="replace", wait=True) +v2 = client.snapshot.commit(message="v2 update") + +# 3. Walk history +for c in client.snapshot.log(limit=10): + print(c["oid"][:8], c["message"]) + +# 4. Restore the workspace to v1 (creates a new commit on top of v2) +client.snapshot.restore(project_dir=root, source_commit=v1["commit_oid"], message="restore to v1") + +client.close() +``` + +For more end-to-end examples, see the [examples/snapshot/](file:///cloudide/workspace/OpenViking/examples/snapshot) directory in the repository, covering the SDK, HTTP, and CLI surfaces. + +## Error Handling + +| Scenario | HTTP Status | Error Code | +|----------|-------------|------------| +| Branch/commit not found, or `show`'s `path` does not exist in that commit | 404 | `NOT_FOUND` | +| Branch concurrently advanced during restore (CAS conflict) | 409 | `CONFLICT` | +| Request body contains an unknown field (request model is `extra="forbid"`) | 400 | `INVALID_ARGUMENT` | + +## Related Documentation + +- [File System](03-filesystem.md): snapshots build on filesystem resources +- [System](07-system.md): track the background vector rebuild triggered by restore via `GET /api/v1/tasks/{task_id}` +- [API Overview](01-overview.md): full endpoint reference diff --git a/docs/en/guides/13-ragfs-cache.md b/docs/en/guides/14-ragfs-cache.md similarity index 100% rename from docs/en/guides/13-ragfs-cache.md rename to docs/en/guides/14-ragfs-cache.md diff --git a/docs/en/guides/15-snapshot.md b/docs/en/guides/15-snapshot.md new file mode 100644 index 0000000000..5295f41dc9 --- /dev/null +++ b/docs/en/guides/15-snapshot.md @@ -0,0 +1,228 @@ +# Snapshots (Multi-Version Management) Guide + +This guide explains how to enable and use OpenViking's multi-version management (snapshots). 
On top of VikingFS, it provides Git-based `commit`/`log`/`show`/`restore` primitives, letting you save an account's resource tree as a series of immutable snapshots, walk history, compare versions, and restore the workspace to any past state. + +Multi-version management is powered by [gitoxide](https://github.com/Byron/gitoxide) embedded in the Rust RAGFS layer, maintaining one logical Git repository per `account_id`. It is fully transparent to callers — you never run any `git` command yourself. + +> For the full API reference of each command's parameters and responses, see [Snapshots API](../api/11-snapshot.md). + +## Prerequisites + +- You already have a working `ov.conf`. +- Resource read/write is verified to work (snapshots build on filesystem resources). +- If you choose the S3 backend for Git objects, prepare the bucket, region, endpoint, and access credentials first. + +## Enabling Multi-Version Management + +Multi-version management is **enabled** by default (`git.enabled` defaults to `true`). The Git object backend can be `local` (local filesystem) or `s3` (S3-compatible object storage); when `git.backend` is not set explicitly, it **inherits `storage.agfs.backend`** (a `memory` storage backend maps to `local`). To turn multi-version management off, set `git.enabled` to `false`. + +### Local Backend (recommended for single-node deployments) + +```json +{ + "storage": { + "workspace": "./data" + }, + "git": { + "enabled": true, + "backend": "local", + "default_branch": "main", + "author_name": "viking-bot", + "author_email": "bot@viking.local", + "local": { + "base_dir": "" + } + } +} +``` + +Configuration reference: + +| Field | Default | Description | +|-------|---------|-------------| +| `git.enabled` | `true` | Whether multi-version management is on. Set to `false` to disable snapshot commands | +| `git.backend` | inherits `storage.agfs.backend` | Git object backend: `local` or `s3`. 
When not set explicitly, inherits `storage.agfs.backend` (`memory` maps to `local`) | +| `git.default_branch` | `main` | Default branch name when none is specified | +| `git.author_name` | `viking-bot` | Default author name when callers omit `author_name` | +| `git.author_email` | `bot@viking.local` | Default author email | +| `git.local.base_dir` | `""` | Directory holding Git objects/refs. **When empty, defaults to `{storage.workspace}/.ovgit`** | + +> Usually leave `git.local.base_dir` empty so snapshot data lands in `.ovgit` under the workspace, making it easy to back up and migrate alongside resource data. + +### S3 Backend (recommended for distributed / cloud deployments) + +Stores Git objects and refs in S3-compatible object storage (e.g. Volcengine TOS, MinIO, AWS S3). When `backend` is `s3`, the `git.s3` section is **required**, and `bucket` and `region` must not be empty. + +> Tip: the `git.s3` fields `bucket`, `region`, `endpoint`, `access_key`, and `secret_key` **inherit the matching `storage.agfs.s3`** values when not set explicitly. So when `storage.agfs` is already configured as an s3 backend, you usually don't need to repeat `git.s3` — as long as `git.backend` is not set explicitly, multi-version management reuses the bucket and credentials from `storage.agfs`. + +```json +{ + "storage": { + "workspace": "./data" + }, + "git": { + "enabled": true, + "backend": "s3", + "default_branch": "main", + "author_name": "viking-bot", + "author_email": "bot@viking.local", + "s3": { + "bucket": "your-tos-bucket", + "region": "cn-beijing", + "endpoint": "https://tos-s3-cn-beijing.volces.com", + "access_key": "", + "secret_key": "", + "prefix": ".ovgit", + "use_path_style": false, + "cas_mode": "native" + } + } +} +``` + +Configuration reference: + +| Field | Default | Description | +|-------|---------|-------------| +| `git.s3.bucket` | inherits `storage.agfs.s3.bucket` | Bucket holding Git objects/refs. 
Required (may be inherited from `storage.agfs.s3`) | +| `git.s3.region` | inherits `storage.agfs.s3.region`, else `us-east-1` | Region the bucket is in. Required | +| `git.s3.prefix` | `.ovgit` | Key prefix; all data is stored under `{prefix}/{account}/...` | +| `git.s3.endpoint` | inherits `storage.agfs.s3.endpoint`, else `""` | Custom S3 endpoint (MinIO/TOS, etc.); leave empty for standard AWS S3 | +| `git.s3.access_key` / `git.s3.secret_key` | inherit the matching `storage.agfs.s3` fields, else `null` | Credentials read directly; empty falls back to the SDK default credentials chain | +| `git.s3.use_path_style` | `true` | `true` uses path-style addressing (MinIO, etc.); `false` uses virtual-host style (TOS, etc.) | +| `git.s3.cas_mode` | `native` | Ref CAS mode. `native` uses S3 conditional writes (If-Match) | + +After editing the config, restart the OpenViking service (or re-initialize the SDK client) for it to take effect. + +> The repository ships ready-to-use examples: [ov.conf.git-local.example](file:///cloudide/workspace/OpenViking/examples/snapshot/ov.conf.git-local.example) and [ov.conf.git-s3-tos.example](file:///cloudide/workspace/OpenViking/examples/snapshot/ov.conf.git-s3-tos.example). + +## Directory Layout Change: the `.ovgit` Directory + +When the `local` backend is enabled and `base_dir` is left empty, OpenViking adds a **`.ovgit`** directory under the workspace to hold Git objects and refs: + +```text +data/ # storage.workspace +├── viking/ # user-visible resource tree (viking:// maps here) +│ └── ... +└── .ovgit/ # multi-version management data (new) + └── {account_id}/ # one logical Git repository per account + ├── objects/ # Git objects (commit/tree/blob), standard fanout aa/bb... + ├── refs/ + │ └── heads/ + │ └── main # branch ref, content is a 40-hex OID + └── HEAD # current branch pointer, content "ref: refs/heads/main" +``` + +Key points: + +- `.ovgit` is an internal data directory. 
It is **not** exposed through `viking://` — users cannot see or modify it through the filesystem APIs (`ls`/`read`, etc.). +- Its layout matches a standard Git object store (content-addressed `objects/`, loose `refs/`), but it is managed automatically by OpenViking. You should **not** run `git` commands against it. +- When backing up or migrating the workspace, copy `.ovgit` along with it to preserve the full version history. +- With the `s3` backend, no local `.ovgit` directory is created; data lives under the bucket's `{prefix}/{account}/...` keys instead. + +## Usage + +Once enabled, all three surfaces expose snapshot commands. The examples below show a minimal "commit → modify → restore" flow. + +### Python SDK + +Snapshot methods live under the `client.snapshot.*` namespace. + +```python +import openviking as ov + +client = ov.OpenViking() +client.initialize() + +root = "viking://resources/my_project" + +# 1. Write initial content and commit v1 +client.write(f"{root}/guide.md", "# Guide\n\nv1 content\n", mode="create", wait=True) +v1 = client.snapshot.commit(message="v1 initial import") +print("v1:", v1["commit_oid"]) + +# 2. Modify and commit v2 +client.write(f"{root}/guide.md", "# Guide\n\nv2 content\n", mode="replace", wait=True) +v2 = client.snapshot.commit(message="v2 update") + +# 3. Walk history +for c in client.snapshot.log(limit=10): + print(c["oid"][:8], c["message"]) + +# 4. Inspect a commit's metadata +print(client.snapshot.show(v1["commit_oid"])["message"]) + +# 5. 
Restore the workspace to v1 (creates a new "forward" commit on top of v2) +client.snapshot.restore(project_dir=root, source_commit=v1["commit_oid"], message="restore to v1") + +client.close() +``` + +### CLI + +The CLI subcommands live under `ov snapshot`: + +```bash +# Commit the current workspace state +ov snapshot commit -m "v1 initial import" -o json + +# Walk history (newest first) +ov snapshot log --limit 10 -o json + +# View commit metadata +ov snapshot show <commit_oid> -o json + +# Read a file's content from a commit (defaults to stdout; use --out-file to write a local file) +ov snapshot show <commit_oid> --path viking://resources/my_project/guide.md --out-file ./guide.md + +# Restore a directory to a past snapshot (positional args are <source_commit> then <project_dir>) +ov snapshot restore <source_commit> viking://resources/my_project -m "restore to v1" -o json + +# Preview which files would change first +ov snapshot restore <source_commit> viking://resources/my_project --dry-run -o json +``` + +### HTTP API + +```bash +# Commit +curl -X POST "http://localhost:1933/api/v1/snapshot/commit" \ + -H "Content-Type: application/json" \ + -H "X-API-Key: your-key" \ + -d '{"message": "v1 initial import"}' + +# Walk history +curl -X GET "http://localhost:1933/api/v1/snapshot/log?branch=main&limit=10" \ + -H "X-API-Key: your-key" + +# View commit metadata +curl -X GET "http://localhost:1933/api/v1/snapshot/show?target_ref=<commit_oid>" \ + -H "X-API-Key: your-key" + +# Restore +curl -X POST "http://localhost:1933/api/v1/snapshot/restore" \ + -H "Content-Type: application/json" \ + -H "X-API-Key: your-key" \ + -d '{"project_dir": "viking://resources/my_project", "source_commit": "<commit_oid>", "message": "restore to v1"}' +``` + +## Key Semantics: Forward-Commit Restore + +`restore` uses **forward-commit** semantics: it reads the content at `source_commit`, writes the diff back into the workspace, and creates a **new commit on top of the current HEAD**. Therefore: + +- The new commit's parent is the HEAD that existed before the restore — **not** `source_commit`. 
+- HEAD always advances monotonically, and **history is never rewritten or lost** — going back to an older version is itself a new commit. +- `restore` only affects files within `project_dir` (the whole account tree when omitted); files outside that scope are left untouched. + +## Notes + +- After editing the `git` config, restart the service / re-initialize the client for it to take effect. +- With the `s3` backend, `git.s3.bucket` and `git.s3.region` are required; missing them causes initialization to fail. +- If a restore has vector side effects (files written/deleted), the response carries a `task_id` you can poll via `GET /api/v1/tasks/{task_id}` to track the background vector rebuild (see the [Observability guide](05-observability.md) and [API Overview](../api/01-overview.md)). +- Do not operate on the `.ovgit` directory with an external `git` tool; it is maintained by OpenViking. + +## Related Documentation + +- [Snapshots API](../api/11-snapshot.md): full reference of command parameters and responses +- [Configuration](01-configuration.md): full `ov.conf` reference +- [Multi-Write Storage Guide](13-multi-write-storage.md): multi-backend replication of resource data diff --git a/docs/zh/api/01-overview.md b/docs/zh/api/01-overview.md index b77a057460..d57ad5f892 100644 --- a/docs/zh/api/01-overview.md +++ b/docs/zh/api/01-overview.md @@ -431,6 +431,15 @@ JSON 输出 - 错误: | DELETE | `/api/v1/fs` | 删除资源 | | POST | `/api/v1/fs/mv` | 移动/重命名资源 | +### 快照端点(多版本管理) + +| 方法 | 路径 | 说明 | +|------|------|------| +| POST | `/api/v1/snapshot/commit` | 把当前工作区状态保存成新快照 | +| GET | `/api/v1/snapshot/log` | 从最新提交开始回溯历史 | +| GET | `/api/v1/snapshot/show` | 查看提交元数据,或读取提交中某个文件 | +| POST | `/api/v1/snapshot/restore` | 恢复目录或整棵账号树到某个历史快照(正向提交) | + ### 内容端点 | 方法 | 路径 | 说明 | @@ -564,6 +573,7 @@ VikingBot API 需要服务器启动时指定 `--with-bot` 选项: |------|------| | [资源管理](02-resources.md) | 资源和技能的添加、导入、导出 | | [文件系统](03-filesystem.md) | 目录操作、内容读写 | +| [多版本管理](11-snapshot.md) | 快照提交、历史回溯、版本恢复 | | 
[技能](04-skills.md) | 技能管理 API | | [会话管理](05-sessions.md) | 会话创建、消息管理、记忆提取 | | [检索](06-retrieval.md) | 搜索、关联、上下文获取 | diff --git a/docs/zh/api/11-snapshot.md b/docs/zh/api/11-snapshot.md new file mode 100644 index 0000000000..da1345f0c7 --- /dev/null +++ b/docs/zh/api/11-snapshot.md @@ -0,0 +1,427 @@ +# 多版本管理(快照) + +OpenViking 在 VikingFS 之上提供了一套基于 Git 的多版本管理能力,称为**快照(Snapshot)**。它把某个账号(account)下的整棵资源树保存成一系列不可变的提交(commit),让你能够回溯历史、对比版本,并把工作区恢复到任意一个历史状态。 + +快照能力底层由内嵌在 Rust RAGFS 层的 [gitoxide](https://github.com/Byron/gitoxide) 驱动,按 `account_id` 维护一个逻辑 Git 仓库(每个账号一个仓库),对调用方完全透明——你无需关心 `.ovgit` 目录、对象库或引用细节。 + +四个核心命令: + +| 命令 | 作用 | +|------|------| +| `commit` | 把当前工作区状态保存成一个新快照 | +| `log` | 从最新提交开始回溯历史 | +| `show` | 查看某个提交的元数据,或读取该提交中某个文件的内容 | +| `restore` | 把目录(或整棵账号树)恢复到某个历史快照的状态 | + +## 核心概念 + +- **提交(commit)**:一个快照对应一个提交,由 40 位十六进制的 SHA-1 `commit_oid` 唯一标识。多数命令也接受 OID 的缩写前缀,或分支名(如 `main`)。 +- **分支(branch)**:默认分支为 `main`。除非显式传入,所有命令都作用在 `main` 上。 +- **正向恢复(forward-commit restore)**:`restore` **不会**回退或改写历史。它会读取 `source_commit` 的内容,把差异写回工作区,并在当前 HEAD 之上**生成一个新的提交**。因此新提交的父提交是恢复操作发生前的 HEAD,而**不是** `source_commit`。HEAD 始终单调向前推进,历史永远不会丢失。 +- **作用范围**:`commit` 可以通过 `paths` 限定只快照部分 URI;`restore` 可以通过 `project_dir` 限定只恢复某个子目录,目录之外的文件保持不变。 + +## API 实现介绍 + +- HTTP 路由:[snapshot.py](file:///cloudide/workspace/OpenViking/openviking/server/routers/snapshot.py),前缀 `/api/v1/snapshot`。 +- 命名空间(SDK):[snapshot_namespace.py](file:///cloudide/workspace/OpenViking/openviking/snapshot_namespace.py),暴露为 `client.snapshot.*`。 +- 底层语义实现:[viking_fs.py](file:///cloudide/workspace/OpenViking/openviking/storage/viking_fs.py) 的 `commit` / `restore` / `show` / `log`。 +- CLI 命令:[main.rs](file:///cloudide/workspace/OpenViking/crates/ov_cli/src/main.rs) 的 `SnapshotCmd`,子命令 [snapshot.rs](file:///cloudide/workspace/OpenViking/crates/ov_cli/src/commands/snapshot.rs)。 + +## API 参考 + +### commit() + +把当前工作区状态保存成一个新的快照。 + +**参数** + +| 参数 | 类型 | 必填 | 默认值 | 说明 | +|------|------|------|--------|------| +| 
message | str | 是 | - | 提交说明 | +| paths | List[str] | 否 | null | 限定本次快照的 `viking://` URI 列表,条目可以是文件或目录;目录会按照快照的剪枝规则递归展开。`null` 表示对整棵账号树做快照。传入空列表 `[]` 表示显式的空路径集(不会产生改动)。如果某个路径在 VFS 和前一次快照中都不存在,会输出一条 warn,并按"对该名称下任何子树执行删除"处理 | +| branch | str | 否 | `main` | 要推进的分支 | +| author_name | str | 否 | null | 覆盖默认的提交者名字(默认 `viking-bot`) | +| author_email | str | 否 | null | 覆盖默认的提交者邮箱 | + +**Python SDK (Embedded / HTTP)** + +```python +result = client.snapshot.commit( + message="v1 initial import", + paths=["viking://resources/my_md.md"], +) +print(result["commit_oid"]) +``` + +**HTTP API** + +``` +POST /api/v1/snapshot/commit +``` + +```bash +curl -X POST "http://localhost:1933/api/v1/snapshot/commit" \ + -H "Content-Type: application/json" \ + -H "X-API-Key: your-key" \ + -d '{ + "message": "v1 initial import", + "paths": ["viking://resources/my_md.md"] + }' +``` + +**CLI** + +```bash +ov snapshot commit -m "v1 initial import" --paths viking://resources/my_md.md -o json +``` + +**响应** + +新建快照时: + +```json +{ + "status": "ok", + "result": { + "result": "created", + "commit_oid": "3f2a1b9c4d5e6f70819293a4b5c6d7e8f9a0b1c2", + "changed": 3 + } +} +``` + +当工作区相对上一次提交没有任何变化时返回 `noop`,`commit_oid` 为当前 HEAD: + +```json +{ + "status": "ok", + "result": { + "result": "noop", + "commit_oid": "3f2a1b9c4d5e6f70819293a4b5c6d7e8f9a0b1c2" + } +} +``` + +--- + +### log() + +从某个分支的 HEAD 开始,沿首个父提交(`parents[0]`)逐层回溯历史,按时间从新到旧返回提交列表。 + +**参数** + +| 参数 | 类型 | 必填 | 默认值 | 说明 | +|------|------|------|--------|------| +| branch | str | 否 | `main` | 要回溯的分支 | +| limit | int | 否 | 20 | 最多返回的提交数量。HTTP 接口限制范围为 1–500 | + +**Python SDK (Embedded / HTTP)** + +```python +history = client.snapshot.log(limit=10) +for commit in history: + print(commit["oid"], commit["message"]) +``` + +**HTTP API** + +``` +GET /api/v1/snapshot/log?branch={branch}&limit={limit} +``` + +```bash +curl -X GET "http://localhost:1933/api/v1/snapshot/log?branch=main&limit=10" \ + -H "X-API-Key: your-key" +``` + +**CLI** + +```bash +ov 
snapshot log --limit 10 -o json +``` + +**响应** + +`result` 是一个提交元数据列表,每个元素与 [show()](#show) 返回的提交元数据结构相同: + +```json +{ + "status": "ok", + "result": [ + { + "oid": "9a0b1c2d3e4f5061728394a5b6c7d8e9f0a1b2c3", + "tree": "11223344556677889900aabbccddeeff00112233", + "parents": ["3f2a1b9c4d5e6f70819293a4b5c6d7e8f9a0b1c2"], + "author": { + "name": "viking-bot", + "email": "bot@openviking.local", + "time_seconds": 1750300000, + "tz_offset_seconds": 28800 + }, + "committer": { + "name": "viking-bot", + "email": "bot@openviking.local", + "time_seconds": 1750300000, + "tz_offset_seconds": 28800 + }, + "message": "v2 modify delete add" + } + ] +} +``` + +> 当分支还没有任何提交时,HTTP 接口返回 `404 NOT_FOUND`。 + +--- + +### show() + +查看某个提交的元数据;如果同时指定 `path`,则返回该提交中对应文件的内容。 + +**参数** + +| 参数 | 类型 | 必填 | 默认值 | 说明 | +|------|------|------|--------|------| +| target_ref | str | 是 | - | 提交 OID(支持缩写前缀)、分支名或标签 | +| path | str | 否 | null | 某个文件的 `viking://` URI;省略时返回提交元数据 | + +**Python SDK (Embedded / HTTP)** + +```python +# 查看提交元数据 +meta = client.snapshot.show("3f2a1b9c") +print(meta["message"], meta["parents"]) + +# 读取该提交中某个文件的内容 +blob = client.snapshot.show("3f2a1b9c", path="viking://resources/my_project/guide.md") +``` + +> 注意:带 `path` 读取文件内容时,**Embedded(本地)客户端**直接返回原始 `bytes`;**HTTP 客户端**返回 `{"oid": str, "size": int, "bytes": bytes}` 字典。 + +**HTTP API** + +``` +GET /api/v1/snapshot/show?target_ref={ref}[&path={uri}] +``` + +```bash +# 提交元数据(返回 JSON) +curl -X GET "http://localhost:1933/api/v1/snapshot/show?target_ref=3f2a1b9c" \ + -H "X-API-Key: your-key" + +# 读取文件内容(返回二进制流) +curl -X GET "http://localhost:1933/api/v1/snapshot/show?target_ref=3f2a1b9c&path=viking://resources/my_project/guide.md" \ + -H "X-API-Key: your-key" +``` + +不带 `path` 时返回提交元数据 JSON;带 `path` 时返回原始字节流(`Content-Type: application/octet-stream`),并附带两个响应头: + +- `X-Snapshot-Oid`:blob 对象的 OID +- `X-Snapshot-Size`:blob 字节数 + +**CLI** + +```bash +# 提交元数据 +ov snapshot show 3f2a1b9c -o json + +# 读取文件内容(默认输出到 stdout,可用 --out-file 
写入本地文件) +ov snapshot show 3f2a1b9c --path viking://resources/my_project/guide.md --out-file ./guide.md +``` + +**响应(提交元数据)** + +```json +{ + "status": "ok", + "result": { + "oid": "3f2a1b9c4d5e6f70819293a4b5c6d7e8f9a0b1c2", + "tree": "00112233445566778899aabbccddeeff00112233", + "parents": [], + "author": { + "name": "viking-bot", + "email": "bot@openviking.local", + "time_seconds": 1750299000, + "tz_offset_seconds": 28800 + }, + "committer": { + "name": "viking-bot", + "email": "bot@openviking.local", + "time_seconds": 1750299000, + "tz_offset_seconds": 28800 + }, + "message": "v1 initial import" + } +} +``` + +--- + +### restore() + +把某个目录(或整棵账号树)恢复到 `source_commit` 时的状态。 + +这是**正向恢复**:它会计算 `source_commit` 与当前 HEAD 之间的差异并写回工作区,然后在当前 HEAD 之上生成一个**新的提交**。新提交的父提交是恢复前的 HEAD(而非 `source_commit`),历史不会被改写。`project_dir` 之外的文件保持不变。 + +**参数** + +| 参数 | 类型 | 必填 | 默认值 | 说明 | +|------|------|------|--------|------| +| source_commit | str | 是 | - | 要恢复到的来源:提交 OID(支持缩写前缀)、分支名或标签 | +| project_dir | str | 否 | null | 要恢复的子目录 `viking://` URI;省略时恢复整棵账号树 | +| branch | str | 否 | `main` | 要推进的分支 | +| dry_run | bool | 否 | false | 仅计算并返回差异,不做任何写入 | +| message | str | 否 | null | 新提交的说明;省略时自动生成 | +| author_name | str | 否 | null | 覆盖默认的提交者名字 | +| author_email | str | 否 | null | 覆盖默认的提交者邮箱 | + +**Python SDK (Embedded / HTTP)** + +```python +result = client.snapshot.restore( + project_dir="viking://resources/my_project", + source_commit="3f2a1b9c", + message="restore to v1", +) +print(result["result"], result["new_commit_oid"]) + +# 先预演,确认要改动哪些文件 +plan = client.snapshot.restore( + project_dir="viking://resources/my_project", + source_commit="3f2a1b9c", + dry_run=True, +) +print(plan["diff"]) +``` + +**HTTP API** + +``` +POST /api/v1/snapshot/restore +``` + +```bash +curl -X POST "http://localhost:1933/api/v1/snapshot/restore" \ + -H "Content-Type: application/json" \ + -H "X-API-Key: your-key" \ + -d '{ + "project_dir": "viking://resources/my_project", + "source_commit": "3f2a1b9c", + 
"message": "restore to v1" + }' +``` + +**CLI** + +```bash +# 位置参数依次为 +ov snapshot restore 3f2a1b9c viking://resources/my_project -m "restore to v1" -o json + +# 预演 +ov snapshot restore 3f2a1b9c viking://resources/my_project --dry-run -o json +``` + +**响应(applied)** + +成功写入并生成新提交时,`result` 为 `applied`。注意 `parent_commit` 等于恢复前的旧 HEAD,印证了正向恢复语义: + +```json +{ + "status": "ok", + "result": { + "result": "applied", + "new_commit_oid": "c3d4e5f60718293a4b5c6d7e8f9a0b1c2d3e4f50", + "source_commit": "3f2a1b9c4d5e6f70819293a4b5c6d7e8f9a0b1c2", + "parent_commit": "9a0b1c2d3e4f5061728394a5b6c7d8e9f0a1b2c3", + "written": 1, + "deleted": 1, + "unchanged": 1, + "written_paths": ["resources/my_project/guide.md"], + "deleted_paths": ["resources/my_project/changelog.md"], + "task_id": "snapshot_restore_reindex-..." + } +} +``` + +当恢复产生向量副作用(写入/删除文件)时,响应会附带一个 `task_id`,可通过 `GET /api/v1/tasks/{task_id}` 轮询后台向量重建进度。 + +**响应(noop)** + +来源与当前状态字节级一致、无需改动时返回 `noop`,不生成新提交: + +```json +{ + "status": "ok", + "result": { + "result": "noop", + "head": "9a0b1c2d3e4f5061728394a5b6c7d8e9f0a1b2c3", + "source": "3f2a1b9c4d5e6f70819293a4b5c6d7e8f9a0b1c2" + } +} +``` + +**响应(dry_run)** + +`dry_run=true` 时只返回计划差异,不做任何写入。差异中的路径均相对于 `project_dir`: + +```json +{ + "status": "ok", + "result": { + "result": "dry_run", + "head": "9a0b1c2d3e4f5061728394a5b6c7d8e9f0a1b2c3", + "source": "3f2a1b9c4d5e6f70819293a4b5c6d7e8f9a0b1c2", + "diff": { + "to_write": [{"path": "guide.md", "oid": "..."}], + "to_delete": ["changelog.md"], + "unchanged": ["notes/todo.md"] + } + } +} +``` + +## 典型流程 + +下面演示一个"提交 → 修改 → 恢复"的完整流程(Python SDK): + +```python +import openviking as ov + +client = ov.OpenViking() +client.initialize() + +root = "viking://resources/my_project" + +# 1. 写入初始内容并提交 v1 +client.write(f"{root}/guide.md", "# Guide\n\nv1 content\n", mode="create", wait=True) +v1 = client.snapshot.commit(message="v1 initial import") + +# 2. 
修改后再提交 v2 +client.write(f"{root}/guide.md", "# Guide\n\nv2 content\n", mode="replace", wait=True) +v2 = client.snapshot.commit(message="v2 update") + +# 3. 查看历史 +for c in client.snapshot.log(limit=10): + print(c["oid"][:8], c["message"]) + +# 4. 把工作区恢复到 v1(会在 v2 之上生成一个新提交) +client.snapshot.restore(project_dir=root, source_commit=v1["commit_oid"], message="restore to v1") + +client.close() +``` + +更多端到端示例参见仓库中的 [examples/snapshot/](file:///cloudide/workspace/OpenViking/examples/snapshot) 目录,涵盖 SDK、HTTP、CLI 三种调用方式。 + +## 错误处理 + +| 场景 | HTTP 状态码 | 错误码 | +|------|-------------|--------| +| 分支/提交不存在,或 `show` 的 `path` 在该提交中不存在 | 404 | `NOT_FOUND` | +| 恢复期间分支被并发提交改写(CAS 冲突) | 409 | `CONFLICT` | +| 请求体包含未知字段(请求模型为 `extra="forbid"`) | 400 | `INVALID_ARGUMENT` | + +## 相关文档 + +- [文件系统](03-filesystem.md):快照建立在文件系统资源之上 +- [系统](07-system.md):通过 `GET /api/v1/tasks/{task_id}` 跟踪 restore 触发的后台向量重建 +- [API 概览](01-overview.md):完整端点总览 diff --git a/docs/zh/guides/13-ragfs-cache.md b/docs/zh/guides/14-ragfs-cache.md similarity index 100% rename from docs/zh/guides/13-ragfs-cache.md rename to docs/zh/guides/14-ragfs-cache.md diff --git a/docs/zh/guides/15-snapshot.md b/docs/zh/guides/15-snapshot.md new file mode 100644 index 0000000000..6a6446832a --- /dev/null +++ b/docs/zh/guides/15-snapshot.md @@ -0,0 +1,228 @@ +# 多版本管理(快照)指南 + +本指南介绍如何启用并使用 OpenViking 的多版本管理(快照)能力。多版本管理在 VikingFS 之上提供基于 Git 的 `commit`/`log`/`show`/`restore` 原语,让你把账号下的资源树保存成一系列不可变快照,随时回溯历史、对比版本,并把工作区恢复到任意历史状态。 + +多版本管理由内嵌在 Rust RAGFS 层的 [gitoxide](https://github.com/Byron/gitoxide) 驱动,以 `account_id` 为粒度维护一个逻辑 Git 仓库(每个账号一个仓库),对调用方完全透明——你无需手动执行任何 `git` 命令。 + +> 关于各命令参数和响应结构的完整 API 参考,见 [多版本管理 API](../api/11-snapshot.md)。 + +## 前置条件 + +- 已有可用的 `ov.conf`。 +- 已确认资源的读写正常(多版本管理建立在文件系统资源之上)。 +- 如果选择 S3 后端存放 Git 对象,已准备好 bucket、region、endpoint 和访问凭据。 + +## 启用多版本管理 + +多版本管理默认**开启**(`git.enabled` 默认为 `true`)。Git 对象的存储后端可以选择 `local`(本地文件系统)或 `s3`(S3 兼容对象存储);当不显式设置 `git.backend` 时,会**自动继承 
`storage.agfs.backend`**(`storage.agfs.backend` 为 `memory` 时映射为 `local`)。如需关闭多版本管理,把 `git.enabled` 设为 `false` 即可。 + +### 本地后端(推荐用于单机部署) + +```json +{ + "storage": { + "workspace": "./data" + }, + "git": { + "enabled": true, + "backend": "local", + "default_branch": "main", + "author_name": "viking-bot", + "author_email": "bot@viking.local", + "local": { + "base_dir": "" + } + } +} +``` + +配置说明: + +| 字段 | 默认值 | 说明 | +|------|--------|------| +| `git.enabled` | `true` | 是否启用多版本管理。设为 `false` 可关闭快照功能 | +| `git.backend` | 继承 `storage.agfs.backend` | Git 对象后端:`local` 或 `s3`。不显式设置时继承 `storage.agfs.backend`(`memory` 映射为 `local`) | +| `git.default_branch` | `main` | 未显式指定时使用的默认分支名 | +| `git.author_name` | `viking-bot` | 调用方未传 `author_name` 时使用的默认提交者名字 | +| `git.author_email` | `bot@viking.local` | 默认提交者邮箱 | +| `git.local.base_dir` | `""` | Git 对象/引用的存放目录。**留空时默认使用 `{storage.workspace}/.ovgit`** | + +> 通常把 `git.local.base_dir` 留空即可,让快照数据自动落在工作区下的 `.ovgit` 目录,便于和资源数据一起备份与迁移。 + +### S3 后端(推荐用于分布式/云端部署) + +把 Git 对象与引用存到 S3 兼容对象存储(如火山引擎 TOS、MinIO、AWS S3)。当 `backend` 为 `s3` 时,**必须**提供 `git.s3` 段,且 `bucket`、`region` 不能为空。 + +> 提示:`git.s3` 的 `bucket`、`region`、`endpoint`、`access_key`、`secret_key` 在未显式设置时会**自动继承 `storage.agfs.s3`** 的对应字段。因此当 `storage.agfs` 已经配置为 s3 后端时,通常无需重复填写 `git.s3`——只要不显式设置 `git.backend`,多版本管理会直接复用 `storage.agfs` 的 bucket 与访问凭据。 + +```json +{ + "storage": { + "workspace": "./data" + }, + "git": { + "enabled": true, + "backend": "s3", + "default_branch": "main", + "author_name": "viking-bot", + "author_email": "bot@viking.local", + "s3": { + "bucket": "your-tos-bucket", + "region": "cn-beijing", + "endpoint": "https://tos-s3-cn-beijing.volces.com", + "access_key": "", + "secret_key": "", + "prefix": ".ovgit", + "use_path_style": false, + "cas_mode": "native" + } + } +} +``` + +配置说明: + +| 字段 | 默认值 | 说明 | +|------|--------|------| +| `git.s3.bucket` | 继承 `storage.agfs.s3.bucket` | 存放 Git 对象/引用的 bucket,必填(可由 `storage.agfs.s3` 继承) | +| `git.s3.region` | 继承 
`storage.agfs.s3.region`,否则 `us-east-1` | bucket 所在区域,必填 | +| `git.s3.prefix` | `.ovgit` | 键前缀,所有数据存放在 `{prefix}/{account}/...` 下 | +| `git.s3.endpoint` | 继承 `storage.agfs.s3.endpoint`,否则 `""` | 自定义 S3 端点(MinIO/TOS 等);标准 AWS S3 留空 | +| `git.s3.access_key` / `git.s3.secret_key` | 继承 `storage.agfs.s3` 对应字段,否则 `null` | 直接读取的凭据;留空则走 SDK 默认凭据链 | +| `git.s3.use_path_style` | `true` | `true` 用 path-style 寻址(MinIO 等);`false` 用 virtual-host 寻址(TOS 等) | +| `git.s3.cas_mode` | `native` | 引用 CAS 模式。`native` 使用 S3 条件写(If-Match) | + +修改配置后,重启 OpenViking 服务(或重新初始化 SDK 客户端)使其生效。 + +> 仓库中提供了可直接参考的完整示例:[ov.conf.git-local.example](file:///cloudide/workspace/OpenViking/examples/snapshot/ov.conf.git-local.example) 与 [ov.conf.git-s3-tos.example](file:///cloudide/workspace/OpenViking/examples/snapshot/ov.conf.git-s3-tos.example)。 + +## 目录结构变化:`.ovgit` 目录 + +启用 `local` 后端且 `base_dir` 留空时,OpenViking 会在工作区下新增一个 **`.ovgit`** 目录用于存放 Git 对象和引用: + +```text +data/ # storage.workspace +├── viking/ # 用户可见的资源树(viking:// 映射到这里) +│ └── ... +└── .ovgit/ # 多版本管理数据(新增) + └── {account_id}/ # 每个账号一个逻辑 Git 仓库 + ├── objects/ # Git 对象(commit/tree/blob),标准 fanout 布局 aa/bb... + ├── refs/ + │ └── heads/ + │ └── main # 分支引用,内容为 40 位十六进制 OID + └── HEAD # 当前分支指针,内容为 "ref: refs/heads/main" +``` + +要点: + +- `.ovgit` 是内部数据目录,**不会**通过 `viking://` 暴露,用户在文件系统 API(`ls`/`read` 等)中看不到也无法修改它。 +- 它与 Git 的标准对象库布局一致(内容寻址的 `objects/`、loose 引用的 `refs/`),但由 OpenViking 自动管理,**无需也不应**手动运行 `git` 命令去操作它。 +- 备份或迁移工作区时,把 `.ovgit` 一并复制即可保留完整的版本历史。 +- 选择 `s3` 后端时,不会创建本地 `.ovgit` 目录,数据改为存放在 bucket 的 `{prefix}/{account}/...` 键下。 + +## 使用方法 + +启用后,三种调用方式都会出现快照相关命令。下面以一个"提交 → 修改 → 恢复"的最小流程演示。 + +### Python SDK + +快照方法挂在 `client.snapshot.*` 命名空间下。 + +```python +import openviking as ov + +client = ov.OpenViking() +client.initialize() + +root = "viking://resources/my_project" + +# 1. 
写入初始内容并提交 v1 +client.write(f"{root}/guide.md", "# Guide\n\nv1 content\n", mode="create", wait=True) +v1 = client.snapshot.commit(message="v1 initial import") +print("v1:", v1["commit_oid"]) + +# 2. 修改后再提交 v2 +client.write(f"{root}/guide.md", "# Guide\n\nv2 content\n", mode="replace", wait=True) +v2 = client.snapshot.commit(message="v2 update") + +# 3. 查看历史 +for c in client.snapshot.log(limit=10): + print(c["oid"][:8], c["message"]) + +# 4. 查看某个提交的元数据 +print(client.snapshot.show(v1["commit_oid"])["message"]) + +# 5. 把工作区恢复到 v1(会在 v2 之上生成一个新的“正向”提交) +client.snapshot.restore(project_dir=root, source_commit=v1["commit_oid"], message="restore to v1") + +client.close() +``` + +### CLI + +CLI 子命令位于 `ov snapshot` 下: + +```bash +# 提交当前工作区状态 +ov snapshot commit -m "v1 initial import" -o json + +# 回溯历史(最新在前) +ov snapshot log --limit 10 -o json + +# 查看提交元数据 +ov snapshot show <commit_oid> -o json + +# 读取某个提交中的文件内容(默认输出到 stdout,可用 --out-file 写入本地文件) +ov snapshot show <commit_oid> --path viking://resources/my_project/guide.md --out-file ./guide.md + +# 把目录恢复到某个历史快照(位置参数依次为 <source_commit> <project_dir>) +ov snapshot restore <source_commit> viking://resources/my_project -m "restore to v1" -o json + +# 先预演,确认会改动哪些文件 +ov snapshot restore <source_commit> viking://resources/my_project --dry-run -o json +``` + +### HTTP API + +```bash +# 提交 +curl -X POST "http://localhost:1933/api/v1/snapshot/commit" \ + -H "Content-Type: application/json" \ + -H "X-API-Key: your-key" \ + -d '{"message": "v1 initial import"}' + +# 回溯历史 +curl -X GET "http://localhost:1933/api/v1/snapshot/log?branch=main&limit=10" \ + -H "X-API-Key: your-key" + +# 查看提交元数据 +curl -X GET "http://localhost:1933/api/v1/snapshot/show?target_ref=<commit_oid>" \ + -H "X-API-Key: your-key" + +# 恢复 +curl -X POST "http://localhost:1933/api/v1/snapshot/restore" \ + -H "Content-Type: application/json" \ + -H "X-API-Key: your-key" \ + -d '{"project_dir": "viking://resources/my_project", "source_commit": "<commit_oid>", "message": "restore to v1"}' +``` + +## 重要语义:正向恢复 + +`restore` 采用**正向恢复(forward-commit)**:它读取 `source_commit` 
的内容,把差异写回工作区,并在**当前 HEAD 之上生成一个新的提交**。因此: + +- 新提交的父提交是恢复操作发生前的 HEAD,**不是** `source_commit`。 +- HEAD 始终单调向前推进,**历史永远不会被改写或丢失**——回到旧版本本身也是一次新的提交。 +- `restore` 只影响 `project_dir`(省略时为整棵账号树)范围内的文件,范围之外的文件保持不变。 + +## 注意事项 + +- 修改 `git` 配置后必须重启服务 / 重新初始化客户端才能生效。 +- 启用 `s3` 后端时,`git.s3.bucket` 与 `git.s3.region` 为必填项,缺失会导致初始化失败。 +- 恢复操作如涉及向量副作用(写入/删除文件),响应会返回一个 `task_id`,可通过 `GET /api/v1/tasks/{task_id}` 轮询后台向量重建进度(参见 [系统指南](05-observability.md) 与 [API 概览](../api/01-overview.md))。 +- 不要手动用外部 `git` 工具去操作 `.ovgit` 目录,它由 OpenViking 维护。 + +## 相关文档 + +- [多版本管理 API](../api/11-snapshot.md):命令参数与响应的完整参考 +- [配置说明](01-configuration.md):`ov.conf` 完整配置项 +- [多写存储指南](13-multi-write-storage.md):资源数据的多后端复制 diff --git a/examples/snapshot/ov.conf.git-local.example b/examples/snapshot/ov.conf.git-local.example new file mode 100644 index 0000000000..46d0de819a --- /dev/null +++ b/examples/snapshot/ov.conf.git-local.example @@ -0,0 +1,37 @@ +{ + "storage": { + "workspace": "./data" + }, + "embedding": { + "dense": { + "model": "doubao-embedding-vision-251215", + "api_key": "{your-api-key}", + "api_base": "https://ark.cn-beijing.volces.com/api/v3", + "dimension": 1024, + "provider": "volcengine", + "input": "multimodal" + } + }, + "vlm": { + "model": "doubao-seed-2-0-pro-260215", + "api_key": "{your-api-key}", + "api_base": "https://ark.cn-beijing.volces.com/api/v3", + "temperature": 0.0, + "max_retries": 2, + "provider": "volcengine", + "thinking": false + }, + "server": { + "host": "127.0.0.1" + }, + "git": { + "enabled": true, + "backend": "local", + "default_branch": "main", + "author_name": "viking-bot", + "author_email": "bot@viking.local", + "local": { + "base_dir": "" + } + } +} diff --git a/examples/snapshot/ov.conf.git-s3-tos.example b/examples/snapshot/ov.conf.git-s3-tos.example new file mode 100644 index 0000000000..50f5b0b34b --- /dev/null +++ b/examples/snapshot/ov.conf.git-s3-tos.example @@ -0,0 +1,44 @@ +{ + "storage": { + "workspace": "./data" + }, + "embedding": { + "dense": { 
+ "model": "doubao-embedding-vision-251215", + "api_key": "{your-api-key}", + "api_base": "https://ark.cn-beijing.volces.com/api/v3", + "dimension": 1024, + "provider": "volcengine", + "input": "multimodal" + } + }, + "vlm": { + "model": "doubao-seed-2-0-pro-260215", + "api_key": "{your-api-key}", + "api_base": "https://ark.cn-beijing.volces.com/api/v3", + "temperature": 0.0, + "max_retries": 2, + "provider": "volcengine", + "thinking": false + }, + "server": { + "host": "127.0.0.1" + }, + "git": { + "enabled": true, + "backend": "s3", + "default_branch": "main", + "author_name": "viking-bot", + "author_email": "bot@viking.local", + "s3": { + "bucket": "your-tos-bucket", + "region": "cn-beijing", + "endpoint": "https://tos-s3-cn-beijing.volces.com", + "access_key": "", + "secret_key": "", + "prefix": ".ovgit", + "use_path_style": false, + "cas_mode": "native" + } + } +} \ No newline at end of file diff --git a/examples/snapshot/snapshot_cli_test.py b/examples/snapshot/snapshot_cli_test.py new file mode 100644 index 0000000000..f379e915fe --- /dev/null +++ b/examples/snapshot/snapshot_cli_test.py @@ -0,0 +1,218 @@ +from __future__ import annotations + +import json +import os +import subprocess +import time +import uuid +from pathlib import Path +from typing import Any + +OVCLI_CONFIG_FILE = "/home/byteide/.openviking/ovcli.conf" +CLI_BIN = "ov" +WORKSPACE_URI = "viking://resources/snapshot_cli_demo" +COMMAND_TIMEOUT = 180 + + +def unique_run_uri() -> tuple[str, str]: + run_id = f"{int(time.time())}_{uuid.uuid4().hex[:8]}" + return run_id, f"{WORKSPACE_URI}_{run_id}" + + +def resource_uris(root_uri: str) -> dict[str, str]: + return { + "guide": f"{root_uri}/guide.md", + "todo": f"{root_uri}/notes/todo.md", + "changelog": f"{root_uri}/changelog.md", + "archive": f"{root_uri}/archive/old.md", + } + + +def print_section(title: str) -> None: + print(f"\n{'=' * 20} {title} {'=' * 20}") + + +def parse_json(stdout: str) -> dict[str, Any]: + depth = 0 + start: int | None = 
None + in_string = False + escaped = False + + for index, char in enumerate(stdout): + if start is None: + if char == "{": + start = index + depth = 1 + continue + + if in_string: + if escaped: + escaped = False + elif char == "\\": + escaped = True + elif char == '"': + in_string = False + continue + + if char == '"': + in_string = True + elif char == "{": + depth += 1 + elif char == "}": + depth -= 1 + if depth == 0: + try: + return json.loads(stdout[start : index + 1]) + except json.JSONDecodeError: + return {} + return {} + + +def print_commit_oid(label: str, snapshot: dict[str, Any]) -> None: + print(f"{label} commit_oid: {snapshot.get('commit_oid') or ''}") + + +def show_snapshot(label: str, snapshot: dict[str, Any]) -> None: + commit_oid = snapshot.get("commit_oid") + if not commit_oid: + raise RuntimeError(f"{label} snapshot output did not include commit_oid: {snapshot}") + print(f"showing {label} commit_oid: {commit_oid}") + run_ov(["snapshot", "show", commit_oid, "-o", "json"]) + + +def run_ov(args: list[str], check: bool = True) -> subprocess.CompletedProcess[str]: + env = os.environ.copy() + env["OPENVIKING_CLI_CONFIG_FILE"] = str(Path(OVCLI_CONFIG_FILE).resolve()) + print(f"\n$ {CLI_BIN} {' '.join(args)}") + proc = subprocess.run( + [CLI_BIN] + args, + capture_output=True, + text=True, + timeout=COMMAND_TIMEOUT, + env=env, + ) + print("--- stdout ---") + print(proc.stdout.rstrip()) + print("--- stderr ---") + print(proc.stderr.rstrip()) + if check and proc.returncode != 0: + raise RuntimeError(f"command failed with exit code {proc.returncode}: {CLI_BIN} {' '.join(args)}") + return proc + + +def mkdir(uri: str) -> None: + run_ov(["mkdir", uri, "-o", "json"]) + + +def write_text(uri: str, content: str, mode: str) -> None: + run_ov(["write", uri, "--content", content, "--mode", mode, "--wait", "-o", "json"]) + + +def remove_resource(uri: str) -> None: + run_ov(["rm", uri, "--wait", "-o", "json"]) + + +def print_find(query: str, root_uri: str) -> None: + 
run_ov(["find", query, "--uri", root_uri, "--limit", "10", "-o", "json"]) + + +def commit_snapshot(message: str) -> dict[str, Any]: + proc = run_ov(["snapshot", "commit", "-m", message, "-o", "json"]) + return parse_json(proc.stdout).get("result") or {} + + +def get_task(task_id: str) -> dict[str, Any]: + proc = run_ov(["task", "status", task_id, "-o", "json"]) + return parse_json(proc.stdout).get("result") or {} + + +def wait_for_task(task_id: str | None, *, timeout: float = COMMAND_TIMEOUT, poll_interval: float = 0.5) -> None: + """Poll a background task by id until it reaches a terminal state. + + restore schedules vectorization/indexing asynchronously and returns a + ``task_id``; finding before that task completes can read stale vectors. + """ + if not task_id: + print("wait_for_task: no task_id (no vector side-effects to await)") + return + deadline = time.time() + timeout + while True: + task = get_task(task_id) + status = task.get("status") + if status in ("completed", "failed"): + print(f"wait_for_task {task_id[:12]}: {status}") + if status == "failed": + raise RuntimeError(f"task {task_id} failed: {task.get('error')}") + return + if time.time() > deadline: + raise TimeoutError(f"task {task_id} not complete after {timeout}s (status={status})") + time.sleep(poll_interval) + + +def main() -> None: + run_id, root_uri = unique_run_uri() + uris = resource_uris(root_uri) + alpha = f"alpha_{run_id}" + beta = f"beta_{run_id}" + todo = f"todo_{run_id}" + changelog = f"changelog_{run_id}" + gamma = f"gamma_{run_id}" + archive = f"archive_{run_id}" + + print_section("setup") + print(f"config: {Path(OVCLI_CONFIG_FILE).resolve()}") + print(f"workspace: {root_uri}") + mkdir(root_uri) + mkdir(f"{root_uri}/notes") + + print_section("v1 initial import") + write_text(uris["guide"], f"# Guide\n\nInitial CLI content with {alpha}.\n", "create") + write_text(uris["todo"], f"# Todo\n\nRemember {todo}.\n", "create") + v1 = commit_snapshot("cli v1 initial import") + 
print_commit_oid("v1", v1) + print_find(alpha, root_uri) + + print_section("v2 modify delete add") + write_text(uris["guide"], f"# Guide\n\nUpdated CLI content with {beta}.\n", "replace") + remove_resource(uris["todo"]) + write_text(uris["changelog"], f"# Changelog\n\nCreated {changelog}.\n", "create") + v2 = commit_snapshot("cli v2 modify delete add") + print_commit_oid("v2", v2) + print_find(beta, root_uri) + print_find(todo, root_uri) + print_find(changelog, root_uri) + + print_section("v3 second changes") + mkdir(f"{root_uri}/archive") + write_text(uris["changelog"], f"# Changelog\n\nCreated {changelog}. Added {gamma}.\n", "replace") + write_text(uris["archive"], f"# Archive\n\nArchived marker {archive}.\n", "create") + v3 = commit_snapshot("cli v3 second changes") + print_commit_oid("v3", v3) + print_find(gamma, root_uri) + print_find(archive, root_uri) + run_ov(["snapshot", "log", "--limit", "10", "-o", "json"]) + show_snapshot("v1", v1) + show_snapshot("v2", v2) + show_snapshot("v3", v3) + + print_section("restore to v1") + source_commit = v1.get("commit_oid") + if not source_commit: + raise RuntimeError(f"snapshot commit output did not include commit_oid: {v1}") + restore_proc = run_ov(["snapshot", "restore", source_commit, root_uri, "-m", "cli restore to v1", "-o", "json"]) + restore = parse_json(restore_proc.stdout).get("result") or {} + wait_for_task(restore.get("task_id")) + run_ov(["ls", root_uri, "--recursive", "-o", "json"]) + run_ov(["read", uris["guide"]]) + run_ov(["read", uris["todo"]]) + print_find(alpha, root_uri) + print_find(beta, root_uri) + print_find(changelog, root_uri) + run_ov(["snapshot", "log", "--limit", "10", "-o", "json"]) + + print_section("done") + print("CLI snapshot multi-version example finished") + + +if __name__ == "__main__": + main() diff --git a/examples/snapshot/snapshot_example.py b/examples/snapshot/snapshot_example.py new file mode 100644 index 0000000000..7173f8a3c8 --- /dev/null +++ 
b/examples/snapshot/snapshot_example.py @@ -0,0 +1,184 @@ +from __future__ import annotations + +import os +import time +import uuid +from pathlib import Path +from typing import Any + +OV_CONFIG_FILE = "/home/byteide/.openviking/ov.conf" +WORKSPACE_URI = "viking://resources/snapshot_sdk_demo" +WAIT_TIMEOUT = 180.0 + + +def unique_run_uri() -> tuple[str, str]: + run_id = f"{int(time.time())}_{uuid.uuid4().hex[:8]}" + return run_id, f"{WORKSPACE_URI}_{run_id}" + + +def resource_uris(root_uri: str) -> dict[str, str]: + return { + "guide": f"{root_uri}/guide.md", + "todo": f"{root_uri}/notes/todo.md", + "changelog": f"{root_uri}/changelog.md", + "archive": f"{root_uri}/archive/old.md", + } + + +def print_section(title: str) -> None: + print(f"\n{'=' * 20} {title} {'=' * 20}") + + +def short_oid(commit_oid: str | None) -> str: + return commit_oid[:12] if commit_oid else "" + + +def write_text(client: Any, uri: str, content: str, mode: str) -> None: + result = client.write(uri, content, mode=mode, wait=True, timeout=WAIT_TIMEOUT) + print(f"write: {uri} (mode={result.get('mode')}, bytes={result.get('written_bytes')})") + + +def remove_resource(client: Any, uri: str) -> None: + client.rm(uri, wait=True, timeout=WAIT_TIMEOUT) + print(f"rm: {uri}") + + +def print_find(client: Any, query: str, root_uri: str) -> None: + results = client.find(query, target_uri=root_uri, limit=10) + if not results.resources: + print(f"find {query!r}: (no matches)") + return + print(f"find {query!r}: {len(results.resources)} match(es)") + for r in results.resources: + print(f" {r.uri} (score: {r.score:.4f})") + + +def print_read(client: Any, uri: str) -> None: + content = client.read(uri) + first_line = content.splitlines()[0] if content else "" + print(f"read {uri}: {len(content)} chars | {first_line}") + + +def commit_snapshot(client: Any, message: str) -> dict[str, Any]: + result = client.snapshot.commit(message=message) + print(f"commit {message!r}: result={result.get('result')} 
oid={short_oid(result.get('commit_oid'))}") + return result + + +def wait_for_task( + client: Any, + task_id: str | None, + *, + timeout: float = WAIT_TIMEOUT, + poll_interval: float = 0.5, +) -> None: + """Poll a background task by id until it reaches a terminal state. + + restore schedules vectorization/indexing asynchronously and returns a + ``task_id``; finding before that task completes can read stale vectors. + """ + if not task_id: + print("wait_for_task: no task_id (no vector side-effects to await)") + return + deadline = time.time() + timeout + while True: + task = client.get_task(task_id) or {} + status = task.get("status") + if status in ("completed", "failed"): + print(f"wait_for_task {task_id[:12]}: {status}") + if status == "failed": + raise RuntimeError(f"task {task_id} failed: {task.get('error')}") + return + if time.time() > deadline: + raise TimeoutError(f"task {task_id} not complete after {timeout}s (status={status})") + time.sleep(poll_interval) + + +def main() -> None: + os.environ["OPENVIKING_CONFIG_FILE"] = str(Path(OV_CONFIG_FILE).resolve()) + + import openviking as ov + + run_id, root_uri = unique_run_uri() + uris = resource_uris(root_uri) + alpha = f"alpha_{run_id}" + beta = f"beta_{run_id}" + todo = f"todo_{run_id}" + changelog = f"changelog_{run_id}" + gamma = f"gamma_{run_id}" + archive = f"archive_{run_id}" + + client = ov.OpenViking(path="./data") + client.initialize() + try: + print_section("setup") + print(f"config: {Path(OV_CONFIG_FILE).resolve()}") + print(f"workspace: {root_uri}") + client.mkdir(root_uri) + client.mkdir(f"{root_uri}/notes") + print(f"mkdir: {root_uri}, {root_uri}/notes") + + print_section("v1 initial import") + write_text(client, uris["guide"], f"# Guide\n\nInitial SDK content with {alpha}.\n", mode="create") + write_text(client, uris["todo"], f"# Todo\n\nRemember {todo}.\n", mode="create") + v1 = commit_snapshot(client, "sdk v1 initial import") + print_find(client, alpha, root_uri) + + print_section("v2 modify 
delete add") + write_text(client, uris["guide"], f"# Guide\n\nUpdated SDK content with {beta}.\n", mode="replace") + remove_resource(client, uris["todo"]) + write_text(client, uris["changelog"], f"# Changelog\n\nCreated {changelog}.\n", mode="create") + v2 = commit_snapshot(client, "sdk v2 modify delete add") + print_find(client, beta, root_uri) + print_find(client, todo, root_uri) + print_find(client, changelog, root_uri) + + print_section("v3 second changes") + client.mkdir(f"{root_uri}/archive") + print(f"mkdir: {root_uri}/archive") + write_text(client, uris["changelog"], f"# Changelog\n\nCreated {changelog}. Added {gamma}.\n", mode="replace") + write_text(client, uris["archive"], f"# Archive\n\nArchived marker {archive}.\n", mode="create") + v3 = commit_snapshot(client, "sdk v3 second changes") + print_find(client, gamma, root_uri) + print_find(client, archive, root_uri) + log_before = client.snapshot.log(limit=10) + print(f"snapshot log: {len(log_before)} commit(s)") + for commit in log_before: + print(f" {short_oid(commit.get('oid'))} {commit.get('message', '')}") + for label, snap in (("v1", v1), ("v2", v2), ("v3", v3)): + meta = client.snapshot.show(snap["commit_oid"]) + print(f"snapshot show {label}: oid={short_oid(meta.get('oid'))} message={meta.get('message', '')!r}") + + print_section("restore to v1") + restore = client.snapshot.restore( + project_dir=root_uri, + source_commit=v1["commit_oid"], + message="sdk restore to v1", + ) + print( + f"snapshot restore: result={restore.get('result')} oid={short_oid(restore.get('commit_oid'))} " + f"written={len(restore.get('written_paths') or [])} deleted={len(restore.get('deleted_paths') or [])}" + ) + wait_for_task(client, restore.get("task_id")) + entries = client.ls(root_uri, recursive=True) + print(f"ls after restore: {len(entries)} entry(ies)") + for entry in entries: + print(f" {entry.get('uri') if isinstance(entry, dict) else entry}") + print_read(client, uris["guide"]) + print_read(client, uris["todo"]) + 
print_find(client, alpha, root_uri) + print_find(client, beta, root_uri) + print_find(client, changelog, root_uri) + log_after = client.snapshot.log(limit=10) + print(f"snapshot log after restore: {len(log_after)} commit(s)") + for commit in log_after: + print(f" {short_oid(commit.get('oid'))} {commit.get('message', '')}") + + print_section("done") + print("Python SDK snapshot multi-version example finished") + finally: + client.close() + + +if __name__ == "__main__": + main() diff --git a/examples/snapshot/snapshot_http_api_test.py b/examples/snapshot/snapshot_http_api_test.py new file mode 100644 index 0000000000..74306f6580 --- /dev/null +++ b/examples/snapshot/snapshot_http_api_test.py @@ -0,0 +1,231 @@ +from __future__ import annotations + +import pprint +import time +import uuid +from pathlib import Path +from typing import Any + +import httpx + +from openviking_cli.utils.config.ovcli_config import load_ovcli_config + +OVCLI_CONFIG_FILE = "/home/byteide/.openviking/ovcli.conf" +WORKSPACE_URI = "viking://resources/snapshot_http_demo" +WAIT_TIMEOUT = 180.0 + + +def unique_run_uri() -> tuple[str, str]: + run_id = f"{int(time.time())}_{uuid.uuid4().hex[:8]}" + return run_id, f"{WORKSPACE_URI}_{run_id}" + + +def resource_uris(root_uri: str) -> dict[str, str]: + return { + "guide": f"{root_uri}/guide.md", + "todo": f"{root_uri}/notes/todo.md", + "changelog": f"{root_uri}/changelog.md", + "archive": f"{root_uri}/archive/old.md", + } + + +def print_section(title: str) -> None: + print(f"\n{'=' * 20} {title} {'=' * 20}") + + +def print_result(label: str, value: Any) -> None: + print(f"\n--- {label} ---") + pprint.pp(value, width=120) + + +def build_headers(config: Any) -> dict[str, str]: + headers = dict(config.extra_headers or {}) + if config.api_key: + headers["X-API-Key"] = config.api_key + if config.account: + headers["X-OpenViking-Account"] = config.account + if config.user: + headers["X-OpenViking-User"] = config.user + if config.actor_peer_id: + 
headers["X-OpenViking-Actor-Peer"] = config.actor_peer_id + return headers + + +def request_json(client: httpx.Client, method: str, path: str, label: str | None = None, **kwargs: Any) -> dict[str, Any]: + response = client.request(method, path, **kwargs) + output_label = label or f"{method} {path}" + print_result(f"{output_label} status", response.status_code) + if response.headers.get("content-type", "").startswith("application/json"): + data = response.json() + print_result(f"{output_label} response", data) + response.raise_for_status() + return data + print(response.text) + response.raise_for_status() + return {"raw": response.text} + + +def mkdir(client: httpx.Client, uri: str) -> None: + request_json(client, "POST", "/api/v1/fs/mkdir", json={"uri": uri}) + + +def write_text(client: httpx.Client, uri: str, content: str, mode: str) -> None: + print(f"write: {uri}") + print(content.rstrip()) + request_json( + client, + "POST", + "/api/v1/content/write", + json={"uri": uri, "content": content, "mode": mode, "wait": True}, + ) + + +def remove_resource(client: httpx.Client, uri: str) -> None: + request_json(client, "DELETE", "/api/v1/fs", params={"uri": uri, "recursive": False, "wait": True}) + + +def read_text(client: httpx.Client, uri: str) -> None: + request_json(client, "GET", "/api/v1/content/read", label=f"read content after restore: {uri}", params={"uri": uri}) + + +def print_find(client: httpx.Client, query: str, root_uri: str) -> None: + request_json( + client, + "POST", + "/api/v1/search/find", + json={"query": query, "target_uri": root_uri, "limit": 10}, + ) + + +def commit_snapshot(client: httpx.Client, message: str) -> dict[str, Any]: + data = request_json( + client, + "POST", + "/api/v1/snapshot/commit", + json={"message": message}, + ) + return data.get("result") or {} + + +def show_snapshot(client: httpx.Client, target_ref: str) -> None: + request_json(client, "GET", "/api/v1/snapshot/show", label=f"snapshot show {target_ref}", params={"target_ref": 
target_ref}) + + +def wait_for_task(client: httpx.Client, task_id: str | None, *, timeout: float = WAIT_TIMEOUT, poll_interval: float = 0.5) -> None: + """Poll a background task by id until it reaches a terminal state. + + restore schedules vectorization/indexing asynchronously and returns a + ``task_id``; finding before that task completes can read stale vectors. + """ + if not task_id: + print("wait_for_task: no task_id (no vector side-effects to await)") + return + deadline = time.time() + timeout + while True: + data = request_json(client, "GET", f"/api/v1/tasks/{task_id}", label=f"task status {task_id[:12]}") + status = (data.get("result") or {}).get("status") + if status in ("completed", "failed"): + print(f"wait_for_task {task_id[:12]}: {status}") + if status == "failed": + raise RuntimeError(f"task {task_id} failed: {(data.get('result') or {}).get('error')}") + return + if time.time() > deadline: + raise TimeoutError(f"task {task_id} not complete after {timeout}s (status={status})") + time.sleep(poll_interval) + + +def main() -> None: + config_path = str(Path(OVCLI_CONFIG_FILE).resolve()) + config = load_ovcli_config(config_path) + if config is None or not config.url: + raise RuntimeError(f"missing url in {config_path}") + + run_id, root_uri = unique_run_uri() + uris = resource_uris(root_uri) + alpha = f"alpha_{run_id}" + beta = f"beta_{run_id}" + todo = f"todo_{run_id}" + changelog = f"changelog_{run_id}" + gamma = f"gamma_{run_id}" + archive = f"archive_{run_id}" + + with httpx.Client( + base_url=config.url.rstrip("/"), + headers=build_headers(config), + timeout=config.timeout, + ) as client: + print_section("setup") + print(f"config: {config_path}") + print(f"server: {config.url}") + print(f"workspace: {root_uri}") + mkdir(client, root_uri) + mkdir(client, f"{root_uri}/notes") + + print_section("v1 initial import") + write_text(client, uris["guide"], f"# Guide\n\nInitial HTTP content with {alpha}.\n", "create") + write_text(client, uris["todo"], f"# 
Todo\n\nRemember {todo}.\n", "create") + v1 = commit_snapshot(client, "http v1 initial import") + print_find(client, alpha, root_uri) + + print_section("v2 modify delete add") + write_text(client, uris["guide"], f"# Guide\n\nUpdated HTTP content with {beta}.\n", "replace") + remove_resource(client, uris["todo"]) + write_text(client, uris["changelog"], f"# Changelog\n\nCreated {changelog}.\n", "create") + v2 = commit_snapshot(client, "http v2 modify delete add") + print_find(client, beta, root_uri) + print_find(client, todo, root_uri) + print_find(client, changelog, root_uri) + + print_section("v3 second changes") + mkdir(client, f"{root_uri}/archive") + write_text(client, uris["changelog"], f"# Changelog\n\nCreated {changelog}. Added {gamma}.\n", "replace") + write_text(client, uris["archive"], f"# Archive\n\nArchived marker {archive}.\n", "create") + v3 = commit_snapshot(client, "http v3 second changes") + print_find(client, gamma, root_uri) + print_find(client, archive, root_uri) + request_json( + client, + "GET", + "/api/v1/snapshot/log", + label="snapshot log before restore", + params={"branch": "main", "limit": 10}, + ) + show_snapshot(client, v1.get("commit_oid", "")) + show_snapshot(client, v2.get("commit_oid", "")) + show_snapshot(client, v3.get("commit_oid", "")) + + print_section("restore to v1") + restore = request_json( + client, + "POST", + "/api/v1/snapshot/restore", + label=f"restore workspace to v1 {v1.get('commit_oid')}", + json={"project_dir": root_uri, "source_commit": v1.get("commit_oid"), "message": "http restore to v1"}, + ) + wait_for_task(client, (restore.get("result") or {}).get("task_id")) + request_json( + client, + "GET", + "/api/v1/fs/ls", + label="list workspace after restore", + params={"uri": root_uri, "recursive": True}, + ) + read_text(client, uris["guide"]) + read_text(client, uris["todo"]) + print_find(client, alpha, root_uri) + print_find(client, beta, root_uri) + print_find(client, changelog, root_uri) + request_json( + client, 
+ "GET", + "/api/v1/snapshot/log", + label="snapshot log after restore", + params={"branch": "main", "limit": 10}, + ) + + print_section("done") + print("HTTP API snapshot multi-version example finished") + + +if __name__ == "__main__": + main() diff --git a/openviking/async_client.py b/openviking/async_client.py index 2276174cc4..a0f427bfb7 100644 --- a/openviking/async_client.py +++ b/openviking/async_client.py @@ -64,6 +64,7 @@ def __init__( self.user = UserIdentifier.the_default_user() self._initialized = False + self._snapshot: Optional["AsyncSnapshotNamespace"] = None # Mark initialized only after LocalClient is successfully constructed. self._singleton_initialized = False @@ -333,6 +334,18 @@ async def add_resource( def _service(self): return self._client.service + @property + def snapshot(self) -> "AsyncSnapshotNamespace": + """Snapshot version control namespace. + + Lazy-initialized on first access so importing the client does not + pull in the snapshot module when it's not needed. + """ + if getattr(self, "_snapshot", None) is None: + from openviking.snapshot_namespace import AsyncSnapshotNamespace + self._snapshot = AsyncSnapshotNamespace(self) + return self._snapshot + async def wait_processed(self, timeout: float = None) -> Dict[str, Any]: """Wait for all queued processing to complete.""" await self._ensure_initialized() diff --git a/openviking/client/local.py b/openviking/client/local.py index 2741a1cc56..1b0a04210e 100644 --- a/openviking/client/local.py +++ b/openviking/client/local.py @@ -784,6 +784,68 @@ async def restore_ovpack( vector_mode=vector_mode, ) + # ============= Git Version Control ============= + + async def git_commit( + self, + *, + message: str, + paths: Optional[List[str]] = None, + branch: str = "main", + author_name: Optional[str] = None, + author_email: Optional[str] = None, + ) -> Dict[str, Any]: + """Create a git snapshot. 
See VikingFS.commit for semantics.""" + return await self._service.fs.commit( + message=message, + paths=paths, + branch=branch, + author_name=author_name, + author_email=author_email, + ctx=self._ctx, + ) + + async def git_restore( + self, + *, + project_dir: Optional[str] = None, + source_commit: str, + branch: str = "main", + dry_run: bool = False, + message: Optional[str] = None, + author_name: Optional[str] = None, + author_email: Optional[str] = None, + ) -> Dict[str, Any]: + """Restore a subtree, or the full account tree when project_dir is omitted.""" + return await self._service.fs.restore( + project_dir=project_dir, + source_commit=source_commit, + branch=branch, + dry_run=dry_run, + message=message, + author_name=author_name, + author_email=author_email, + ctx=self._ctx, + ) + + async def git_show( + self, + target_ref: str, + *, + path: Optional[str] = None, + ) -> Any: + """Read a commit's metadata or a single blob.""" + return await self._service.fs.show(target_ref, path=path, ctx=self._ctx) + + async def git_log( + self, + *, + branch: str = "main", + limit: int = 20, + ) -> List[Dict[str, Any]]: + """Walk back along parents[0] up to limit commits.""" + return await self._service.fs.log(branch=branch, limit=limit, ctx=self._ctx) + # ============= Debug ============= async def check_consistency(self, uri: str) -> Dict[str, Any]: diff --git a/openviking/pyagfs/__init__.py b/openviking/pyagfs/__init__.py index 7e0a07c370..42cb721f27 100644 --- a/openviking/pyagfs/__init__.py +++ b/openviking/pyagfs/__init__.py @@ -34,6 +34,8 @@ AGFSPluginError, AGFSSerializationError, AGFSTimeoutError, + GitConcurrentCommitError, + GitRestoreWritebackPartialError, ) from .helpers import cp, download, upload from .protocols import AGFSSyncClientProtocol @@ -180,6 +182,8 @@ def get_binding_client(): "AGFSNetworkError", "AGFSInternalError", "AGFSPluginError", + "GitConcurrentCommitError", + "GitRestoreWritebackPartialError", "cp", "upload", "download", diff --git 
a/openviking/pyagfs/exceptions.py b/openviking/pyagfs/exceptions.py index 73fd756a31..36ec261f2f 100644 --- a/openviking/pyagfs/exceptions.py +++ b/openviking/pyagfs/exceptions.py @@ -133,3 +133,65 @@ class AGFSPluginError(AGFSClientError): """Plugin error""" pass + + +class GitConcurrentCommitError(AGFSClientError): + """Raised when a git ref CAS update lost the race against another writer. + + The branch ref moved between the read-parent step and the cas_update step. + Callers should refresh and retry, or surface the conflict to the user. + """ + + pass + + +class GitRestoreWritebackPartialError(AGFSClientError): + """Raised when ``git_restore`` advanced the branch ref to the new commit + but at least one per-path VFS write or delete failed afterwards. + + The branch ref already points at ``new_commit_oid`` — the operation cannot + be rolled back. Callers must: + + 1. Still trigger reindex for ``written_paths`` / ``deleted_paths`` (those + did reach the VFS, so the vector index would otherwise stay stale). + 2. Surface ``failed_writes`` / ``failed_deletes`` so the operator can + retry or repair the affected paths. + + Native code constructs this with ``(message, payload_dict)``; callers may + also build it from Python with no payload (defaults are zero-counts / + empty lists). ``task_id`` is filled in by ``VikingFS.restore`` once the + background reindex has been scheduled. 
+ """ + + def __init__(self, message, payload=None): + super().__init__(message) + payload = payload or {} + self.new_commit_oid = payload.get("new_commit_oid") + self.source_commit = payload.get("source_commit") + self.parent_commit = payload.get("parent_commit") + self.written = payload.get("written", 0) + self.deleted = payload.get("deleted", 0) + self.unchanged = payload.get("unchanged", 0) + self.written_paths = list(payload.get("written_paths") or []) + self.deleted_paths = list(payload.get("deleted_paths") or []) + # Each entry is a (path, error-message) pair; tuples in Rust marshal + # to Python as tuples, but accept lists here too for tolerance. + self.failed_writes = [tuple(p) for p in (payload.get("failed_writes") or [])] + self.failed_deletes = [tuple(p) for p in (payload.get("failed_deletes") or [])] + # Filled by ``VikingFS.restore`` after it schedules the reindex task. + self.task_id = None + + def to_dict(self): + return { + "new_commit_oid": self.new_commit_oid, + "source_commit": self.source_commit, + "parent_commit": self.parent_commit, + "written": self.written, + "deleted": self.deleted, + "unchanged": self.unchanged, + "written_paths": self.written_paths, + "deleted_paths": self.deleted_paths, + "failed_writes": [list(p) for p in self.failed_writes], + "failed_deletes": [list(p) for p in self.failed_deletes], + "task_id": self.task_id, + } diff --git a/openviking/server/app.py b/openviking/server/app.py index c978d19d08..41f9eb4461 100644 --- a/openviking/server/app.py +++ b/openviking/server/app.py @@ -44,6 +44,7 @@ search_router, sessions_router, skills_router, + snapshot_router, stats_router, system_router, tasks_router, @@ -531,6 +532,7 @@ async def general_error_handler(request: Request, exc: Exception): app.include_router(privacy_configs_router) app.include_router(skills_router) app.include_router(sessions_router) + app.include_router(snapshot_router) app.include_router(stats_router) app.include_router(pack_router) 
app.include_router(debug_router) diff --git a/openviking/server/dependencies.py b/openviking/server/dependencies.py index 1c9f11e742..16b506583a 100644 --- a/openviking/server/dependencies.py +++ b/openviking/server/dependencies.py @@ -27,11 +27,16 @@ def get_service() -> OpenVikingService: return _service -def set_service(service: OpenVikingService) -> None: - """Set the OpenVikingService instance. +def get_service_or_none() -> Optional[OpenVikingService]: + """Return the registered OpenVikingService, or None if not set.""" + return _service + + +def set_service(service: Optional[OpenVikingService]) -> None: + """Set (or clear) the OpenVikingService instance. Args: - service: OpenVikingService instance to set + service: OpenVikingService instance to register, or None to clear. """ global _service _service = service diff --git a/openviking/server/error_mapping.py b/openviking/server/error_mapping.py index 3b81578184..357478afc9 100644 --- a/openviking/server/error_mapping.py +++ b/openviking/server/error_mapping.py @@ -25,6 +25,7 @@ AGFSPluginError, AGFSSerializationError, AGFSTimeoutError, + GitConcurrentCommitError, ) from openviking.storage.errors import LockAcquisitionError, ResourceBusyError from openviking_cli.exceptions import ( @@ -447,6 +448,14 @@ def map_exception( "retryable": True, } return OpenVikingError(str(exc), code="CONFLICT", details=details) + if isinstance(exc, GitConcurrentCommitError): + details = { + "conflict_type": "git_ref_cas", + "retryable": True, + } + return OpenVikingError( + str(exc) or "concurrent git commit", code="CONFLICT", details=details + ) if isinstance(exc, PermissionError): return PermissionDeniedError(str(exc), resource=resource) if isinstance(exc, FileNotFoundError): diff --git a/openviking/server/models.py b/openviking/server/models.py index e998239d3a..f04f7b79cd 100644 --- a/openviking/server/models.py +++ b/openviking/server/models.py @@ -49,5 +49,6 @@ class Response(BaseModel): "SESSION_EXPIRED": 410, 
"UNSUPPORTED_URI": 400, "UNSUPPORTED_MODE": 400, + "RESTORE_WRITEBACK_PARTIAL": 500, "UNKNOWN": 500, } diff --git a/openviking/server/routers/__init__.py b/openviking/server/routers/__init__.py index ded23d4892..1fb22653a5 100644 --- a/openviking/server/routers/__init__.py +++ b/openviking/server/routers/__init__.py @@ -18,6 +18,7 @@ from openviking.server.routers.search import router as search_router from openviking.server.routers.sessions import router as sessions_router from openviking.server.routers.skills import router as skills_router +from openviking.server.routers.snapshot import router as snapshot_router from openviking.server.routers.stats import router as stats_router from openviking.server.routers.system import router as system_router from openviking.server.routers.tasks import router as tasks_router @@ -37,6 +38,7 @@ "relations_router", "sessions_router", "skills_router", + "snapshot_router", "stats_router", "pack_router", "privacy_configs_router", diff --git a/openviking/server/routers/snapshot.py b/openviking/server/routers/snapshot.py new file mode 100644 index 0000000000..ec7500a0e8 --- /dev/null +++ b/openviking/server/routers/snapshot.py @@ -0,0 +1,177 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: AGPL-3.0 +"""HTTP routes for git-style version control (snapshots). + +Mirrors VikingFS.commit / VikingFS.restore / VikingFS.show / VikingFS.log, +which already implement the underlying semantics. 
+""" + +from typing import List, Optional + +from fastapi import APIRouter, Body, Depends, Query +from fastapi.responses import Response as FastAPIResponse +from pydantic import BaseModel, ConfigDict + +from openviking.pyagfs.exceptions import ( + AGFSClientError, + AGFSNotFoundError, + GitRestoreWritebackPartialError, +) +from openviking.server.auth import get_request_context +from openviking.server.dependencies import get_service +from openviking.server.error_mapping import map_exception +from openviking.server.identity import RequestContext +from openviking.server.models import Response +from openviking_cli.exceptions import InternalError, NotFoundError, OpenVikingError + +router = APIRouter(prefix="/api/v1/snapshot", tags=["snapshot"]) + + +class CommitRequest(BaseModel): + """Request body for ``POST /api/v1/snapshot/commit``.""" + + model_config = ConfigDict(extra="forbid") + + message: str + paths: Optional[List[str]] = None + branch: str = "main" + author_name: Optional[str] = None + author_email: Optional[str] = None + + +@router.post("/commit") +async def commit( + request: CommitRequest = Body(...), + _ctx: RequestContext = Depends(get_request_context), +): + """Create a new snapshot of the current workspace state.""" + service = get_service() + try: + result = await service.fs.commit( + message=request.message, + paths=request.paths, + branch=request.branch, + author_name=request.author_name, + author_email=request.author_email, + ctx=_ctx, + ) + except AGFSClientError as e: + mapped = map_exception(e) + if mapped is not None: + raise mapped from e + raise + return Response(status="ok", result=result) + + +@router.get("/log") +async def log( + branch: str = Query("main", description="Branch ref name"), + limit: int = Query(20, ge=1, le=500, description="Max commits to return"), + _ctx: RequestContext = Depends(get_request_context), +): + """Walk commit history newest-first along parents[0].""" + service = get_service() + try: + result = await 
service.fs.log(branch=branch, limit=limit, ctx=_ctx) + except AGFSNotFoundError: + raise NotFoundError(branch, "git_ref") + except AGFSClientError as e: + mapped = map_exception(e) + if mapped is not None: + raise mapped from e + raise + return Response(status="ok", result=result) + + +class RestoreRequest(BaseModel): + """Request body for ``POST /api/v1/snapshot/restore``.""" + + model_config = ConfigDict(extra="forbid") + + project_dir: Optional[str] = None + source_commit: str + branch: str = "main" + dry_run: bool = False + message: Optional[str] = None + author_name: Optional[str] = None + author_email: Optional[str] = None + + +@router.post("/restore") +async def restore( + request: RestoreRequest = Body(...), + _ctx: RequestContext = Depends(get_request_context), +): + """Forward-commit restore: rebuild project_dir from source_commit on top of HEAD.""" + service = get_service() + try: + result = await service.fs.restore( + project_dir=request.project_dir, + source_commit=request.source_commit, + branch=request.branch, + dry_run=request.dry_run, + message=request.message, + author_name=request.author_name, + author_email=request.author_email, + ctx=_ctx, + ) + except AGFSNotFoundError as e: + raise NotFoundError(request.source_commit, "git_ref") from e + except GitRestoreWritebackPartialError as exc: + # HEAD already advanced to the new commit, but some per-path VFS + # writes/deletes failed. Surface structured diagnostics (including + # task_id of the scheduled reindex) instead of collapsing to a + # generic InternalError. + raise OpenVikingError( + f"snapshot restore partial: {exc}", + code="RESTORE_WRITEBACK_PARTIAL", + details=exc.to_dict(), + ) from exc + except AGFSClientError as e: + mapped = map_exception(e) + if mapped is not None: + raise mapped from e + raise + except RuntimeError as e: + # Fallback for the case where the native git binding cannot import + # pyagfs and surfaces apply-phase failures as a bare RuntimeError. 
+ # With GitRestoreWritebackPartialError wired up, structured partial + # failures now go through the branch above; this clause only catches + # the degraded path. + raise InternalError( + f"snapshot restore failed: {e}", cause=e + ) from e + return Response(status="ok", result=result) + + +@router.get("/show") +async def show( + target_ref: str = Query(..., description="Commit oid, branch, or tag"), + path: Optional[str] = Query(None, description="Optional viking:// URI for a single blob"), + _ctx: RequestContext = Depends(get_request_context), +): + """Without ``path``: commit metadata JSON. With ``path``: raw blob bytes + X-Snapshot-* headers.""" + service = get_service() + try: + if path is None: + result = await service.fs.show(target_ref, ctx=_ctx) + return Response(status="ok", result=result) + + blob = await service.fs.show_blob_raw(target_ref, path=path, ctx=_ctx) + except AGFSNotFoundError as e: + resource = path if path is not None else target_ref + raise NotFoundError(resource, "git_blob" if path is not None else "git_ref") from e + except AGFSClientError as e: + mapped = map_exception(e) + if mapped is not None: + raise mapped from e + raise + + return FastAPIResponse( + content=blob["bytes"], + media_type="application/octet-stream", + headers={ + "X-Snapshot-Oid": str(blob["oid"]), + "X-Snapshot-Size": str(blob["size"]), + }, + ) diff --git a/openviking/service/core.py b/openviking/service/core.py index 14281dd429..da57163b31 100644 --- a/openviking/service/core.py +++ b/openviking/service/core.py @@ -39,6 +39,7 @@ from openviking_cli.session.user_id import UserIdentifier from openviking_cli.utils import get_logger from openviking_cli.utils.config import OPENVIKING_ENABLE_RECORDER_ENV, get_openviking_config +from openviking_cli.utils.config.git_config import GitConfig from openviking_cli.utils.config.open_viking_config import initialize_openviking_config from openviking_cli.utils.config.storage_config import StorageConfig @@ -118,6 +119,7 @@ def 
__init__( config.embedding.max_concurrent, config.vlm.max_concurrent, binding_config=binding_config, + git_config=config.git, ) # Initialize embedder @@ -132,13 +134,15 @@ def _init_storage( max_concurrent_embedding: int = 10, max_concurrent_semantic: int = 64, binding_config: Any = None, + *, + git_config: Optional[GitConfig] = None, ) -> None: """Initialize storage resources.""" from openviking.utils.agfs_utils import RagfsBindingConfig, create_agfs_client # Create RAGFS client using utility runtime_binding_config = binding_config or RagfsBindingConfig(agfs=config.agfs) - self._agfs_client = create_agfs_client(runtime_binding_config) + self._agfs_client = create_agfs_client(runtime_binding_config, git_config=git_config) # Initialize QueueManager with agfs_client if self._agfs_client: @@ -287,6 +291,7 @@ async def initialize(self) -> None: self._config.embedding.max_concurrent, self._config.vlm.max_concurrent, binding_config=self._build_ragfs_binding_config(), + git_config=self._config.git, ) if self._embedder is None: @@ -412,6 +417,14 @@ async def initialize(self) -> None: agfs_client=self._agfs_client, ) + # Register as the process-wide service so flows that resolve the + # service via the dependency global (e.g. background reindex tasks + # triggered by git restore) work in embedded mode, not just under the + # HTTP server which calls set_service() during bootstrap. + from openviking.server.dependencies import set_service + + set_service(self) + self._initialized = True logger.info("OpenVikingService initialized") @@ -448,6 +461,13 @@ async def close(self) -> None: self._privacy_config_service = None self._initialized = False + # Clear the process-wide registration if it still points at us, so a + # closed service is never resolved via the dependency global. 
+ from openviking.server.dependencies import get_service_or_none, set_service + + if get_service_or_none() is self: + set_service(None) + logger.info("OpenVikingService closed") async def reindex( diff --git a/openviking/service/fs_service.py b/openviking/service/fs_service.py index 92995a21e9..beb7960d6b 100644 --- a/openviking/service/fs_service.py +++ b/openviking/service/fs_service.py @@ -531,3 +531,90 @@ async def set_tags( recursive=recursive, ctx=ctx, ) + + async def commit( + self, + *, + message: str, + ctx: RequestContext, + paths: Optional[List[str]] = None, + branch: str = "main", + author_name: Optional[str] = None, + author_email: Optional[str] = None, + ) -> Dict[str, Any]: + """Forward to VikingFS.commit. See viking_fs.commit for semantics.""" + viking_fs = self._ensure_initialized() + validated = ( + [validate_viking_uri(p) for p in paths] if paths is not None else None + ) + return await viking_fs.commit( + message=message, + paths=validated, + branch=branch, + author_name=author_name, + author_email=author_email, + ctx=ctx, + ) + + async def restore( + self, + *, + project_dir: Optional[str], + source_commit: str, + ctx: RequestContext, + branch: str = "main", + dry_run: bool = False, + message: Optional[str] = None, + author_name: Optional[str] = None, + author_email: Optional[str] = None, + ) -> Dict[str, Any]: + """Forward to VikingFS.restore. See viking_fs.restore for semantics.""" + viking_fs = self._ensure_initialized() + if project_dir is not None: + project_dir = validate_viking_uri(project_dir, field_name="project_dir") + return await viking_fs.restore( + project_dir=project_dir, + source_commit=source_commit, + branch=branch, + dry_run=dry_run, + message=message, + author_name=author_name, + author_email=author_email, + ctx=ctx, + ) + + async def show( + self, + target_ref: str, + ctx: RequestContext, + *, + path: Optional[str] = None, + ) -> Any: + """Forward to VikingFS.show. 
Returns dict (metadata) or bytes (blob).""" + viking_fs = self._ensure_initialized() + # validate_optional_viking_uri returns "" for None input; VikingFS.show needs None. + path = validate_optional_viking_uri(path, field_name="path") or None + return await viking_fs.show(target_ref, path=path, ctx=ctx) + + async def show_blob_raw( + self, + target_ref: str, + ctx: RequestContext, + *, + path: str, + ) -> Dict[str, Any]: + """Forward to VikingFS.show_blob_raw. Returns ``{"oid", "size", "bytes"}``.""" + viking_fs = self._ensure_initialized() + path = validate_viking_uri(path, field_name="path") + return await viking_fs.show_blob_raw(target_ref, path=path, ctx=ctx) + + async def log( + self, + ctx: RequestContext, + *, + branch: str = "main", + limit: int = 20, + ) -> List[Dict[str, Any]]: + """Forward to VikingFS.log. Walks parents[0] up to limit commits.""" + viking_fs = self._ensure_initialized() + return await viking_fs.log(branch=branch, limit=limit, ctx=ctx) diff --git a/openviking/service/reindex_executor.py b/openviking/service/reindex_executor.py index b3b9da79c8..e3fd0c1b3c 100644 --- a/openviking/service/reindex_executor.py +++ b/openviking/service/reindex_executor.py @@ -712,6 +712,68 @@ async def _reindex_resource_vectors_from_entries( counters.failed_records += 1 counters.warnings.append(f"Failed to reindex {file_uri} vector: {exc}") + async def reindex_directory_marker( + self, *, dir_uri: str, level: ContextLevel, ctx: RequestContext + ) -> None: + """Recompute ONLY this directory's L0 (ABSTRACT) or L1 (OVERVIEW) vector. + + Non-recursive: does not touch descendants. Used by git restore when a + directory's ``.abstract.md`` / ``.overview.md`` marker changed. When the + on-disk semantic source is empty, the corresponding vector is deleted + instead of upserted. 
+ """ + if level not in (ContextLevel.ABSTRACT, ContextLevel.OVERVIEW): + raise ValueError(f"reindex_directory_marker only supports L0/L1, got {level!r}") + if dir_uri == "viking://": + return + + viking_fs = get_viking_fs() + marker_name = ".abstract.md" if level == ContextLevel.ABSTRACT else ".overview.md" + lock_path = viking_fs._uri_to_path(f"{dir_uri}/{marker_name}", ctx=ctx) + async with LockContext(get_lock_manager(), [lock_path], lock_mode="exact"): + abstract = await self._read_directory_abstract(dir_uri, ctx=ctx) + if level == ContextLevel.ABSTRACT: + vector_text = abstract + else: + overview = await self._read_directory_overview(dir_uri, ctx=ctx) + vector_text = overview or abstract + + if not vector_text: + await self.delete_uri_level(uri=dir_uri, level=level, ctx=ctx) + return + + await self._upsert_context( + uri=dir_uri, + parent_uri=VikingURI(dir_uri).parent.uri, + abstract=abstract, + vector_text=vector_text, + is_leaf=False, + context_type=context_type_for_uri(dir_uri), + level=level, + ctx=ctx, + ) + + async def delete_uri_level( + self, *, uri: str, level: ContextLevel, ctx: RequestContext + ) -> int: + """Delete ONLY the vector record at ``(uri, level)``. Returns count. + + Used by git restore for both directory markers (dir + L0/L1) and + deleted source files (file + DETAIL). + """ + service = get_service() + assert service.vikingdb_manager is not None + records = await service.vikingdb_manager.get_context_by_uri( + uri=uri, + level=int(level), + limit=100, + ctx=ctx, + ) + ids = [str(rec["id"]) for rec in records if rec.get("id")] + if not ids: + return 0 + return await service.vikingdb_manager.delete(ids, ctx=ctx) + async def _reindex_user_namespace( self, *, diff --git a/openviking/snapshot_namespace.py b/openviking/snapshot_namespace.py new file mode 100644 index 0000000000..ed6c7c37de --- /dev/null +++ b/openviking/snapshot_namespace.py @@ -0,0 +1,158 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. 
+# SPDX-License-Identifier: AGPL-3.0 +"""Snapshot (multi-version) namespace for OpenViking clients. + +Exposes the snapshot/versioning methods on BaseClient under a +`client.snapshot.*` namespace so the user-facing API reads as +`client.snapshot.commit(...)` rather than the flat `client.git_commit(...)` +underneath. +""" +from __future__ import annotations + +from typing import Any, Dict, List, Optional, TYPE_CHECKING + +from openviking_cli.utils import run_async + +if TYPE_CHECKING: + from openviking.async_client import AsyncOpenViking + from openviking.sync_client import SyncOpenViking + + +class AsyncSnapshotNamespace: + """Snapshot version control methods on the async client. + + Forwards to the underlying BaseClient's git_* methods. + """ + + def __init__(self, client: "AsyncOpenViking"): + self._client = client + + async def commit( + self, + *, + message: str, + paths: Optional[List[str]] = None, + branch: str = "main", + author_name: Optional[str] = None, + author_email: Optional[str] = None, + ) -> Dict[str, Any]: + await self._client._ensure_initialized() + return await self._client._client.git_commit( + message=message, + paths=paths, + branch=branch, + author_name=author_name, + author_email=author_email, + ) + + async def restore( + self, + *, + project_dir: Optional[str] = None, + source_commit: str, + branch: str = "main", + dry_run: bool = False, + message: Optional[str] = None, + author_name: Optional[str] = None, + author_email: Optional[str] = None, + ) -> Dict[str, Any]: + await self._client._ensure_initialized() + return await self._client._client.git_restore( + project_dir=project_dir, + source_commit=source_commit, + branch=branch, + dry_run=dry_run, + message=message, + author_name=author_name, + author_email=author_email, + ) + + async def show( + self, + target_ref: str, + *, + path: Optional[str] = None, + ) -> Any: + await self._client._ensure_initialized() + return await self._client._client.git_show(target_ref, path=path) + + async def 
log( + self, + *, + branch: str = "main", + limit: int = 20, + ) -> List[Dict[str, Any]]: + await self._client._ensure_initialized() + return await self._client._client.git_log(branch=branch, limit=limit) + + +class SyncSnapshotNamespace: + """Synchronous wrapper around AsyncSnapshotNamespace. + + Each method calls into the SyncOpenViking's underlying async client + via run_async, matching the rest of the SyncOpenViking surface. + """ + + def __init__(self, client: "SyncOpenViking"): + self._client = client + + def _ns(self) -> AsyncSnapshotNamespace: + return self._client._async_client.snapshot + + def commit( + self, + *, + message: str, + paths: Optional[List[str]] = None, + branch: str = "main", + author_name: Optional[str] = None, + author_email: Optional[str] = None, + ) -> Dict[str, Any]: + return run_async( + self._ns().commit( + message=message, + paths=paths, + branch=branch, + author_name=author_name, + author_email=author_email, + ) + ) + + def restore( + self, + *, + project_dir: Optional[str] = None, + source_commit: str, + branch: str = "main", + dry_run: bool = False, + message: Optional[str] = None, + author_name: Optional[str] = None, + author_email: Optional[str] = None, + ) -> Dict[str, Any]: + return run_async( + self._ns().restore( + project_dir=project_dir, + source_commit=source_commit, + branch=branch, + dry_run=dry_run, + message=message, + author_name=author_name, + author_email=author_email, + ) + ) + + def show( + self, + target_ref: str, + *, + path: Optional[str] = None, + ) -> Any: + return run_async(self._ns().show(target_ref, path=path)) + + def log( + self, + *, + branch: str = "main", + limit: int = 20, + ) -> List[Dict[str, Any]]: + return run_async(self._ns().log(branch=branch, limit=limit)) diff --git a/openviking/storage/viking_fs.py b/openviking/storage/viking_fs.py index 97589a347c..1ea9df3cae 100644 --- a/openviking/storage/viking_fs.py +++ b/openviking/storage/viking_fs.py @@ -31,6 +31,7 @@ is_hidden_by_actor_peer_view, 
may_include_hidden_actor_peers, ) +from openviking.core.context import ContextLevel from openviking.core.namespace import ( is_accessible as namespace_is_accessible, ) @@ -283,6 +284,7 @@ def __init__( self._bound_ctx: contextvars.ContextVar[Optional[RequestContext]] = contextvars.ContextVar( "vikingfs_bound_ctx", default=None ) + self._background_tasks: set = set() @staticmethod def _default_ctx() -> RequestContext: @@ -3320,3 +3322,565 @@ async def write_context( except Exception as e: logger.error(f"[VikingFS] Failed to write {uri}: {e}") raise IOError(f"Failed to write {uri}: {e}") + + # ------------------------------------------------------------------ + # Git version control (commit / restore / show / log) + # ------------------------------------------------------------------ + + # First path segments that the Rust git enumerate.rs prunes from snapshots, + # plus the runtime lock name. Mirrors INTERNAL_FIRST_SEGMENTS in + # crates/ragfs/src/git/enumerate.rs and VikingFS._INTERNAL_NAMES so that + # callers fail fast in Python with a clear error rather than passing a + # path that the Rust side will silently drop. + _GIT_INTERNAL_FIRST_SEGMENTS = frozenset( + {"_system", "tasks", "temp", "queue", "upload", ".path.ovlock"} + ) + + _DEFAULT_GIT_AUTHOR_NAME = "viking-bot" + _DEFAULT_GIT_AUTHOR_EMAIL = "bot@viking.local" + + def _uri_to_tree_path(self, uri: str, ctx: Optional[RequestContext] = None) -> str: + """Convert a viking:// URI to an account-relative git tree path. + + ``viking://resources/proj_a/docs/a.md`` -> ``resources/proj_a/docs/a.md``. + + Pure prefix stripping: removes the ``viking://`` scheme and any + ``/local/{account}/`` segment. Internal scopes that the Rust git layer + would prune (`_system`, `tasks`, `temp`, `queue`, `upload`) and the + runtime lock name (`.path.ovlock`) are rejected with ``ValueError`` + — passing them through silently would result in a no-op commit and + confuse callers. 
+ """ + real_ctx = self._ctx_or_default(ctx) + canonical = canonicalize_uri(uri, real_ctx) + _, parts = self._normalized_uri_parts(canonical) + if not parts: + raise ValueError( + f"git tree path cannot be the account root: {uri!r}" + ) + first = parts[0] + if first in self._GIT_INTERNAL_FIRST_SEGMENTS: + raise ValueError( + f"git tree path rejects internal scope/segment {first!r}: {uri!r}" + ) + return "/".join(parts) + + def _tree_path_to_uri(self, tree_path: str) -> str: + """Convert an account-relative git tree path to a viking:// URI. + + Inverse of :py:meth:`_uri_to_tree_path` (without context canonicalization). + """ + cleaned = tree_path.strip("/") + if not cleaned: + raise ValueError("tree path must not be empty") + return f"viking://{cleaned}" + + _DIR_MARKER_LEVELS = { + ".abstract.md": ContextLevel.ABSTRACT, + ".overview.md": ContextLevel.OVERVIEW, + } + _NO_VECTOR_DERIVED = frozenset({".relations.json"}) + + def _classify_restore_path( + self, tree_path: str, *, deleted: bool + ) -> Optional[tuple]: + """Classify a restore-affected tree path into a vector maintenance task. + + Returns a ``(op, uri, level)`` triple, or ``None`` when the path has no + vector side-effect: + + - ``dir/.abstract.md`` / ``dir/.overview.md`` → recompute (write) or + delete (removal) ONLY that directory's L0/L1 vector: + ``("reindex_marker"|"delete", dir_uri, ABSTRACT|OVERVIEW)``. + - ``.relations.json`` → ``None`` (not a vector text source). + - anything else (a source file) → reindex (write) or delete (removal) + its DETAIL vector: + ``("reindex_file", file_uri, DETAIL)`` / ``("delete", file_uri, DETAIL)``. + + ``None`` is also returned for a directory marker at the account root + (no parent directory to scope an L0/L1 vector to). 
+ """ + parent, _, name = tree_path.rpartition("/") + if name in self._NO_VECTOR_DERIVED: + return None + level = self._DIR_MARKER_LEVELS.get(name) + if level is not None: + if not parent: + return None + dir_uri = self._tree_path_to_uri(parent) + op = "delete" if deleted else "reindex_marker" + return (op, dir_uri, level) + # Source file. + file_uri = self._tree_path_to_uri(tree_path) + op = "delete" if deleted else "reindex_file" + return (op, file_uri, ContextLevel.DETAIL) + + async def commit( + self, + *, + message: str, + paths: Optional[List[str]] = None, + branch: str = "main", + author_name: Optional[str] = None, + author_email: Optional[str] = None, + ctx: Optional[RequestContext] = None, + ) -> Dict[str, Any]: + """Create a git snapshot of the account's tree. + + Args: + message: Commit message. + paths: Optional list of ``viking://`` URIs to scope the commit to; + entries may be files or directories. Directories are expanded + recursively with the snapshot pruning rules applied. ``None`` + (default) enumerates the whole account tree. An empty list is + forwarded as an explicit empty path list (no-op commit). A + path that exists in neither the VFS nor the previous snapshot + logs a warning and is treated as a no-op deletion. + branch: Branch to advance. Defaults to ``"main"``. + author_name / author_email: Override the default bot author. + ctx: Request context (provides ``account_id``). + + Returns: + Dict with ``result`` (``"created"`` / ``"noop"``) and ``commit_oid``; + ``changed`` count when ``result == "created"``. 
+ """ + real_ctx = self._ctx_or_default(ctx) + account = real_ctx.account_id + if paths is None: + tree_paths: Optional[List[str]] = None + else: + tree_paths = [self._uri_to_tree_path(p, ctx=real_ctx) for p in paths] + return await self._async_agfs.run( + "git_commit", + account=account, + branch=branch, + message=message, + paths=tree_paths, + author_name=author_name or self._DEFAULT_GIT_AUTHOR_NAME, + author_email=author_email or self._DEFAULT_GIT_AUTHOR_EMAIL, + ) + + async def restore( + self, + *, + project_dir: Optional[str] = None, + source_commit: str, + branch: str = "main", + dry_run: bool = False, + message: Optional[str] = None, + author_name: Optional[str] = None, + author_email: Optional[str] = None, + ctx: Optional[RequestContext] = None, + ) -> Dict[str, Any]: + """Restore a project subtree to the state at ``source_commit``. + + Generates a new commit whose parent is the current HEAD (not the + source commit) and writes the diff through the VFS. Paths outside + ``project_dir`` are left untouched. + + Args: + project_dir: ``viking://`` URI of the subtree, e.g. + ``"viking://resources/proj_a"``. May also be passed as a + short form like ``"resources/proj_a"``. Trailing slashes are + stripped. Internal scopes are rejected with ``ValueError``. + If None, restore the entire tree. + source_commit: 40-hex OID, branch name, or full ref path. + branch: Branch to advance. Defaults to ``"main"``. + dry_run: If True, returns the planned diff without writing. + message: Optional commit message; defaults to a generated string. + author_name / author_email: Override the default bot author. + ctx: Request context (provides ``account_id``). + + Returns: + Dict containing ``result`` (``"applied"`` / ``"noop"`` / ``"dry_run"``) + and corresponding oid / diff fields. When an ``Applied`` result has + vector side-effects, a ``task_id`` is included for polling the + background reindex via ``GET /api/v1/tasks/{task_id}``. 
+ + After an ``Applied`` result, this method schedules background vector + maintenance for the affected paths via :class:`ReindexExecutor`: + directory markers (``.abstract.md`` / ``.overview.md``) recompute or + delete only that directory's L0/L1 vector; source files reindex (write) + or delete (removal) their DETAIL vector. ``.relations.json`` has no + vector side-effect. The rebuild is tracked as a single task + (``snapshot_restore_reindex``); per-path failures are logged and do not + block the return value. The task reaches ``completed`` only after each + affected vector has been written to (or deleted from) the index, so + polling ``task_id`` to ``completed`` guarantees subsequent ``find`` + reads see the restored state. + """ + real_ctx = self._ctx_or_default(ctx) + account = real_ctx.account_id + tree_dir: Optional[str] + if project_dir is None: + tree_dir = None + else: + tree_dir = self._uri_to_tree_path(project_dir, ctx=real_ctx).rstrip("/") + if not tree_dir: + raise ValueError(f"project_dir must not be empty: {project_dir!r}") + # Build kwargs dynamically, only include project_dir if it's not None + kwargs = { + "account": account, + "branch": branch, + "source_commit": source_commit, + "dry_run": dry_run, + "message": message, + "author_name": author_name or self._DEFAULT_GIT_AUTHOR_NAME, + "author_email": author_email or self._DEFAULT_GIT_AUTHOR_EMAIL, + } + if tree_dir is not None: + kwargs["project_dir"] = tree_dir + # dry_run only computes the diff; it never writes the VFS, so it needs + # no lock. + if dry_run: + return await self._async_agfs.run("git_restore", **kwargs) + + from openviking.storage.errors import LockAcquisitionError, ResourceBusyError + from openviking.storage.transaction import LockContext, get_lock_manager + from openviking.pyagfs.exceptions import GitRestoreWritebackPartialError + + # Serialize the writeback against concurrent VFS mutations on the same + # subtree. 
A scoped restore tree-locks project_dir; a full restore + # (project_dir is None) locks the account root so only one restore runs + # per account at a time. The lock covers only the writeback — the + # background reindex is scheduled after release, otherwise its per-path + # child locks would conflict with this tree lock. + lock_path = ( + self._uri_to_path(project_dir, ctx=real_ctx) + if project_dir is not None + else f"/local/{account}" + ) + partial_exc: Optional[GitRestoreWritebackPartialError] = None + try: + async with LockContext(get_lock_manager(), [lock_path], lock_mode="tree"): + try: + result = await self._async_agfs.run("git_restore", **kwargs) + except GitRestoreWritebackPartialError as exc: + # The ref already advanced — capture the exception and + # finish the lock scope cleanly. Reindex scheduling and + # re-raise happen below, outside the tree lock. + partial_exc = exc + result = None + except LockAcquisitionError: + raise ResourceBusyError( + f"Resource is being processed: {project_dir or '*'}", + uri=project_dir or "*", + ) + + if partial_exc is not None: + # HEAD has moved forward but some VFS writes/deletes failed. + # Still schedule reindex for the paths that *did* reach the VFS so + # the vector index doesn't stay stale, then re-raise so the caller + # learns about the partial failure (and can inspect failed_writes + # / failed_deletes / task_id on the exception). 
+ try: + partial_exc.task_id = await self._schedule_restore_reindex_for_paths( + written_paths=partial_exc.written_paths, + deleted_paths=partial_exc.deleted_paths, + project_dir=project_dir, + real_ctx=real_ctx, + ) + except Exception: + logger.exception( + "[VikingFS] git restore partial: reindex scheduling failed; " + "HEAD advanced but reindex was not queued" + ) + raise partial_exc + + if result.get("result") != "applied": + return result + + written = list(result.get("written_paths") or []) + deleted = list(result.get("deleted_paths") or []) + if written or deleted: + try: + task_id = await self._schedule_restore_reindex_for_paths( + written_paths=written, + deleted_paths=deleted, + project_dir=project_dir, + real_ctx=real_ctx, + ) + if task_id is not None: + result["task_id"] = task_id + except Exception: + logger.exception( + "[VikingFS] git restore reindex task creation failed; " + "falling back to fire-and-forget rebuild" + ) + self._schedule_vector_rebuild( + written=written, deleted=deleted, ctx=real_ctx + ) + return result + + async def _schedule_restore_reindex_for_paths( + self, + *, + written_paths: List[str], + deleted_paths: List[str], + project_dir: Optional[str], + real_ctx: RequestContext, + ) -> Optional[str]: + """Classify ``written``/``deleted`` paths into vector tasks and queue + them as a single tracked background rebuild. Returns the task id, or + ``None`` if there is nothing to do. + + Shared by the applied path and the partial-writeback recovery path — + both schedule reindex for paths that actually reached the VFS. 
+ """ + if not written_paths and not deleted_paths: + return None + tasks = self._collect_restore_vector_tasks(written_paths, deleted_paths) + if not tasks: + return None + + from openviking.service.task_tracker import get_task_tracker + + tracker = get_task_tracker() + task = await tracker.create( + "snapshot_restore_reindex", + resource_id=project_dir or "*", + account_id=real_ctx.account_id, + user_id=real_ctx.user.user_id, + ) + await tracker.update_stage( + task.task_id, + "queued", + account_id=real_ctx.account_id, + user_id=real_ctx.user.user_id, + ) + background = asyncio.create_task( + self._run_restore_rebuild_tracked(task.task_id, tasks, real_ctx), + name=f"vikingfs-git-restore-reindex:{task.task_id}", + ) + self._background_tasks.add(background) + background.add_done_callback(self._background_tasks.discard) + return task.task_id + + async def show( + self, + target_ref: str, + *, + path: Optional[str] = None, + ctx: Optional[RequestContext] = None, + ) -> Union[Dict[str, Any], bytes]: + """Read a commit's metadata or a single blob. + + ``path=None`` returns the commit metadata dict (oid, tree, parents, + author, committer, message). ``path=str`` returns the blob bytes + directly, stripping the oid/size envelope returned by the binding. + """ + real_ctx = self._ctx_or_default(ctx) + account = real_ctx.account_id + tree_path = self._uri_to_tree_path(path, ctx=real_ctx) if path else None + resp = await self._async_agfs.run( + "git_show", + account=account, + target_ref=target_ref, + path=tree_path, + ) + if path is not None and isinstance(resp, dict) and "bytes" in resp: + return resp["bytes"] + return resp + + async def show_blob_raw( + self, + target_ref: str, + *, + path: str, + ctx: Optional[RequestContext] = None, + ) -> Dict[str, Any]: + """Like ``show(target_ref, path=...)`` but returns the full envelope. + + Returns ``{"oid": str, "size": int, "bytes": bytes}`` without + stripping. 
Used by the HTTP snapshot router to populate + ``X-Snapshot-Oid`` / ``X-Snapshot-Size`` response headers. + """ + real_ctx = self._ctx_or_default(ctx) + account = real_ctx.account_id + tree_path = self._uri_to_tree_path(path, ctx=real_ctx) + resp = await self._async_agfs.run( + "git_show", + account=account, + target_ref=target_ref, + path=tree_path, + ) + if not isinstance(resp, dict) or "bytes" not in resp: + raise TypeError( + f"git_show returned unexpected shape for blob path: {type(resp).__name__}" + ) + return resp + + async def log( + self, + *, + branch: str = "main", + limit: int = 20, + ctx: Optional[RequestContext] = None, + ) -> List[Dict[str, Any]]: + """Walk back from ``branch``'s HEAD along ``parents[0]`` up to ``limit`` commits. + + Returns a list of commit metadata dicts (same shape as ``show(target_ref)``). + """ + if limit <= 0: + return [] + real_ctx = self._ctx_or_default(ctx) + account = real_ctx.account_id + head = await self._async_agfs.run( + "git_show", account=account, target_ref=branch, path=None, + ) + results: List[Dict[str, Any]] = [head] + parents = head.get("parents") or [] + while parents and len(results) < limit: + parent_oid = parents[0] + commit = await self._async_agfs.run( + "git_show", account=account, target_ref=parent_oid, path=None, + ) + results.append(commit) + parents = commit.get("parents") or [] + return results + + def _collect_restore_vector_tasks( + self, + written: List[str], + deleted: List[str], + ) -> set[tuple]: + """Classify restore-affected paths into deduplicated ``(op, uri, level)`` tasks. + + Tasks are deduplicated on the exact ``(op, uri, level)`` key (no ancestor + subsumption — each change is handled independently because no operation + recurses into descendants). 
+ """ + tasks: set[tuple] = set() + for tree_path in written: + try: + task = self._classify_restore_path(tree_path, deleted=False) + except ValueError: + continue + if task is not None: + tasks.add(task) + for tree_path in deleted: + try: + task = self._classify_restore_path(tree_path, deleted=True) + except ValueError: + continue + if task is not None: + tasks.add(task) + return tasks + + def _schedule_vector_rebuild( + self, + *, + written: List[str], + deleted: List[str], + ctx: RequestContext, + ) -> None: + """Fire-and-forget precise vector maintenance for a git restore. + + Each affected path is classified by :py:meth:`_classify_restore_path` + into a ``(op, uri, level)`` task and scheduled independently: + + - ``reindex_marker`` — recompute only a directory's L0/L1 vector. + - ``reindex_file`` — recompute only a source file's DETAIL vector + (non-recursive ``execute(mode="vectors_only")``). + - ``delete`` — remove only the ``(uri, level)`` vector (directory + marker removal, or deleted source file). + + Failures are logged and never propagate. + """ + try: + from openviking.service.reindex_executor import get_reindex_executor + except Exception: + logger.exception("[VikingFS] ReindexExecutor import failed; skipping rebuild") + return + + try: + loop = asyncio.get_running_loop() + except RuntimeError: + logger.warning( + "[VikingFS] git restore vector rebuild skipped: no running event loop" + ) + return + + tasks = self._collect_restore_vector_tasks(written, deleted) + if not tasks: + return + + executor = get_reindex_executor() + for op, uri, level in tasks: + loop.create_task( + self._run_vector_rebuild(executor, op, uri, level, ctx), + name=f"vikingfs-git-{op}:{uri}:{int(level)}", + ) + + async def _run_restore_rebuild_tracked( + self, + task_id: str, + tasks: set[tuple], + ctx: RequestContext, + ) -> None: + """Background worker driving a tracked restore vector rebuild. 
+ + Runs all classified ``(op, uri, level)`` rebuild tasks concurrently and + drives the task through start → complete/fail. Per-task failures are + swallowed (and logged) inside :py:meth:`_run_vector_rebuild`, preserving + the "failures do not block" semantics. + """ + from openviking.service.task_tracker import get_task_tracker + + tracker = get_task_tracker() + await tracker.start( + task_id, + account_id=ctx.account_id, + user_id=ctx.user.user_id, + stage="reindexing", + ) + try: + from openviking.service.reindex_executor import get_reindex_executor + + executor = get_reindex_executor() + await asyncio.gather( + *[ + self._run_vector_rebuild(executor, op, uri, level, ctx) + for (op, uri, level) in tasks + ] + ) + await tracker.complete( + task_id, + {"status": "completed", "task_count": len(tasks)}, + account_id=ctx.account_id, + user_id=ctx.user.user_id, + ) + except Exception as exc: + await tracker.fail( + task_id, + str(exc), + account_id=ctx.account_id, + user_id=ctx.user.user_id, + ) + + async def _run_vector_rebuild( + self, + executor: Any, + op: str, + uri: str, + level: ContextLevel, + ctx: RequestContext, + ) -> None: + """Wrapper coroutine: dispatch one vector task and swallow errors.""" + try: + if op == "reindex_marker": + await executor.reindex_directory_marker( + dir_uri=uri, level=level, ctx=ctx + ) + elif op == "reindex_file": + await executor.execute( + uri=uri, mode="vectors_only", wait=True, ctx=ctx + ) + elif op == "delete": + await executor.delete_uri_level(uri=uri, level=level, ctx=ctx) + else: # pragma: no cover - defensive + logger.warning("[VikingFS] unknown vector rebuild op %r for %s", op, uri) + except Exception: + logger.exception( + "[VikingFS] git restore vector task %s failed for %s", op, uri + ) diff --git a/openviking/sync_client.py b/openviking/sync_client.py index a7d60e0720..877f2c5e3f 100644 --- a/openviking/sync_client.py +++ b/openviking/sync_client.py @@ -35,6 +35,7 @@ def __init__( agent_id=agent_id, ) 
self._initialized = False + self._snapshot: Optional["SyncSnapshotNamespace"] = None def initialize(self) -> None: """Initialize OpenViking storage and indexes.""" @@ -523,6 +524,14 @@ def observer(self): self.initialize() return self._async_client.observer + @property + def snapshot(self) -> "SyncSnapshotNamespace": + """Snapshot version control namespace (synchronous).""" + if getattr(self, "_snapshot", None) is None: + from openviking.snapshot_namespace import SyncSnapshotNamespace + self._snapshot = SyncSnapshotNamespace(self) + return self._snapshot + @classmethod def reset(cls) -> None: """Reset singleton (for testing).""" diff --git a/openviking/utils/agfs_utils.py b/openviking/utils/agfs_utils.py index 7178ab4257..b946af57e2 100644 --- a/openviking/utils/agfs_utils.py +++ b/openviking/utils/agfs_utils.py @@ -400,13 +400,72 @@ def _serialize_redirect_policy(policy: Any) -> Dict[str, Any]: return _dump_config_object(policy) -def create_agfs_client(config: RagfsBindingConfig) -> Any: +def _build_git_config_dict(git_config: Any, storage_path: Path) -> Dict[str, Any]: + """Build an in-memory git config dict consumed by the ragfs binding. + + The returned dict mirrors the Rust ``GitConfig`` serde struct so the binding + deserializes it verbatim from the ``config["git"]`` section, without writing + any TOML file. For ``backend == "local"`` a ``local`` sub-dict is emitted + (defaulting ``base_dir`` to ``{storage_path}/.ovgit`` when empty); for + ``backend == "s3"`` an ``s3`` sub-dict is emitted with keys matching the Rust + ``GitS3ConfigPy`` struct. 
+ """ + backend = getattr(git_config, "backend", "local") + result: Dict[str, Any] = { + "enabled": bool(getattr(git_config, "enabled", False)), + "backend": backend, + "default_branch": getattr(git_config, "default_branch", "main"), + "author_name": getattr(git_config, "author_name", "viking-bot"), + "author_email": getattr(git_config, "author_email", "bot@viking.local"), + } + + if backend == "s3": + s3_cfg = getattr(git_config, "s3", None) + if s3_cfg is None: + raise ValueError("git backend 's3' requires a [git.s3] section") + s3_dict: Dict[str, Any] = { + "bucket": getattr(s3_cfg, "bucket", ""), + "region": getattr(s3_cfg, "region", "us-east-1"), + "prefix": getattr(s3_cfg, "prefix", ".ovgit"), + "endpoint": getattr(s3_cfg, "endpoint", ""), + "cas_mode": getattr(s3_cfg, "cas_mode", "native"), + "use_path_style": bool(getattr(s3_cfg, "use_path_style", True)), + } + # Only emit credentials when provided; otherwise the binding falls back + # to the SDK default credentials chain. + access_key = getattr(s3_cfg, "access_key", None) + secret_key = getattr(s3_cfg, "secret_key", None) + if access_key: + s3_dict["access_key"] = access_key + if secret_key: + s3_dict["secret_key"] = secret_key + result["s3"] = s3_dict + return result + + # Default: local backend + local_cfg = getattr(git_config, "local", None) + base_dir = getattr(local_cfg, "base_dir", "") if local_cfg is not None else "" + if not base_dir: + base_dir = str(storage_path / ".ovgit") + else: + base_dir = str(Path(base_dir).expanduser()) + + result["local"] = {"base_dir": base_dir} + return result + + +def create_agfs_client(config: RagfsBindingConfig, *, git_config: Any = None) -> Any: """ Create a RAGFS client based on the provided configuration. Args: config: Single runtime config object containing both backend mount settings and construction-time binding sections. + git_config: Optional GitConfig. 
When provided and ``enabled`` is True, + the git config is built into an in-memory dict and injected into the + binding ``config`` under the ``git`` key so the binding exposes git_* + methods. No file is written to disk. When None or disabled, the client + is constructed without a git section (legacy behavior). Returns: A RAGFSBindingClient instance. @@ -426,11 +485,20 @@ def create_agfs_client(config: RagfsBindingConfig) -> Any: "to build and install the RAGFS SDK with native bindings." ) + agfs_config = config.agfs if isinstance(config, RagfsBindingConfig) else config + binding_dict = config.to_binding_dict() + if git_config is not None and getattr(git_config, "enabled", False): + path_str = getattr(agfs_config, "path", None) + if path_str is None: + raise ValueError("agfs_config.path is required when git is enabled") + storage_path = Path(path_str).resolve() + binding_dict["git"] = _build_git_config_dict(git_config, storage_path) + # Construction-time decides whether the stack includes the encryption layer. config_path = resolve_config_path(None, OPENVIKING_CONFIG_ENV, DEFAULT_OV_CONF) client = RAGFSBindingClient( str(config_path) if config_path else None, - config=config.to_binding_dict(), + config=binding_dict, ) # Automatically mount backend for binding client diff --git a/openviking_cli/client/base.py b/openviking_cli/client/base.py index 7b844cb5c9..d68baa77b8 100644 --- a/openviking_cli/client/base.py +++ b/openviking_cli/client/base.py @@ -469,3 +469,49 @@ def is_healthy(self) -> bool: def observer(self) -> Any: """Get observer service for component status.""" ... + + # ============= Git Version Control ============= + + @abstractmethod + async def git_commit( + self, + *, + message: str, + paths: Optional[List[str]] = None, + branch: str = "main", + author_name: Optional[str] = None, + author_email: Optional[str] = None, + ) -> Dict[str, Any]: + """Create a git snapshot. 
See VikingFS.commit for semantics.""" + + @abstractmethod + async def git_restore( + self, + *, + project_dir: Optional[str] = None, + source_commit: str, + branch: str = "main", + dry_run: bool = False, + message: Optional[str] = None, + author_name: Optional[str] = None, + author_email: Optional[str] = None, + ) -> Dict[str, Any]: + """Restore a subtree, or the full account tree when project_dir is omitted.""" + + @abstractmethod + async def git_show( + self, + target_ref: str, + *, + path: Optional[str] = None, + ) -> Any: + """Read a commit's metadata or a single blob.""" + + @abstractmethod + async def git_log( + self, + *, + branch: str = "main", + limit: int = 20, + ) -> List[Dict[str, Any]]: + """Walk back along parents[0] up to limit commits.""" diff --git a/openviking_cli/utils/config/__init__.py b/openviking_cli/utils/config/__init__.py index 31ea7202cf..60dd313159 100644 --- a/openviking_cli/utils/config/__init__.py +++ b/openviking_cli/utils/config/__init__.py @@ -52,6 +52,7 @@ ) from .embedding_config import EmbeddingConfig from .grep_config import GrepConfig, GrepEngine +from .git_config import GitConfig, GitLocalConfig, GitS3Config from .log_config import LogConfig from .open_viking_config import ( OpenVikingConfig, @@ -161,4 +162,7 @@ "is_valid_openviking_config", "TelemetryConfig", "TracerConfig", + "GitConfig", + "GitLocalConfig", + "GitS3Config", ] diff --git a/openviking_cli/utils/config/git_config.py b/openviking_cli/utils/config/git_config.py new file mode 100644 index 0000000000..000962d37b --- /dev/null +++ b/openviking_cli/utils/config/git_config.py @@ -0,0 +1,121 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. 
+# SPDX-License-Identifier: AGPL-3.0 +"""Git version control configuration for OpenViking.""" +from typing import Literal, Optional + +from pydantic import BaseModel, Field, model_validator + + +class GitLocalConfig(BaseModel): + """Configuration for the local git object backend.""" + + base_dir: str = Field( + default="", + description="Filesystem directory holding git objects/refs. " + "When empty, defaults to '{storage.path}/.ovgit'.", + ) + + model_config = {"extra": "forbid"} + + +class GitS3Config(BaseModel): + """Configuration for the S3 git object/ref backend. + + Fields mirror the Rust ``GitS3ConfigPy`` serde struct so that the rendered + ``[git.s3]`` TOML section is consumed verbatim by the ragfs binding. + """ + + bucket: str = Field( + default="", + description="S3 bucket name holding git objects/refs.", + ) + region: str = Field( + default="us-east-1", + description="AWS region where the bucket is located (e.g., us-east-1, cn-beijing).", + ) + prefix: str = Field( + default=".ovgit", + description="Key prefix for git storage. All keys are stored under '{prefix}/{account}/...'.", + ) + endpoint: str = Field( + default="", + description="Custom S3 endpoint URL for S3-compatible services like MinIO/LocalStack/TOS. " + "Leave empty for standard AWS S3.", + ) + access_key: Optional[str] = Field( + default=None, + description="S3 access key ID read directly from config. " + "When empty, the SDK default credentials chain is used.", + ) + secret_key: Optional[str] = Field( + default=None, + description="S3 secret access key read directly from config. " + "When empty, the SDK default credentials chain is used.", + ) + cas_mode: Literal["native"] = Field( + default="native", + description="Ref CAS mode. 'native' uses S3 conditional writes (If-Match). 
" + "It is the only supported mode.", + ) + use_path_style: bool = Field( + default=True, + description="true uses path-style addressing (MinIO and some S3-compatible services); " + "false uses virtual-host style (TOS and some S3-compatible services).", + ) + + model_config = {"extra": "forbid"} + + +class GitConfig(BaseModel): + """Git multi-version management configuration.""" + + enabled: bool = Field( + default=True, + description="Enable git-based multi-version management for VikingFS content.", + ) + backend: Literal["local", "s3"] = Field( + default="local", + description="Git object backend. 'local' stores objects on the local filesystem; " + "'s3' stores them on a remote S3-compatible bucket. When unset, defaults to " + "the same backend as 'storage.agfs.backend' (a 'memory' storage backend maps " + "to 'local').", + ) + default_branch: str = Field( + default="main", + description="Default branch name for commits when not specified.", + ) + author_name: str = Field( + default="viking-bot", + description="Default author name used when callers omit author_name.", + ) + author_email: str = Field( + default="bot@viking.local", + description="Default author email used when callers omit author_email.", + ) + local: GitLocalConfig = Field( + default_factory=GitLocalConfig, + description="Configuration for the 'local' backend.", + ) + s3: Optional[GitS3Config] = Field( + default=None, + description="Configuration for the 's3' backend. 
Required when backend='s3'.", + ) + + model_config = {"extra": "forbid"} + + @model_validator(mode="after") + def _validate_backend(self) -> "GitConfig": + """Ensure the selected backend has the required configuration.""" + if self.enabled and self.backend == "s3": + if self.s3 is None: + raise ValueError("git backend 's3' requires a [git.s3] section") + missing = [] + if not self.s3.bucket: + missing.append("bucket") + if not self.s3.region: + missing.append("region") + if missing: + raise ValueError( + f"git backend 's3' requires the following fields: {', '.join(missing)}" + ) + return self diff --git a/openviking_cli/utils/config/open_viking_config.py b/openviking_cli/utils/config/open_viking_config.py index de72d14216..85e191c6f5 100644 --- a/openviking_cli/utils/config/open_viking_config.py +++ b/openviking_cli/utils/config/open_viking_config.py @@ -22,6 +22,7 @@ from .embedding_config import EmbeddingConfig from .encryption_config import EncryptionConfig from .grep_config import GrepConfig +from .git_config import GitConfig from .log_config import LogConfig from .memory_config import MemoryConfig from .oauth_config import OAuthConfig @@ -146,6 +147,10 @@ class OpenVikingConfig(BaseModel): default_factory=EncryptionConfig, description="Encryption configuration" ) + git: GitConfig = Field( + default_factory=GitConfig, description="Git version control configuration" + ) + # Parser configurations pdf: PDFConfig = Field(default_factory=PDFConfig, description="PDF parsing configuration") @@ -233,6 +238,58 @@ def _warn_on_deprecated_language_fallback(self) -> "OpenVikingConfig": ) return self + @model_validator(mode="before") + @classmethod + def _inherit_git_defaults_from_agfs(cls, data: Any) -> Any: + """Let the `git` section inherit unset defaults from `storage.agfs`. + + - `git.backend` defaults to `storage.agfs.backend` (a 'memory' storage + backend maps to 'local') when not set explicitly. 
+ - When the effective git backend is 's3', the `git.s3` fields + bucket/region/endpoint/access_key/secret_key default to the matching + `storage.agfs.s3` values when not set explicitly and the source value + is non-empty. + + Injecting into the raw dict keeps GitConfig's own validation intact. + """ + if not isinstance(data, dict): + return data + + storage = data.get("storage") + agfs = storage.get("agfs", {}) if isinstance(storage, dict) else {} + if not isinstance(agfs, dict): + agfs = {} + + git = data.get("git") + if not isinstance(git, dict): + if git is not None: + # git provided as a model instance; respect it as-is. + return data + git = {} + git = dict(git) + + if "backend" not in git: + agfs_backend = agfs.get("backend", "local") + if agfs_backend == "memory": + agfs_backend = "local" + if agfs_backend in ("local", "s3"): + git["backend"] = agfs_backend + + if git.get("backend", "local") == "s3": + agfs_s3 = agfs.get("s3", {}) + if not isinstance(agfs_s3, dict): + agfs_s3 = {} + git_s3 = git.get("s3") + git_s3 = dict(git_s3) if isinstance(git_s3, dict) else {} + for field in ("bucket", "region", "endpoint", "access_key", "secret_key"): + if field not in git_s3 and agfs_s3.get(field): + git_s3[field] = agfs_s3[field] + git["s3"] = git_s3 + + data = dict(data) + data["git"] = git + return data + allow_private_networks: bool = Field( default=False, description=( diff --git a/sdk/python/openviking_sdk/client.py b/sdk/python/openviking_sdk/client.py index fa907092c2..c7414e6702 100644 --- a/sdk/python/openviking_sdk/client.py +++ b/sdk/python/openviking_sdk/client.py @@ -243,6 +243,7 @@ def __init__( self._upload_mode = config.upload_mode self._http: Optional[httpx.AsyncClient] = None self._observer: Optional[_HTTPObserver] = None + self._snapshot: Optional["AsyncHTTPSnapshotNamespace"] = None async def initialize(self) -> None: headers: Dict[str, str] = {} @@ -1225,11 +1226,108 @@ def observer(self) -> _HTTPObserver: self._observer = _HTTPObserver(self) 
return self._observer + # ============= Git Version Control ============= + + async def git_commit( + self, + *, + message: str, + paths: Optional[List[str]] = None, + branch: str = "main", + author_name: Optional[str] = None, + author_email: Optional[str] = None, + ) -> Dict[str, Any]: + """Create a snapshot of the current workspace state.""" + body: Dict[str, Any] = {"message": message, "branch": branch} + if paths is not None: + body["paths"] = paths + if author_name is not None: + body["author_name"] = author_name + if author_email is not None: + body["author_email"] = author_email + response = await self._http.post("/api/v1/snapshot/commit", json=body) + return self._handle_response(response) + + async def git_restore( + self, + *, + project_dir: Optional[str] = None, + source_commit: str, + branch: str = "main", + dry_run: bool = False, + message: Optional[str] = None, + author_name: Optional[str] = None, + author_email: Optional[str] = None, + ) -> Dict[str, Any]: + """Forward-commit restore of a subtree, or the full account tree when project_dir is omitted.""" + body: Dict[str, Any] = { + "source_commit": source_commit, + "branch": branch, + "dry_run": dry_run, + } + if project_dir is not None: + body["project_dir"] = project_dir + if message is not None: + body["message"] = message + if author_name is not None: + body["author_name"] = author_name + if author_email is not None: + body["author_email"] = author_email + response = await self._http.post("/api/v1/snapshot/restore", json=body) + return self._handle_response(response) + + async def git_show( + self, + target_ref: str, + *, + path: Optional[str] = None, + ) -> Any: + """Fetch commit metadata (path=None) or a blob's {oid, size, bytes} (path=).""" + params: Dict[str, Any] = {"target_ref": target_ref} + if path is not None: + params["path"] = path + response = await self._http.get("/api/v1/snapshot/show", params=params) + + if path is None: + return self._handle_response(response) + + # Binary branch: 
server sets application/octet-stream + X-Snapshot-* headers. + content_type = response.headers.get("content-type", "") + if content_type.startswith("application/octet-stream"): + return { + "oid": response.headers.get("x-snapshot-oid", ""), + "size": int(response.headers.get("x-snapshot-size", "0")), + "bytes": response.content, + } + # Fallback: server returned a JSON error envelope. Let the standard handler raise. + return self._handle_response(response) + + async def git_log( + self, + *, + branch: str = "main", + limit: int = 20, + ) -> List[Dict[str, Any]]: + """Walk commit history newest-first.""" + response = await self._http.get( + "/api/v1/snapshot/log", + params={"branch": branch, "limit": limit}, + ) + return self._handle_response(response) + + @property + def snapshot(self) -> "AsyncHTTPSnapshotNamespace": + """Snapshot version control namespace (async HTTP).""" + if self._snapshot is None: + self._snapshot = AsyncHTTPSnapshotNamespace(self) + return self._snapshot + class SyncHTTPClient: def __init__(self, *args, **kwargs): self._async_client = AsyncHTTPClient(*args, **kwargs) self._initialized = False + self._snapshot: Optional["SyncHTTPSnapshotNamespace"] = None def initialize(self) -> None: run_async(self._async_client.initialize()) @@ -1823,6 +1921,13 @@ def is_healthy(self) -> bool: def observer(self) -> _HTTPObserver: return self._async_client.observer + @property + def snapshot(self) -> "SyncHTTPSnapshotNamespace": + """Snapshot version control namespace (sync HTTP).""" + if self._snapshot is None: + self._snapshot = SyncHTTPSnapshotNamespace(self) + return self._snapshot + def __getattr__(self, name: str): attr = getattr(self._async_client, name) if inspect.iscoroutinefunction(attr): @@ -1832,3 +1937,132 @@ def wrapper(*args, **kwargs): return wrapper return attr + + +class AsyncHTTPSnapshotNamespace: + """Snapshot version control namespace forwarding to AsyncHTTPClient git_* methods.""" + + def __init__(self, client: "AsyncHTTPClient"): + 
self._client = client + + async def commit( + self, + *, + message: str, + paths: Optional[List[str]] = None, + branch: str = "main", + author_name: Optional[str] = None, + author_email: Optional[str] = None, + ) -> Dict[str, Any]: + return await self._client.git_commit( + message=message, + paths=paths, + branch=branch, + author_name=author_name, + author_email=author_email, + ) + + async def restore( + self, + *, + project_dir: Optional[str] = None, + source_commit: str, + branch: str = "main", + dry_run: bool = False, + message: Optional[str] = None, + author_name: Optional[str] = None, + author_email: Optional[str] = None, + ) -> Dict[str, Any]: + return await self._client.git_restore( + project_dir=project_dir, + source_commit=source_commit, + branch=branch, + dry_run=dry_run, + message=message, + author_name=author_name, + author_email=author_email, + ) + + async def show( + self, + target_ref: str, + *, + path: Optional[str] = None, + ) -> Any: + return await self._client.git_show(target_ref, path=path) + + async def log( + self, + *, + branch: str = "main", + limit: int = 20, + ) -> List[Dict[str, Any]]: + return await self._client.git_log(branch=branch, limit=limit) + + +class SyncHTTPSnapshotNamespace: + """Synchronous wrapper around the HTTP client's snapshot namespace.""" + + def __init__(self, client: "SyncHTTPClient"): + self._client = client + + def _ns(self) -> AsyncHTTPSnapshotNamespace: + return self._client._async_client.snapshot + + def commit( + self, + *, + message: str, + paths: Optional[List[str]] = None, + branch: str = "main", + author_name: Optional[str] = None, + author_email: Optional[str] = None, + ) -> Dict[str, Any]: + return run_async( + self._ns().commit( + message=message, + paths=paths, + branch=branch, + author_name=author_name, + author_email=author_email, + ) + ) + + def restore( + self, + *, + project_dir: Optional[str] = None, + source_commit: str, + branch: str = "main", + dry_run: bool = False, + message: Optional[str] = 
None, + author_name: Optional[str] = None, + author_email: Optional[str] = None, + ) -> Dict[str, Any]: + return run_async( + self._ns().restore( + project_dir=project_dir, + source_commit=source_commit, + branch=branch, + dry_run=dry_run, + message=message, + author_name=author_name, + author_email=author_email, + ) + ) + + def show( + self, + target_ref: str, + *, + path: Optional[str] = None, + ) -> Any: + return run_async(self._ns().show(target_ref, path=path)) + + def log( + self, + *, + branch: str = "main", + limit: int = 20, + ) -> List[Dict[str, Any]]: + return run_async(self._ns().log(branch=branch, limit=limit)) diff --git a/tests/agfs/test_git_binding.py b/tests/agfs/test_git_binding.py new file mode 100644 index 0000000000..47bbcedac6 --- /dev/null +++ b/tests/agfs/test_git_binding.py @@ -0,0 +1,316 @@ +"""End-to-end tests for the git_commit/git_restore/git_show PyO3 bindings. + +These tests exercise the binding through ragfs_python.RAGFSBindingClient +directly so they don't require the higher-level VikingFS layer. +""" + +import shutil +import tempfile +from pathlib import Path + +import pytest + + +# Skip the whole module if the native extension is not built locally. +ragfs_python = pytest.importorskip("ragfs_python") + + +# ---------------- fixtures ---------------- + + +@pytest.fixture +def git_workspace(): + """Create a temp workspace containing a localfs root and a [git] config TOML. + + Yields (config_path, localfs_root, git_root) and removes the dir on exit. 
+ """ + root = Path(tempfile.mkdtemp(prefix="ov-git-binding-")) + localfs_root = root / "fs" + git_root = root / "git" + localfs_root.mkdir() + git_root.mkdir() + + config_path = root / "ragfs.toml" + config_path.write_text( + f""" +[git] +enabled = true +backend = "local" +default_branch = "main" +author_name = "test-bot" +author_email = "test@example.com" + +[git.local] +base_dir = "{git_root}" +""" + ) + + yield config_path, localfs_root, git_root + + shutil.rmtree(root, ignore_errors=True) + + +@pytest.fixture +def git_disabled_workspace(): + """A workspace whose [git] section has enabled = false.""" + root = Path(tempfile.mkdtemp(prefix="ov-git-disabled-")) + config_path = root / "ragfs.toml" + config_path.write_text( + """ +[git] +enabled = false +""" + ) + yield config_path + shutil.rmtree(root, ignore_errors=True) + + +@pytest.fixture +def client(git_workspace): + config_path, localfs_root, _ = git_workspace + c = ragfs_python.RAGFSBindingClient(git_config_path=str(config_path)) + # Mount localfs at /local so we can write files into the account tree. 
+ c.mount("localfs", "/local", {"local_dir": str(localfs_root)}) + return c + + +# ---------------- helper: write a file into account tree ---------------- + + +def _write(client, account: str, rel_path: str, body: bytes) -> str: + """Write `body` to /local/{account}/{rel_path} via the binding.""" + path = f"/local/{account}/{rel_path}" + client.ensure_parent_dirs(path) + client.write(path, body) + return path + + +# ---------------- tests ---------------- + + +def test_git_concurrent_commit_error_class_exists(): + from openviking.pyagfs import GitConcurrentCommitError + from openviking.pyagfs.exceptions import AGFSClientError + assert issubclass(GitConcurrentCommitError, AGFSClientError) + + +def test_health_reports_git_enabled(client): + h = client.health() + assert h["git_enabled"] == "true" + assert h.get("git_backend") == "local" + + +def test_commit_then_show_roundtrip(client): + """Write a file, commit it, then show it back and verify bytes match.""" + account = "acct1" + _write(client, account, "resources/a.md", b"hello world") + + resp = client.git_commit( + account=account, + branch="main", + message="initial", + author_name="alice", + author_email="a@e.com", + paths=["resources/a.md"], + ) + assert resp["result"] == "created" + assert resp["changed"] == 1 + commit_oid = resp["commit_oid"] + assert len(commit_oid) == 40 + + shown = client.git_show( + account=account, + target_ref="main", + path="resources/a.md", + ) + assert shown["bytes"] == b"hello world" + assert shown["size"] == 11 + + +def test_commit_then_show_commit_metadata(client): + account = "acct1" + _write(client, account, "resources/a.md", b"x") + resp = client.git_commit( + account=account, + branch="main", + message="m1", + author_name="alice", + author_email="a@e.com", + paths=["resources/a.md"], + ) + meta = client.git_show(account=account, target_ref="main") + assert meta["message"].startswith("m1") + assert meta["oid"] == resp["commit_oid"] + assert meta["parents"] == [] + assert meta["author"]["name"] 
== "alice" + + +def test_restore_roundtrip(client): + """Commit v1 → modify → commit v2 → restore v1 → file reverts; HEAD moves to v3.""" + account = "acct1" + _write(client, account, "resources/proj/a.md", b"v1-content") + + v1 = client.git_commit( + account=account, branch="main", message="v1", + author_name="a", author_email="a@e", + paths=["resources/proj/a.md"], + ) + v1_oid = v1["commit_oid"] + + _write(client, account, "resources/proj/a.md", b"v2-content") + v2 = client.git_commit( + account=account, branch="main", message="v2", + author_name="a", author_email="a@e", + paths=["resources/proj/a.md"], + ) + v2_oid = v2["commit_oid"] + + restored = client.git_restore( + account=account, branch="main", + project_dir="resources/proj", + source_commit=v1_oid, + author_name="a", author_email="a@e", + ) + assert restored["result"] == "applied" + assert restored["source_commit"] == v1_oid + assert restored["parent_commit"] == v2_oid + assert restored["new_commit_oid"] != v2_oid + assert restored["written"] >= 1 + + # VFS file content reverted + content = client.read(f"/local/{account}/resources/proj/a.md") + assert content == b"v1-content" + + # Branch now points at the new commit + head = client.git_show(account=account, target_ref="main") + assert head["oid"] == restored["new_commit_oid"] + assert head["parents"] == [v2_oid] + + +def test_restore_dry_run_does_not_mutate(client): + account = "acct1" + _write(client, account, "resources/proj/a.md", b"v1") + v1 = client.git_commit( + account=account, branch="main", message="v1", + author_name="a", author_email="a@e", + paths=["resources/proj/a.md"], + ) + + _write(client, account, "resources/proj/a.md", b"v2") + client.git_commit( + account=account, branch="main", message="v2", + author_name="a", author_email="a@e", + paths=["resources/proj/a.md"], + ) + + res = client.git_restore( + account=account, branch="main", + project_dir="resources/proj", + source_commit=v1["commit_oid"], + author_name="a", author_email="a@e", 
+ dry_run=True, + ) + assert res["result"] == "dry_run" + assert "diff" in res + assert any(item["path"] == "a.md" for item in res["diff"]["to_write"]) + + # VFS still holds v2 — dry_run did not write + assert client.read(f"/local/{account}/resources/proj/a.md") == b"v2" + + +def test_account_isolation(client): + """A commit under account A is invisible to account B.""" + _write(client, "acct_a", "resources/a.md", b"x") + client.git_commit( + account="acct_a", branch="main", message="m", + author_name="n", author_email="e", + paths=["resources/a.md"], + ) + + from openviking.pyagfs import AGFSNotFoundError + with pytest.raises(AGFSNotFoundError): + client.git_show(account="acct_b", target_ref="main") + + +def test_feature_disabled_raises(git_disabled_workspace): + from openviking.pyagfs import AGFSNotSupportedError + c = ragfs_python.RAGFSBindingClient(git_config_path=str(git_disabled_workspace)) + with pytest.raises(AGFSNotSupportedError): + c.git_commit( + account="a", branch="main", message="m", + author_name="n", author_email="e", + ) + + +def test_invalid_backend_at_construct_time(tmp_path): + cfg = tmp_path / "bad.toml" + cfg.write_text( + """ +[git] +enabled = true +backend = "bogus" +""" + ) + with pytest.raises(Exception) as excinfo: + ragfs_python.RAGFSBindingClient(git_config_path=str(cfg)) + assert "unsupported git backend" in str(excinfo.value).lower() + + +def test_cas_conflict_surface(client): + """Two commits trying to advance from the same parent — one should win, + the other should raise GitConcurrentCommitError. + + We provoke this by writing two different files, then issuing two + git_commit calls back-to-back with paths overlapping enough that both + actually produce new tree objects. 
+ """ + import threading + + from openviking.pyagfs import GitConcurrentCommitError + + account = "acct_cas" + _write(client, account, "resources/seed.md", b"seed") + client.git_commit( + account=account, branch="main", message="seed", + author_name="n", author_email="e", + paths=["resources/seed.md"], + ) + + # Prepare two divergent changes + _write(client, account, "resources/a.md", b"AAA") + _write(client, account, "resources/b.md", b"BBB") + + errors: list[BaseException] = [] + results: list[dict] = [] + barrier = threading.Barrier(2) + + def do_commit(path: str): + try: + barrier.wait() + r = client.git_commit( + account=account, branch="main", message=f"commit {path}", + author_name="n", author_email="e", + paths=[path], + ) + results.append(r) + except BaseException as e: + errors.append(e) + + t1 = threading.Thread(target=do_commit, args=("resources/a.md",)) + t2 = threading.Thread(target=do_commit, args=("resources/b.md",)) + t1.start(); t2.start() + t1.join(); t2.join() + + # The LocalRefStore mutex + CAS may serialize the two so well that the + # second sees the new parent and the conflict never surfaces. In that case + # both succeed and form a linear history — that is also correct behavior. + # We accept either outcome but verify that NO silent data loss occurs: + # if both succeed, the second's commit_oid != the first's; if one fails, + # the failure must be GitConcurrentCommitError. + if len(errors) == 1: + assert isinstance(errors[0], GitConcurrentCommitError), errors[0] + assert len(results) == 1 + else: + assert errors == [], errors + assert len(results) == 2 + assert results[0]["commit_oid"] != results[1]["commit_oid"] diff --git a/tests/agfs/test_git_binding_s3.py b/tests/agfs/test_git_binding_s3.py new file mode 100644 index 0000000000..d0a96926ea --- /dev/null +++ b/tests/agfs/test_git_binding_s3.py @@ -0,0 +1,337 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. 
+# SPDX-License-Identifier: AGPL-3.0 + +"""End-to-end tests for the git_commit/git_restore/git_show PyO3 bindings on +the **S3/TOS** backend (audit plan §A4). + +These mirror the local-backend flow in ``test_git_binding.py`` but wire the +binding to a real S3-compatible store (TOS / MinIO / LocalStack): + +* git objects/refs go to ``[git.s3]`` (namespaced under ``{prefix}/_it/{uuid}``) +* the working tree is an ``s3fs`` mount at ``/local`` (namespaced under a + separate ``_it_fs/{uuid}`` prefix) + +so concurrent runs never collide and never touch real data. + +Skip gating mirrors ``test_fs_binding_s3.py``: the whole module is skipped +unless a config file with ``git.enabled = true`` and ``git.backend = "s3"`` is +discoverable. Resolution order (first hit wins): + + 1. ``OV_GIT_S3_CONF`` env var (explicit path to an ``ov.conf`` JSON file) + 2. ``OPENVIKING_CONFIG_FILE`` env var + +Run against TOS:: + + OV_GIT_S3_CONF=/path/to/ov.conf \ + python -m pytest tests/agfs/test_git_binding_s3.py -q +""" + +import json +import os +import shutil +import tempfile +import uuid +from pathlib import Path + +import pytest + +# Skip the whole module if the native extension is not built locally. +ragfs_python = pytest.importorskip("ragfs_python") + + +# ---------------- config discovery / skip gating ---------------- + + +def _resolve_conf_path(): + """Resolve an ov.conf path from the documented env-var chain.""" + for candidate in ( + os.getenv("OV_GIT_S3_CONF"), + os.getenv("OPENVIKING_CONFIG_FILE"), + ): + if candidate and Path(candidate).exists(): + return Path(candidate) + return None + + +def _load_git_s3_section(): + """Return the ``git`` dict from the resolved ov.conf when it is a usable + S3 git config, else ``None`` (treated as "skip"). 
+ """ + path = _resolve_conf_path() + if path is None: + return None + try: + with open(path, "r") as f: + root = json.load(f) + except Exception: + return None + + git = root.get("git") or root.get("storage", {}).get("git") + if not git: + return None + if not git.get("enabled"): + return None + if git.get("backend") != "s3": + return None + s3 = git.get("s3") or {} + if not s3.get("bucket") or not s3.get("region"): + return None + return git + + +GIT_S3 = _load_git_s3_section() + +pytestmark = pytest.mark.skipif( + GIT_S3 is None, + reason="no usable [git.s3] config (set OV_GIT_S3_CONF to an ov.conf with " + "git.enabled=true, backend='s3')", +) + + +# ---------------- fixtures ---------------- + + +def _toml_quote(s: str) -> str: + return '"' + str(s).replace("\\", "\\\\").replace('"', '\\"') + '"' + + +def _toml_bool(v) -> str: + return "true" if v else "false" + + +@pytest.fixture +def git_s3_workspace(): + """Create a temp workspace with a ragfs.toml whose [git] backend is s3. + + The git object/ref keys are namespaced under ``{prefix}/_it/{uuid}`` so + repeated runs never collide. Yields ``(config_path, fs_prefix, s3)`` where + ``s3`` is the raw s3 config dict and ``fs_prefix`` is a unique prefix to + use for the s3fs working-tree mount. 
+ """ + s3 = GIT_S3["s3"] + run_id = uuid.uuid4().hex + base_prefix = (s3.get("prefix") or "git").rstrip("/") + git_prefix = f"{base_prefix}/_it/{run_id}" + fs_prefix = f"{base_prefix}/_it_fs/{run_id}" + + root = Path(tempfile.mkdtemp(prefix="ov-git-s3-binding-")) + config_path = root / "ragfs.toml" + + lines = [ + "[git]\n", + "enabled = true\n", + 'backend = "s3"\n', + 'default_branch = "main"\n', + 'author_name = "test-bot"\n', + 'author_email = "test@example.com"\n', + "\n", + "[git.s3]\n", + f"bucket = {_toml_quote(s3['bucket'])}\n", + f"region = {_toml_quote(s3['region'])}\n", + f"prefix = {_toml_quote(git_prefix)}\n", + f"endpoint = {_toml_quote(s3.get('endpoint', ''))}\n", + ] + if s3.get("access_key"): + lines.append(f"access_key = {_toml_quote(s3['access_key'])}\n") + if s3.get("secret_key"): + lines.append(f"secret_key = {_toml_quote(s3['secret_key'])}\n") + lines.append(f"cas_mode = {_toml_quote(s3.get('cas_mode', 'native'))}\n") + lines.append(f"use_path_style = {_toml_bool(s3.get('use_path_style', True))}\n") + config_path.write_text("".join(lines)) + + yield config_path, fs_prefix, s3 + + shutil.rmtree(root, ignore_errors=True) + + +@pytest.fixture +def client(git_s3_workspace): + config_path, fs_prefix, s3 = git_s3_workspace + c = ragfs_python.RAGFSBindingClient(git_config_path=str(config_path)) + # Mount s3fs at /local so the working tree lives on the same S3 backend. 
+ mount_cfg = { + "bucket": s3["bucket"], + "region": s3["region"], + "endpoint": s3.get("endpoint", ""), + "prefix": fs_prefix, + "use_path_style": bool(s3.get("use_path_style", True)), + "disable_ssl": not bool(s3.get("use_ssl", True)), + } + if s3.get("access_key"): + mount_cfg["access_key_id"] = s3["access_key"] + if s3.get("secret_key"): + mount_cfg["secret_access_key"] = s3["secret_key"] + c.mount("s3fs", "/local", mount_cfg) + return c + + +# ---------------- helpers ---------------- + + +def _write(client, account: str, rel_path: str, body: bytes) -> str: + """Write `body` to /local/{account}/{rel_path} via the binding.""" + path = f"/local/{account}/{rel_path}" + client.ensure_parent_dirs(path) + client.write(path, body) + return path + + +def _acct() -> str: + """Random account id so concurrent runs never share a namespace.""" + return f"acct-{uuid.uuid4().hex}" + + +# ---------------- tests ---------------- + + +def test_health_reports_git_backend_s3(client): + h = client.health() + assert h["git_enabled"] == "true" + assert h.get("git_backend") == "s3" + + +def test_commit_then_show_roundtrip_s3(client): + """Write a file, commit it, then show it back and verify bytes match.""" + account = _acct() + body = b"hello viking s3 \x00\x01\x02 binary-ish\n" + _write(client, account, "resources/a.md", body) + + resp = client.git_commit( + account=account, + branch="main", + message="initial", + author_name="alice", + author_email="a@e.com", + paths=["resources/a.md"], + ) + assert resp["result"] == "created" + assert resp["changed"] == 1 + assert len(resp["commit_oid"]) == 40 + + shown = client.git_show( + account=account, + target_ref="main", + path="resources/a.md", + ) + assert shown["bytes"] == body + assert shown["size"] == len(body) + + +def test_restore_roundtrip_s3(client): + """Commit v1 → modify → commit v2 → restore v1 → file reverts; HEAD moves to v3.""" + account = _acct() + _write(client, account, "resources/proj/a.md", b"v1-content") + v1 = client.git_commit( + 
account=account, branch="main", message="v1", + author_name="a", author_email="a@e", + paths=["resources/proj/a.md"], + ) + v1_oid = v1["commit_oid"] + + _write(client, account, "resources/proj/a.md", b"v2-content") + v2 = client.git_commit( + account=account, branch="main", message="v2", + author_name="a", author_email="a@e", + paths=["resources/proj/a.md"], + ) + v2_oid = v2["commit_oid"] + + restored = client.git_restore( + account=account, branch="main", + project_dir="resources/proj", + source_commit=v1_oid, + author_name="a", author_email="a@e", + ) + assert restored["result"] == "applied" + assert restored["source_commit"] == v1_oid + assert restored["parent_commit"] == v2_oid + assert restored["new_commit_oid"] != v2_oid + assert restored["written"] >= 1 + + # VFS file content reverted on S3. + assert client.read(f"/local/{account}/resources/proj/a.md") == b"v1-content" + + # Branch now points at the new (forward-only) commit, parented on v2. + head = client.git_show(account=account, target_ref="main") + assert head["oid"] == restored["new_commit_oid"] + assert head["parents"] == [v2_oid] + + +def test_commit_noop_when_unchanged_s3(client): + """A second commit with no working-tree change is a Noop on the same HEAD.""" + account = _acct() + _write(client, account, "resources/a.md", b"stable") + first = client.git_commit( + account=account, branch="main", message="first", + author_name="a", author_email="a@e", + paths=["resources/a.md"], + ) + assert first["result"] == "created" + + second = client.git_commit( + account=account, branch="main", message="second", + author_name="a", author_email="a@e", + paths=["resources/a.md"], + ) + assert second["result"] == "noop" + assert second["commit_oid"] == first["commit_oid"] + + +def test_cross_scope_atomic_snapshot_s3(client): + """A single commit captures files spanning multiple scopes atomically + (design §15.2): both resources/ and knowledge/ land in one commit tree. 
+ """ + account = _acct() + _write(client, account, "resources/doc.md", b"resource body") + _write(client, account, "knowledge/note.md", b"knowledge body") + + resp = client.git_commit( + account=account, branch="main", message="snapshot both scopes", + author_name="a", author_email="a@e", + paths=["resources/doc.md", "knowledge/note.md"], + ) + assert resp["result"] == "created" + + assert client.git_show( + account=account, target_ref="main", path="resources/doc.md" + )["bytes"] == b"resource body" + assert client.git_show( + account=account, target_ref="main", path="knowledge/note.md" + )["bytes"] == b"knowledge body" + + +def test_derived_file_rolls_back_with_restore_s3(client): + """Derived files (e.g. .abstract.md) created after the source commit are + removed when restoring to that earlier commit (design §15.2). + """ + account = _acct() + _write(client, account, "resources/proj/a.md", b"A v1") + src = client.git_commit( + account=account, branch="main", message="source", + author_name="a", author_email="a@e", + paths=["resources/proj/a.md"], + ) + + # HEAD adds a derived file alongside an edit. + _write(client, account, "resources/proj/a.md", b"A v2") + _write(client, account, "resources/proj/.abstract.md", b"derived summary") + client.git_commit( + account=account, branch="main", message="head", + author_name="a", author_email="a@e", + paths=["resources/proj/a.md", "resources/proj/.abstract.md"], + ) + + restored = client.git_restore( + account=account, branch="main", + project_dir="resources/proj", + source_commit=src["commit_oid"], + author_name="a", author_email="a@e", + ) + assert restored["result"] == "applied" + assert restored["deleted"] >= 1 + + # a.md rolled back, derived file gone from the working tree. 
+ assert client.read(f"/local/{account}/resources/proj/a.md") == b"A v1" + from openviking.pyagfs import AGFSNotFoundError + with pytest.raises(AGFSNotFoundError): + client.read(f"/local/{account}/resources/proj/.abstract.md") diff --git a/tests/agfs/test_viking_fs_git.py b/tests/agfs/test_viking_fs_git.py new file mode 100644 index 0000000000..7b4167eef0 --- /dev/null +++ b/tests/agfs/test_viking_fs_git.py @@ -0,0 +1,955 @@ +"""End-to-end tests for VikingFS git commit/restore/show/log Python layer. + +These exercise the full path: VikingFS.commit -> AsyncAGFSClient -> Rust +RAGFSBindingClient -> GitService, plus URI<->tree-path conversion and the +double-encryption invariant called out in the design doc. +""" + +from __future__ import annotations + +import asyncio +import os +import secrets +import tempfile +import shutil +from pathlib import Path +from typing import Tuple + +import pytest + + +ragfs_python = pytest.importorskip("ragfs_python") + +from openviking.pyagfs.exceptions import ( + AGFSNotFoundError, + AGFSNotSupportedError, +) +from openviking.server.identity import RequestContext, Role +from openviking.storage.viking_fs import VikingFS +from openviking_cli.session.user_id import UserIdentifier + + +# ----------------------------- helpers ----------------------------- + + +def _make_ctx(account: str = "acct_t", user: str = "user1") -> RequestContext: + return RequestContext(user=UserIdentifier(account, user), role=Role.ROOT) + + +def _write_workspace(tmp_root: Path) -> Tuple[Path, Path]: + """Lay out an fs/ dir for localfs and a git/ dir for git objects; return + (config_path, localfs_root).""" + fs_root = tmp_root / "fs" + git_root = tmp_root / "git" + fs_root.mkdir(parents=True, exist_ok=True) + git_root.mkdir(parents=True, exist_ok=True) + cfg = tmp_root / "ragfs.toml" + cfg.write_text( + f""" +[git] +enabled = true +backend = "local" +default_branch = "main" +author_name = "test-bot" +author_email = "test@example.com" + +[git.local] +base_dir = 
"{git_root}" +""" + ) + return cfg, fs_root + + +def _build_client(config_path: Path, fs_root: Path): + c = ragfs_python.RAGFSBindingClient(git_config_path=str(config_path)) + c.mount("localfs", "/local", {"local_dir": str(fs_root)}) + return c + + +# ----------------------------- fixtures ----------------------------- + + +@pytest.fixture +def workspace(): + root = Path(tempfile.mkdtemp(prefix="ov-vfs-git-")) + try: + yield root + finally: + shutil.rmtree(root, ignore_errors=True) + + +@pytest.fixture +def vfs(workspace): + from openviking.storage.transaction import init_lock_manager, reset_lock_manager + + cfg, fs_root = _write_workspace(workspace) + client = _build_client(cfg, fs_root) + init_lock_manager(client) + try: + yield VikingFS(agfs=client) + finally: + reset_lock_manager() + + +@pytest.fixture +def vfs_disabled(workspace): + from openviking.storage.transaction import init_lock_manager, reset_lock_manager + + cfg = workspace / "ragfs.toml" + cfg.write_text( + """ +[git] +enabled = false +""" + ) + fs_root = workspace / "fs" + fs_root.mkdir() + client = ragfs_python.RAGFSBindingClient(git_config_path=str(cfg)) + client.mount("localfs", "/local", {"local_dir": str(fs_root)}) + init_lock_manager(client) + try: + yield VikingFS(agfs=client) + finally: + reset_lock_manager() + + +# ========================================================================= +# 1. 
URI <-> tree path +# ========================================================================= + + +class TestUriToTreePath: + def test_resources_uri(self, vfs): + ctx = _make_ctx() + assert vfs._uri_to_tree_path("viking://resources/a.md", ctx=ctx) == "resources/a.md" + assert ( + vfs._uri_to_tree_path("viking://resources/proj_a/docs/a.md", ctx=ctx) + == "resources/proj_a/docs/a.md" + ) + + def test_session_uri(self, vfs): + # ``session`` is a virtual scope: it canonicalizes into the owning + # user's sessions subtree, and the git tree path mirrors that real + # storage layout (so commit/restore target the actual stored bytes). + ctx = _make_ctx() + assert vfs._uri_to_tree_path("viking://session", ctx=ctx) == "user/user1/sessions" + + def test_trailing_slash_kept_as_directory(self, vfs): + # Normalization may strip trailing slash; this is acceptable + ctx = _make_ctx() + out = vfs._uri_to_tree_path("viking://resources/proj_a/", ctx=ctx) + assert out.rstrip("/") == "resources/proj_a" + + def test_internal_scope_rejected(self, vfs): + ctx = _make_ctx() + for uri in ( + "viking://temp/x", + "viking://queue/y", + "viking://upload/z", + ): + with pytest.raises(ValueError): + vfs._uri_to_tree_path(uri, ctx=ctx) + + def test_root_uri_rejected(self, vfs): + ctx = _make_ctx() + with pytest.raises(ValueError): + vfs._uri_to_tree_path("viking://", ctx=ctx) + + def test_tree_path_to_uri_inverse(self, vfs): + assert vfs._tree_path_to_uri("resources/a.md") == "viking://resources/a.md" + assert vfs._tree_path_to_uri("/resources/a.md/") == "viking://resources/a.md" + + def test_tree_path_empty_rejected(self, vfs): + with pytest.raises(ValueError): + vfs._tree_path_to_uri("") + + +# ========================================================================= +# 2. 
commit / show / log +# ========================================================================= + + +@pytest.mark.asyncio +class TestCommitShowLog: + async def test_commit_then_show_roundtrip(self, vfs): + ctx = _make_ctx() + await vfs.write_file("viking://resources/a.md", b"hello", ctx=ctx) + resp = await vfs.commit( + message="initial", + paths=["viking://resources/a.md"], + ctx=ctx, + ) + assert resp["result"] == "created" + assert resp["changed"] == 1 + assert len(resp["commit_oid"]) == 40 + + # show with path -> bytes + body = await vfs.show("main", path="viking://resources/a.md", ctx=ctx) + assert body == b"hello" + + # show without path -> commit metadata + meta = await vfs.show("main", ctx=ctx) + assert meta["message"].startswith("initial") + assert meta["oid"] == resp["commit_oid"] + assert meta["parents"] == [] + assert meta["author"]["name"] == "viking-bot" + + async def test_commit_with_paths_none_enumerates_account(self, vfs): + ctx = _make_ctx(account="acct_full") + await vfs.write_file("viking://resources/a.md", b"a", ctx=ctx) + await vfs.write_file("viking://resources/b.md", b"b", ctx=ctx) + resp = await vfs.commit(message="all", ctx=ctx) + assert resp["result"] == "created" + assert resp["changed"] == 2 + + async def test_log_walks_parent_chain(self, vfs): + ctx = _make_ctx(account="acct_log") + await vfs.write_file("viking://resources/a.md", b"v1", ctx=ctx) + c1 = await vfs.commit(message="c1", paths=["viking://resources/a.md"], ctx=ctx) + await vfs.write_file("viking://resources/a.md", b"v2", ctx=ctx) + c2 = await vfs.commit(message="c2", paths=["viking://resources/a.md"], ctx=ctx) + await vfs.write_file("viking://resources/a.md", b"v3", ctx=ctx) + c3 = await vfs.commit(message="c3", paths=["viking://resources/a.md"], ctx=ctx) + + history = await vfs.log(limit=10, ctx=ctx) + oids = [h["oid"] for h in history] + assert oids == [c3["commit_oid"], c2["commit_oid"], c1["commit_oid"]] + + limited = await vfs.log(limit=2, ctx=ctx) + assert [h["oid"] 
for h in limited] == [c3["commit_oid"], c2["commit_oid"]] + + async def test_show_missing_branch_raises(self, vfs): + ctx = _make_ctx(account="acct_missing") + with pytest.raises(AGFSNotFoundError): + await vfs.show("main", ctx=ctx) + + +# ========================================================================= +# 3. restore +# ========================================================================= + + +@pytest.mark.asyncio +class TestRestore: + async def test_restore_reverts_file_and_advances_head(self, vfs): + ctx = _make_ctx(account="acct_r") + await vfs.write_file("viking://resources/proj/a.md", b"v1", ctx=ctx) + c1 = await vfs.commit(message="v1", paths=["viking://resources/proj/a.md"], ctx=ctx) + + await vfs.write_file("viking://resources/proj/a.md", b"v2", ctx=ctx) + c2 = await vfs.commit(message="v2", paths=["viking://resources/proj/a.md"], ctx=ctx) + + result = await vfs.restore( + project_dir="viking://resources/proj", + source_commit=c1["commit_oid"], + ctx=ctx, + ) + assert result["result"] == "applied" + assert result["source_commit"] == c1["commit_oid"] + assert result["parent_commit"] == c2["commit_oid"] + assert result["new_commit_oid"] != c2["commit_oid"] + assert "resources/proj/a.md" in result["written_paths"] + + # File reverted via VFS + body = await vfs.read("viking://resources/proj/a.md", ctx=ctx) + assert body == b"v1" + + # HEAD moved forward (NOT back to c1) + head = await vfs.show("main", ctx=ctx) + assert head["oid"] == result["new_commit_oid"] + assert head["parents"] == [c2["commit_oid"]] + + async def test_restore_dry_run_does_not_mutate(self, vfs): + ctx = _make_ctx(account="acct_dry") + await vfs.write_file("viking://resources/proj/a.md", b"v1", ctx=ctx) + c1 = await vfs.commit(message="v1", paths=["viking://resources/proj/a.md"], ctx=ctx) + await vfs.write_file("viking://resources/proj/a.md", b"v2", ctx=ctx) + await vfs.commit(message="v2", paths=["viking://resources/proj/a.md"], ctx=ctx) + + result = await vfs.restore( + 
project_dir="viking://resources/proj", + source_commit=c1["commit_oid"], + dry_run=True, + ctx=ctx, + ) + assert result["result"] == "dry_run" + assert any(item["path"] == "a.md" for item in result["diff"]["to_write"]) + + body = await vfs.read("viking://resources/proj/a.md", ctx=ctx) + assert body == b"v2" + + async def test_restore_internal_scope_rejected(self, vfs): + ctx = _make_ctx(account="acct_inv") + with pytest.raises(ValueError): + await vfs.restore( + project_dir="viking://temp/xx", + source_commit="main", + ctx=ctx, + ) + + +# ========================================================================= +# 4. Cross-scope atomicity (resources + user in one commit) +# ========================================================================= + + +@pytest.mark.asyncio +async def test_cross_scope_atomic_commit_and_restore(vfs): + ctx = _make_ctx(account="acct_cross") + # Two files in distinct scopes (``user`` is a real writable scope; the + # virtual ``session``/``agent`` scopes are not directly writable). 
+ await vfs.write_file("viking://resources/a.md", b"R1", ctx=ctx) + await vfs.write_file("viking://user/notes/b.py", b"S1", ctx=ctx) + c1 = await vfs.commit( + message="initial", + paths=["viking://resources/a.md", "viking://user/notes/b.py"], + ctx=ctx, + ) + assert c1["result"] == "created" + assert c1["changed"] == 2 + + # Both files modified + await vfs.write_file("viking://resources/a.md", b"R2", ctx=ctx) + await vfs.write_file("viking://user/notes/b.py", b"S2", ctx=ctx) + await vfs.commit( + message="v2", + paths=["viking://resources/a.md", "viking://user/notes/b.py"], + ctx=ctx, + ) + + # Restore only the resources scope to c1; user scope must remain at v2 + await vfs.restore( + project_dir="viking://resources", + source_commit=c1["commit_oid"], + ctx=ctx, + ) + assert await vfs.read("viking://resources/a.md", ctx=ctx) == b"R1" + assert await vfs.read("viking://user/notes/b.py", ctx=ctx) == b"S2" + + # Restore the user scope too -> both back to c1 + await vfs.restore( + project_dir="viking://user/notes", + source_commit=c1["commit_oid"], + ctx=ctx, + ) + assert await vfs.read("viking://resources/a.md", ctx=ctx) == b"R1" + assert await vfs.read("viking://user/notes/b.py", ctx=ctx) == b"S1" + + +# ========================================================================= +# 5. Derived files (.abstract.md etc.) 
versioned with source +# ========================================================================= + + +@pytest.mark.asyncio +async def test_derived_files_versioned_with_source(vfs): + ctx = _make_ctx(account="acct_derived") + await vfs.write_file("viking://resources/x.md", b"x-body", ctx=ctx) + await vfs.write_file("viking://resources/x.md.abstract.md", b"abstract-v1", ctx=ctx) + c1 = await vfs.commit(message="v1", ctx=ctx) + assert c1["result"] == "created" + assert c1["changed"] == 2 + + # show finds both + assert await vfs.show("main", path="viking://resources/x.md.abstract.md", ctx=ctx) == b"abstract-v1" + + # Update derived file + await vfs.write_file("viking://resources/x.md.abstract.md", b"abstract-v2", ctx=ctx) + await vfs.commit(message="v2", paths=["viking://resources/x.md.abstract.md"], ctx=ctx) + + # Restore to c1 -> derived file reverts too + await vfs.restore( + project_dir="viking://resources", + source_commit=c1["commit_oid"], + ctx=ctx, + ) + body = await vfs.read("viking://resources/x.md.abstract.md", ctx=ctx) + assert body == b"abstract-v1" + + +# ========================================================================= +# 6. Account isolation +# ========================================================================= + + +@pytest.mark.asyncio +async def test_account_isolation_show_misses_other_account(vfs): + ctx_a = _make_ctx(account="acct_iso_a") + ctx_b = _make_ctx(account="acct_iso_b") + await vfs.write_file("viking://resources/a.md", b"a", ctx=ctx_a) + await vfs.commit(message="m", paths=["viking://resources/a.md"], ctx=ctx_a) + + with pytest.raises(AGFSNotFoundError): + await vfs.show("main", ctx=ctx_b) + + +# ========================================================================= +# 7. 
Double-encryption end-to-end (the §3.1 invariant) +# ========================================================================= + + +@pytest.fixture +def encryptor(workspace): + from openviking.crypto.encryptor import FileEncryptor + from openviking.crypto.providers import LocalFileProvider + + key_file = workspace / "master.key" + key_file.write_text(secrets.token_bytes(32).hex()) + os.chmod(key_file, 0o600) + provider = LocalFileProvider(key_file=str(key_file)) + return FileEncryptor(provider) + + +@pytest.fixture +def vfs_encrypted(workspace, encryptor): + from openviking.storage.transaction import init_lock_manager, reset_lock_manager + + cfg, fs_root = _write_workspace(workspace) + client = _build_client(cfg, fs_root) + init_lock_manager(client) + try: + yield VikingFS(agfs=client, encryptor=encryptor) + finally: + reset_lock_manager() + + +@pytest.mark.asyncio +async def test_double_encryption_restore_preserves_plaintext(vfs_encrypted): + """Write plaintext via encrypted VikingFS, commit (ciphertext stored in + git), modify, restore. After restore, VikingFS.read MUST return the + original plaintext — proving the Rust restore path bypasses the + VikingFS encryption layer (writes ciphertext back through MountableFS, + which then decrypts correctly on read). 
+ """ + ctx = _make_ctx(account="acct_enc") + plaintext_v1 = b"top-secret-v1" + plaintext_v2 = b"top-secret-v2" + + await vfs_encrypted.write_file("viking://resources/secret.md", plaintext_v1, ctx=ctx) + c1 = await vfs_encrypted.commit( + message="v1", paths=["viking://resources/secret.md"], ctx=ctx, + ) + assert c1["result"] == "created" + + # Modify + await vfs_encrypted.write_file("viking://resources/secret.md", plaintext_v2, ctx=ctx) + await vfs_encrypted.commit( + message="v2", paths=["viking://resources/secret.md"], ctx=ctx, + ) + assert ( + await vfs_encrypted.read("viking://resources/secret.md", ctx=ctx) + == plaintext_v2 + ) + + # Restore + result = await vfs_encrypted.restore( + project_dir="viking://resources", + source_commit=c1["commit_oid"], + ctx=ctx, + ) + assert result["result"] == "applied" + assert "resources/secret.md" in result["written_paths"] + + # The critical assertion: read returns original plaintext, not garbled + # double-encrypted bytes. + restored = await vfs_encrypted.read("viking://resources/secret.md", ctx=ctx) + assert restored == plaintext_v1 + + +# ========================================================================= +# 8. Feature disabled +# ========================================================================= + + +@pytest.mark.asyncio +async def test_feature_disabled_raises_not_supported(vfs_disabled): + ctx = _make_ctx() + with pytest.raises(AGFSNotSupportedError): + await vfs_disabled.commit(message="m", paths=["viking://resources/a.md"], ctx=ctx) + with pytest.raises(AGFSNotSupportedError): + await vfs_disabled.show("main", ctx=ctx) + with pytest.raises(AGFSNotSupportedError): + await vfs_disabled.restore( + project_dir="viking://resources/proj", + source_commit="main", + ctx=ctx, + ) + + +# ========================================================================= +# 9. 
Reindex redirect for derived files +# ========================================================================= + + +def test_classify_restore_path(vfs): + from openviking.core.context import ContextLevel + + # Directory-level markers -> (op, dir_uri, level) + assert vfs._classify_restore_path( + "resources/proj/.abstract.md", deleted=False + ) == ("reindex_marker", "viking://resources/proj", ContextLevel.ABSTRACT) + assert vfs._classify_restore_path( + "resources/proj/.overview.md", deleted=False + ) == ("reindex_marker", "viking://resources/proj", ContextLevel.OVERVIEW) + assert vfs._classify_restore_path( + "resources/proj/.abstract.md", deleted=True + ) == ("delete", "viking://resources/proj", ContextLevel.ABSTRACT) + assert vfs._classify_restore_path( + "resources/proj/.overview.md", deleted=True + ) == ("delete", "viking://resources/proj", ContextLevel.OVERVIEW) + + # .relations.json has no vector side-effect + assert vfs._classify_restore_path( + "resources/proj/.relations.json", deleted=False + ) is None + assert vfs._classify_restore_path( + "resources/proj/.relations.json", deleted=True + ) is None + + # Per-file sidecars do NOT exist in production -> treated as ordinary source files + assert vfs._classify_restore_path( + "resources/proj/x.md.abstract.md", deleted=False + ) == ("reindex_file", "viking://resources/proj/x.md.abstract.md", ContextLevel.DETAIL) + assert vfs._classify_restore_path( + "resources/proj/x.md.overview.md", deleted=True + ) == ("delete", "viking://resources/proj/x.md.overview.md", ContextLevel.DETAIL) + + # Source files -> DETAIL reindex/delete + assert vfs._classify_restore_path( + "resources/proj/x.md", deleted=False + ) == ("reindex_file", "viking://resources/proj/x.md", ContextLevel.DETAIL) + assert vfs._classify_restore_path( + "resources/proj/x.md", deleted=True + ) == ("delete", "viking://resources/proj/x.md", ContextLevel.DETAIL) + + # Directory marker at the account root -> None (no parent dir to scope) + assert 
vfs._classify_restore_path(".abstract.md", deleted=False) is None + + +class _SpyExecutor: + """Records every scheduled vector task as a normalized tuple.""" + + def __init__(self): + self.calls: list[tuple] = [] + + async def execute(self, *, uri, mode, wait, ctx): + self.calls.append(("reindex_file", uri)) + return {"ok": True} + + async def reindex_directory_marker(self, *, dir_uri, level, ctx): + self.calls.append(("reindex_marker", dir_uri, int(level))) + + async def delete_uri_level(self, *, uri, level, ctx): + self.calls.append(("delete", uri, int(level))) + return 0 + + +@pytest.mark.asyncio +async def test_restore_schedules_reindex_for_derived_only_change(vfs, monkeypatch): + """When a restore only changes a directory `.abstract.md` (source file + unchanged), exactly that directory's L0 vector must be recomputed via + reindex_directory_marker — and nothing else (no whole-tree rebuild). + """ + spy = _SpyExecutor() + + import openviking.service.reindex_executor as reindex_mod + monkeypatch.setattr(reindex_mod, "get_reindex_executor", lambda: spy) + + ctx = _make_ctx(account="acct_derived_only") + await vfs.write_file("viking://resources/proj/x.md", b"body", ctx=ctx) + await vfs.write_file( + "viking://resources/proj/.abstract.md", b"abs-v1", ctx=ctx + ) + c1 = await vfs.commit(message="v1", ctx=ctx) + assert c1["result"] == "created" + + # Modify ONLY the directory marker; source file untouched + await vfs.write_file( + "viking://resources/proj/.abstract.md", b"abs-v2", ctx=ctx + ) + c2 = await vfs.commit( + message="v2", + paths=["viking://resources/proj/.abstract.md"], + ctx=ctx, + ) + assert c2["result"] == "created" + assert c2["changed"] == 1 + + result = await vfs.restore( + project_dir="viking://resources/proj", + source_commit=c1["commit_oid"], + ctx=ctx, + ) + assert result["result"] == "applied" + assert "resources/proj/.abstract.md" in result["written_paths"] + + # Let the fire-and-forget tasks run + await asyncio.sleep(0) + await 
asyncio.sleep(0) + + assert spy.calls == [("reindex_marker", "viking://resources/proj", 0)] + + +@pytest.mark.asyncio +async def test_restore_schedules_marker_and_files_independently(vfs, monkeypatch): + """Ancestor subsumption is gone: a changed directory marker recomputes the + directory's L0/L1, while each changed source file independently reindexes + its own DETAIL vector — neither subsumes the other. + """ + spy = _SpyExecutor() + + import openviking.service.reindex_executor as reindex_mod + monkeypatch.setattr(reindex_mod, "get_reindex_executor", lambda: spy) + + ctx = _make_ctx(account="acct_dedup") + await vfs.write_file("viking://resources/proj/x.md", b"v1", ctx=ctx) + await vfs.write_file("viking://resources/proj/y.md", b"yv1", ctx=ctx) + await vfs.write_file( + "viking://resources/proj/.abstract.md", b"a-v1", ctx=ctx + ) + c1 = await vfs.commit(message="v1", ctx=ctx) + + await vfs.write_file("viking://resources/proj/x.md", b"v2", ctx=ctx) + await vfs.write_file("viking://resources/proj/y.md", b"yv2", ctx=ctx) + await vfs.write_file( + "viking://resources/proj/.abstract.md", b"a-v2", ctx=ctx + ) + await vfs.commit(message="v2", ctx=ctx) + + await vfs.restore( + project_dir="viking://resources/proj", + source_commit=c1["commit_oid"], + ctx=ctx, + ) + await asyncio.sleep(0) + await asyncio.sleep(0) + + # Directory marker recompute + each source file's DETAIL, all independent. + assert sorted(spy.calls) == sorted([ + ("reindex_marker", "viking://resources/proj", 0), + ("reindex_file", "viking://resources/proj/x.md"), + ("reindex_file", "viking://resources/proj/y.md"), + ]) + + +@pytest.mark.asyncio +async def test_restore_schedules_siblings_independently(vfs, monkeypatch): + """Source files in sibling directories are each scheduled independently; + a directory marker change only affects its own directory. 
+ """ + spy = _SpyExecutor() + + import openviking.service.reindex_executor as reindex_mod + monkeypatch.setattr(reindex_mod, "get_reindex_executor", lambda: spy) + + ctx = _make_ctx(account="acct_subsume_sibling") + # proj_a: source file + directory marker + await vfs.write_file("viking://resources/proj_a/x.md", b"v1", ctx=ctx) + await vfs.write_file( + "viking://resources/proj_a/.abstract.md", b"a-v1", ctx=ctx + ) + # proj_b: source file only — sibling directory + await vfs.write_file("viking://resources/proj_b/y.md", b"v1", ctx=ctx) + c1 = await vfs.commit(message="v1", ctx=ctx) + + await vfs.write_file("viking://resources/proj_a/x.md", b"v2", ctx=ctx) + await vfs.write_file( + "viking://resources/proj_a/.abstract.md", b"a-v2", ctx=ctx + ) + await vfs.write_file("viking://resources/proj_b/y.md", b"v2", ctx=ctx) + await vfs.commit(message="v2", ctx=ctx) + + # Restore the whole resources scope so proj_a + proj_b both revert + await vfs.restore( + project_dir="viking://resources", + source_commit=c1["commit_oid"], + ctx=ctx, + ) + await asyncio.sleep(0) + await asyncio.sleep(0) + + assert sorted(spy.calls) == sorted([ + ("reindex_marker", "viking://resources/proj_a", 0), + ("reindex_file", "viking://resources/proj_a/x.md"), + ("reindex_file", "viking://resources/proj_b/y.md"), + ]) + + +@pytest.mark.asyncio +async def test_restore_deletes_marker_and_source_vectors(vfs, monkeypatch): + """Bug 1 regression: restoring to a revision that predates a whole + directory must delete BOTH the directory's L0/L1 marker vectors and the + deleted source file's DETAIL vector — no orphaned vectors left behind. 
+ """ + spy = _SpyExecutor() + + import openviking.service.reindex_executor as reindex_mod + monkeypatch.setattr(reindex_mod, "get_reindex_executor", lambda: spy) + + ctx = _make_ctx(account="acct_del_marker") + await vfs.write_file("viking://resources/keep/k.md", b"keep", ctx=ctx) + c1 = await vfs.commit(message="v1", ctx=ctx) + + # v2 adds a whole new directory with a source file + directory markers. + await vfs.write_file("viking://resources/gone/g.md", b"gone", ctx=ctx) + await vfs.write_file("viking://resources/gone/.abstract.md", b"abs", ctx=ctx) + await vfs.write_file("viking://resources/gone/.overview.md", b"ovr", ctx=ctx) + await vfs.commit(message="v2", ctx=ctx) + + # Restore back to v1: everything under gone/ must be removed. + result = await vfs.restore( + project_dir="viking://resources", + source_commit=c1["commit_oid"], + ctx=ctx, + ) + assert result["result"] == "applied" + await asyncio.sleep(0) + await asyncio.sleep(0) + + assert ("delete", "viking://resources/gone", 0) in spy.calls + assert ("delete", "viking://resources/gone", 1) in spy.calls + assert ("delete", "viking://resources/gone/g.md", 2) in spy.calls + # No whole-tree reindex of the deleted dir. + assert all(c[0] != "reindex_marker" or c[1] != "viking://resources/gone" for c in spy.calls) + + +@pytest.mark.asyncio +async def test_restore_relations_json_has_no_vector_side_effect(vfs, monkeypatch): + """A restore that only touches `.relations.json` must schedule no vector + reindex/delete tasks at all. 
+ """ + spy = _SpyExecutor() + + import openviking.service.reindex_executor as reindex_mod + monkeypatch.setattr(reindex_mod, "get_reindex_executor", lambda: spy) + + ctx = _make_ctx(account="acct_relations") + await vfs.write_file( + "viking://resources/proj/.relations.json", b"{\"v\":1}", ctx=ctx + ) + c1 = await vfs.commit(message="v1", ctx=ctx) + + await vfs.write_file( + "viking://resources/proj/.relations.json", b"{\"v\":2}", ctx=ctx + ) + c2 = await vfs.commit( + message="v2", + paths=["viking://resources/proj/.relations.json"], + ctx=ctx, + ) + assert c2["result"] == "created" + + result = await vfs.restore( + project_dir="viking://resources/proj", + source_commit=c1["commit_oid"], + ctx=ctx, + ) + assert result["result"] == "applied" + await asyncio.sleep(0) + await asyncio.sleep(0) + + assert spy.calls == [] + # No vector side-effect -> no tracked task. + assert "task_id" not in result + + +@pytest.mark.asyncio +async def test_restore_returns_pollable_task_id(vfs, monkeypatch): + """An applied restore with vector side-effects returns a ``task_id`` that + can be polled via the TaskTracker and reaches ``completed``. 
+ """ + spy = _SpyExecutor() + + import openviking.service.reindex_executor as reindex_mod + monkeypatch.setattr(reindex_mod, "get_reindex_executor", lambda: spy) + + from openviking.service.task_tracker import ( + TaskTracker, + reset_task_tracker, + set_task_tracker, + ) + + class _MemTaskStore: + async def create(self, task): + return None + + async def update(self, task): + return None + + async def get(self, task_id, *, account_id=None, user_id=None): + return None + + async def list(self, account_id, *, user_id=None): + return [] + + async def delete(self, task_id, *, account_id, user_id=None): + return None + + set_task_tracker(TaskTracker(store=_MemTaskStore())) + try: + ctx = _make_ctx(account="acct_taskid") + await vfs.write_file("viking://resources/proj/x.md", b"v1", ctx=ctx) + c1 = await vfs.commit(message="v1", ctx=ctx) + await vfs.write_file("viking://resources/proj/x.md", b"v2", ctx=ctx) + await vfs.commit(message="v2", ctx=ctx) + + result = await vfs.restore( + project_dir="viking://resources/proj", + source_commit=c1["commit_oid"], + ctx=ctx, + ) + assert result["result"] == "applied" + task_id = result.get("task_id") + assert task_id + + # Let the tracked background worker run to completion. + for _ in range(5): + await asyncio.sleep(0) + + from openviking.service.task_tracker import get_task_tracker + + tracker = get_task_tracker() + task = await tracker.get( + task_id, account_id=ctx.account_id, user_id=ctx.user.user_id + ) + assert task is not None + assert task.task_type == "snapshot_restore_reindex" + assert task.status.value == "completed" + assert ("reindex_file", "viking://resources/proj/x.md") in spy.calls + finally: + reset_task_tracker() + + +# ========================================================================= +# 7. 
Restore concurrency / locking +# ========================================================================= + + +@pytest.mark.asyncio +async def test_restore_concurrent_same_dir_is_rejected(vfs, monkeypatch): + """While one restore holds the project_dir tree lock during writeback, a + second restore on the same subtree fails fast with ResourceBusyError + (manager default timeout is non-blocking). + """ + from openviking.storage.errors import ResourceBusyError + + spy = _SpyExecutor() + import openviking.service.reindex_executor as reindex_mod + monkeypatch.setattr(reindex_mod, "get_reindex_executor", lambda: spy) + + ctx = _make_ctx(account="acct_lock_same") + await vfs.write_file("viking://resources/proj/a.md", b"v1", ctx=ctx) + c1 = await vfs.commit(message="v1", paths=["viking://resources/proj/a.md"], ctx=ctx) + await vfs.write_file("viking://resources/proj/a.md", b"v2", ctx=ctx) + await vfs.commit(message="v2", paths=["viking://resources/proj/a.md"], ctx=ctx) + + # Gate the first restore *inside* the writeback so it keeps holding the + # tree lock while the second restore attempts to acquire it. + orig_run = vfs._async_agfs.run + holding_lock = asyncio.Event() + release = asyncio.Event() + + async def gated_run(method_name, *args, **kwargs): + if method_name == "git_restore" and not kwargs.get("dry_run"): + holding_lock.set() + await release.wait() + return await orig_run(method_name, *args, **kwargs) + + monkeypatch.setattr(vfs._async_agfs, "run", gated_run) + + first = asyncio.create_task( + vfs.restore( + project_dir="viking://resources/proj", + source_commit=c1["commit_oid"], + ctx=ctx, + ) + ) + await asyncio.wait_for(holding_lock.wait(), timeout=5) + + # Second restore on the same subtree must be rejected immediately. + with pytest.raises(ResourceBusyError): + await vfs.restore( + project_dir="viking://resources/proj", + source_commit=c1["commit_oid"], + ctx=ctx, + ) + + # Release the first restore and let it complete normally. 
+ release.set() + result = await asyncio.wait_for(first, timeout=5) + assert result["result"] == "applied" + + # Lock released: a follow-up restore on the same subtree now succeeds. + again = await vfs.restore( + project_dir="viking://resources/proj", + source_commit=c1["commit_oid"], + ctx=ctx, + ) + assert again["result"] in ("applied", "noop") + + +@pytest.mark.asyncio +async def test_restore_concurrent_sibling_dirs_do_not_block(vfs, monkeypatch): + """Restores on sibling subtrees hold disjoint tree locks and run + concurrently — neither blocks the other. + """ + spy = _SpyExecutor() + import openviking.service.reindex_executor as reindex_mod + monkeypatch.setattr(reindex_mod, "get_reindex_executor", lambda: spy) + + ctx = _make_ctx(account="acct_lock_sibling") + await vfs.write_file("viking://resources/proj_a/x.md", b"v1", ctx=ctx) + await vfs.write_file("viking://resources/proj_b/y.md", b"v1", ctx=ctx) + c1 = await vfs.commit(message="v1", ctx=ctx) + await vfs.write_file("viking://resources/proj_a/x.md", b"v2", ctx=ctx) + await vfs.write_file("viking://resources/proj_b/y.md", b"v2", ctx=ctx) + await vfs.commit(message="v2", ctx=ctx) + + # Block both writebacks until both have entered, proving they hold their + # (distinct) tree locks simultaneously. If the locks conflicted, the second + # restore would raise ResourceBusyError before reaching git_restore and + # ``both_in`` would never fire. + orig_run = vfs._async_agfs.run + both_in = asyncio.Event() + release_a = asyncio.Event() + release_b = asyncio.Event() + entered = 0 + + async def gated_run(method_name, *args, **kwargs): + nonlocal entered + if method_name == "git_restore" and not kwargs.get("dry_run"): + entered += 1 + if entered == 2: + both_in.set() + # proj_a is released first; proj_b waits on its own gate. 
+ await (release_a if kwargs.get("project_dir", "").endswith("proj_a") else release_b).wait() + return await orig_run(method_name, *args, **kwargs) + + monkeypatch.setattr(vfs._async_agfs, "run", gated_run) + + task_a = asyncio.create_task( + vfs.restore( + project_dir="viking://resources/proj_a", + source_commit=c1["commit_oid"], + ctx=ctx, + ) + ) + task_b = asyncio.create_task( + vfs.restore( + project_dir="viking://resources/proj_b", + source_commit=c1["commit_oid"], + ctx=ctx, + ) + ) + # Both reaching the writeback concurrently proves the tree locks are disjoint. + await asyncio.wait_for(both_in.wait(), timeout=5) + + # Drain sequentially: both share branch ``main``, so the final ref CAS would + # otherwise conflict — that guard is independent of the directory lock. + release_a.set() + res_a = await asyncio.wait_for(task_a, timeout=5) + release_b.set() + res_b = await asyncio.wait_for(task_b, timeout=5) + assert res_a["result"] == "applied" + assert res_b["result"] == "applied" + diff --git a/tests/cli/test_cli_snapshot.py b/tests/cli/test_cli_snapshot.py new file mode 100644 index 0000000000..3024d766fa --- /dev/null +++ b/tests/cli/test_cli_snapshot.py @@ -0,0 +1,281 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: AGPL-3.0 +"""CLI snapshot (git version control) tests.""" + +import re +import time +import uuid + +import pytest +from conftest import ov + +pytestmark = pytest.mark.cli_remote + + +def _commit(message: str): + """Helper: take a snapshot, return the full envelope dict. + + Retries briefly on transient server busy errors. 
+ """ + r = None + for _attempt in range(5): + r = ov(["snapshot", "commit", "-m", message, "-o", "json"], timeout=120) + if r["exit_code"] == 0: + break + if "busy" in r["stderr"].lower() or "internal" in r["stderr"].lower(): + time.sleep(10) + continue + break + assert r["exit_code"] == 0, f"snapshot commit failed: {r['stderr'][:300]}" + assert r["json"] is not None, f"expected JSON, got: {r['stdout'][:200]}" + assert r["json"].get("ok") is True, f"expected ok=true, got {r['json']}" + return r["json"]["result"] + + +class TestSnapshotCommit: + def test_commit_returns_oid_json(self, test_pack_uri): + # test_pack_uri ensures at least one resource exists + result = _commit(f"cli-test commit {uuid.uuid4().hex[:6]}") + assert "commit_oid" in result, f"expected commit_oid in result, got {result}" + assert isinstance(result["commit_oid"], str) and len(result["commit_oid"]) >= 12 + + def test_commit_human_prints_short_oid(self, test_pack_uri): + msg = f"cli-test human {uuid.uuid4().hex[:6]}" + r = ov(["snapshot", "commit", "-m", msg], timeout=120) + assert r["exit_code"] == 0, f"snapshot commit failed: {r['stderr'][:300]}" + out = r["stdout"] + assert re.match( + r"^(Created [0-9a-f]{12}|No changes)", out + ), f"unexpected commit stdout: {out[:200]}" + + +class TestSnapshotLog: + def test_log_lists_commits(self, test_pack_uri): + r_before = ov(["snapshot", "log", "--limit", "100"], timeout=60) + assert r_before["exit_code"] == 0 + before_lines = [ln for ln in r_before["stdout"].splitlines() if ln.strip()] + + commit = _commit(f"log-test setup {uuid.uuid4().hex[:6]}") + short_oid = commit["commit_oid"][:12] + + r_after = ov(["snapshot", "log", "--limit", "100"], timeout=60) + assert r_after["exit_code"] == 0 + after_lines = [ln for ln in r_after["stdout"].splitlines() if ln.strip()] + + # The new commit's short oid must appear in the log, proving log reflects HEAD. 
+ assert any(short_oid in ln for ln in after_lines), ( + f"new commit {short_oid} should appear in log; " + f"first few lines: {after_lines[:3]}" + ) + # A single commit should add at most one row (noop commits add zero). + delta = len(after_lines) - len(before_lines) + assert 0 <= delta <= 1, ( + f"expected log delta of 0 or 1 after one commit, got {delta} " + f"(before={len(before_lines)}, after={len(after_lines)})" + ) + + def test_log_json_returns_array(self, test_pack_uri): + _commit(f"log-json setup {uuid.uuid4().hex[:6]}") + r = ov(["snapshot", "log", "--limit", "5", "-o", "json"], timeout=60) + assert r["exit_code"] == 0, f"snapshot log -o json failed: {r['stderr'][:300]}" + # Server returns {"ok": true, "result": [...]}, so r["json"] works + assert r["json"] is not None, f"expected JSON, got: {r['stdout'][:200]}" + assert r["json"].get("ok") is True + result = r["json"]["result"] + assert isinstance(result, list), f"expected list, got {type(result).__name__}: {result}" + assert len(result) >= 1 + first = result[0] + assert "oid" in first and "message" in first + + +class TestSnapshotShow: + def test_show_metadata(self, test_pack_uri): + commit = _commit(f"show-meta setup {uuid.uuid4().hex[:6]}") + oid = commit["commit_oid"] + r = ov(["snapshot", "show", oid, "-o", "json"], timeout=60) + assert r["exit_code"] == 0, f"snapshot show failed: {r['stderr'][:300]}" + assert r["json"] is not None and r["json"].get("ok") is True + meta = r["json"]["result"] + assert meta.get("oid") == oid or meta.get("oid", "").startswith(oid[:12]) + assert "tree" in meta and "author" in meta + + def test_show_blob_to_stdout(self, test_file_uri, tmp_path): + commit = _commit(f"show-stdout setup {uuid.uuid4().hex[:6]}") + oid = commit["commit_oid"] + + # Capture canonical bytes via `get` (writes the file directly — no shell echo + # framing). 
+ canonical_path = tmp_path / "canonical.bin" + r_get = ov( + ["get", test_file_uri, str(canonical_path), "-o", "json"], + timeout=60, + ) + assert r_get["exit_code"] == 0, f"get failed: {r_get['stderr'][:300]}" + expected_bytes = canonical_path.read_bytes() + + r_show = ov(["snapshot", "show", oid, "--path", test_file_uri], timeout=60) + assert r_show["exit_code"] == 0, f"snapshot show failed: {r_show['stderr'][:300]}" + + # `ov()` returns stdout as a stripped str; the CLI's `echo_command=True` may + # prepend the command line. Locate the blob body by suffix match: stdout must + # end with the file's content (with the encoding `ov()` used to decode bytes). + try: + expected_text = expected_bytes.decode("utf-8") + except UnicodeDecodeError: + pytest.skip("fixture content is not utf-8; this test assumes a text blob") + assert r_show["stdout"].endswith(expected_text.rstrip()), ( + f"snapshot show stdout did not end with canonical blob content; " + f"tail of show stdout: {r_show['stdout'][-200:]!r} vs " + f"expected tail: {expected_text[-200:]!r}" + ) + + def test_show_blob_to_file(self, test_file_uri, tmp_path): + commit = _commit(f"show-blob setup {uuid.uuid4().hex[:6]}") + oid = commit["commit_oid"] + + canonical_path = tmp_path / "canonical.bin" + r_get = ov( + ["get", test_file_uri, str(canonical_path), "-o", "json"], + timeout=60, + ) + assert r_get["exit_code"] == 0, f"get failed: {r_get['stderr'][:300]}" + expected_bytes = canonical_path.read_bytes() + + out_path = tmp_path / "blob.bin" + r = ov( + [ + "snapshot", + "show", + oid, + "--path", + test_file_uri, + "--out-file", + str(out_path), + ], + timeout=60, + ) + assert r["exit_code"] == 0, f"snapshot show --out-file failed: {r['stderr'][:300]}" + assert out_path.exists(), f"out-file {out_path} should exist" + + contents = out_path.read_bytes() + assert contents == expected_bytes, ( + f"out-file bytes ({len(contents)} bytes) should match canonical " + f"({len(expected_bytes)} bytes)" + ) + # Stderr summary 
must report bytes + blob-oid + path. + # Note: the oid in the summary is the *blob* oid (from X-Snapshot-Oid), + # not the commit oid. + import re as _re + assert ( + "Wrote" in r["stderr"] + and "bytes from" in r["stderr"] + and _re.search(r"from [0-9a-f]{7,}", r["stderr"]) + and str(out_path) in r["stderr"] + ), f"missing stderr summary, got: {r['stderr'][:300]}" + + def test_show_blob_stdout_byte_exact_via_subprocess(self, test_file_uri, tmp_path): + """Run `snapshot show --path X` without --out-file and capture raw + bytes — must match the file content exactly (no UTF-8 decoding, no + trailing-newline tolerance). The `ov()` helper decodes as text and + strips, which hides binary-mangling bugs. + """ + import subprocess + from conftest import CLI_BIN, _env, _inject_global_args + + commit = _commit(f"show-stdout-bytes setup {uuid.uuid4().hex[:6]}") + oid = commit["commit_oid"] + + canonical_path = tmp_path / "canonical.bin" + r_get = ov( + ["get", test_file_uri, str(canonical_path), "-o", "json"], timeout=60 + ) + assert r_get["exit_code"] == 0, f"get failed: {r_get['stderr'][:300]}" + expected_bytes = canonical_path.read_bytes() + + # Run the CLI ourselves so we can capture stdout as bytes. + args = _inject_global_args(["snapshot", "show", oid, "--path", test_file_uri]) + proc = subprocess.run( + [CLI_BIN] + args, + capture_output=True, + timeout=60, + env=_env(), + ) + assert proc.returncode == 0, ( + f"snapshot show stdout exit={proc.returncode}, " + f"stderr={proc.stderr.decode('utf-8', errors='replace')[:300]}" + ) + + # The CLI's echo_command prefix may land on stdout. Bytes must + # END with the canonical payload (no decoding, no rstrip). 
+ assert proc.stdout.endswith(expected_bytes), ( + f"snapshot show stdout did not end with canonical bytes; " + f"got tail (hex): {proc.stdout[-64:].hex()} vs " + f"expected tail (hex): {expected_bytes[-64:].hex()}" + ) + + # The stderr summary line ("Read N bytes from ") must be + # present — this is the user-visible signal that bytes flowed. + # The oid in the summary is the blob oid (X-Snapshot-Oid), not the + # commit oid we asked about. + import re as _re + stderr_text = proc.stderr.decode("utf-8", errors="replace") + assert ( + "Read" in stderr_text + and "bytes from" in stderr_text + and _re.search(r"from [0-9a-f]{7,}", stderr_text) + ), f"expected stderr summary 'Read N bytes from ', got: {stderr_text[:300]}" + + +class TestSnapshotRestore: + def test_restore_dry_run_does_not_mutate(self, test_pack_uri): + # Capture ls before + ls_before = ov(["ls", "viking://resources", "-r", "-o", "json", "-n", "50"], timeout=60) + assert ls_before["exit_code"] == 0 + + commit = _commit(f"restore-dry setup {uuid.uuid4().hex[:6]}") + oid = commit["commit_oid"] + r = ov( + [ + "snapshot", + "restore", + oid, + "viking://resources", + "--dry-run", + "-o", + "json", + ], + timeout=60, + ) + assert r["exit_code"] == 0, f"snapshot restore --dry-run failed: {r['stderr'][:300]}" + assert r["json"] is not None and r["json"].get("ok") is True + result = r["json"]["result"] + # Dry-run shape includes a "diff" key + assert "diff" in result, f"expected diff in dry-run result, got keys: {list(result.keys())}" + + # ls should be unchanged after dry-run + ls_after = ov(["ls", "viking://resources", "-r", "-o", "json", "-n", "50"], timeout=60) + assert ls_after["exit_code"] == 0 + # Compare result lists if both present + if ls_before["json"] and ls_after["json"]: + assert ls_before["json"].get("result") == ls_after["json"].get("result"), ( + "ls output should be unchanged after dry-run restore" + ) + + def test_restore_dry_run_without_project_dir(self, test_pack_uri): + commit = 
_commit(f"restore-full-dry setup {uuid.uuid4().hex[:6]}") + oid = commit["commit_oid"] + r = ov( + [ + "snapshot", + "restore", + oid, + "--dry-run", + "-o", + "json", + ], + timeout=60, + ) + assert r["exit_code"] == 0, f"snapshot restore full --dry-run failed: {r['stderr'][:300]}" + assert r["json"] is not None and r["json"].get("ok") is True + result = r["json"]["result"] + assert "diff" in result or result.get("result") == "noop" diff --git a/tests/client/test_git_versioning.py b/tests/client/test_git_versioning.py new file mode 100644 index 0000000000..3edc0d8952 --- /dev/null +++ b/tests/client/test_git_versioning.py @@ -0,0 +1,347 @@ +"""End-to-end tests for the OpenViking.snapshot namespace. + +These exercise the user-facing namespace path: +OpenViking -> LocalClient -> FSService -> VikingFS -> RAGFSBindingClient -> Rust GitService. +""" + +from __future__ import annotations + +import re +import shutil +import tempfile +from pathlib import Path +from typing import NamedTuple, Tuple +from unittest.mock import MagicMock + +import pytest + +ragfs_python = pytest.importorskip("ragfs_python") + +from openviking.async_client import AsyncOpenViking +from openviking.client.local import LocalClient +from openviking.pyagfs.exceptions import AGFSNotSupportedError +from openviking.server.identity import RequestContext, Role +from openviking_cli.exceptions import InvalidURIError +from openviking.service.fs_service import FSService +from openviking.storage.viking_fs import VikingFS +from openviking.sync_client import SyncOpenViking +from openviking_cli.session.user_id import UserIdentifier + + +OID_RE = re.compile(r"^[0-9a-f]{40}$") +DEFAULT_AUTHOR_NAME = VikingFS._DEFAULT_GIT_AUTHOR_NAME + + +class ClientHarness(NamedTuple): + client: SyncOpenViking + async_client: AsyncOpenViking + vfs: VikingFS + ctx: RequestContext + + +def _make_ctx(account: str = "acct_t", user: str = "user1") -> RequestContext: + return RequestContext(user=UserIdentifier(account, user), 
role=Role.ROOT) + + +def _write_workspace(tmp_root: Path) -> Tuple[Path, Path]: + """Create ragfs config and backing localfs root for git-enabled tests.""" + fs_root = tmp_root / "fs" + git_root = tmp_root / "git" + fs_root.mkdir(parents=True, exist_ok=True) + git_root.mkdir(parents=True, exist_ok=True) + cfg = tmp_root / "ragfs.toml" + cfg.write_text( + f""" +[git] +enabled = true +backend = "local" +default_branch = "main" +author_name = "test-bot" +author_email = "test@example.com" + +[git.local] +base_dir = "{git_root}" +""" + ) + return cfg, fs_root + + +def _write_disabled_workspace(tmp_root: Path) -> Tuple[Path, Path]: + fs_root = tmp_root / "fs" + fs_root.mkdir(parents=True, exist_ok=True) + cfg = tmp_root / "ragfs.toml" + cfg.write_text( + """ +[git] +enabled = false +""" + ) + return cfg, fs_root + + +def _build_binding_client(config_path: Path, fs_root: Path): + client = ragfs_python.RAGFSBindingClient(git_config_path=str(config_path)) + client.mount("localfs", "/local", {"local_dir": str(fs_root)}) + return client + + +def _build_harness(config_path: Path, fs_root: Path) -> ClientHarness: + from openviking.storage.transaction import init_lock_manager + + ctx = _make_ctx() + binding_client = _build_binding_client(config_path, fs_root) + init_lock_manager(binding_client) + vfs = VikingFS(agfs=binding_client) + + fs_service = FSService() + fs_service.set_dependencies(viking_fs=vfs) + + local_client = object.__new__(LocalClient) + local_client._service = MagicMock() + local_client._service.fs = fs_service + local_client._ctx = ctx + + async_client = object.__new__(AsyncOpenViking) + async_client._client = local_client + async_client._initialized = True + async_client._singleton_initialized = True + async_client._snapshot = None + + sync_client = object.__new__(SyncOpenViking) + sync_client._async_client = async_client + sync_client._initialized = True + sync_client._snapshot = None + + return ClientHarness( + client=sync_client, + async_client=async_client, 
+ vfs=vfs, + ctx=ctx, + ) + + +@pytest.fixture +def workspace(): + root = Path(tempfile.mkdtemp(prefix="ov-client-git-")) + try: + yield root + finally: + shutil.rmtree(root, ignore_errors=True) + + +@pytest.fixture +def git_harness(workspace) -> ClientHarness: + from openviking.storage.transaction import reset_lock_manager + + cfg, fs_root = _write_workspace(workspace) + try: + yield _build_harness(cfg, fs_root) + finally: + reset_lock_manager() + + +@pytest.fixture +def git_disabled_harness(workspace) -> ClientHarness: + from openviking.storage.transaction import reset_lock_manager + + cfg, fs_root = _write_disabled_workspace(workspace) + try: + yield _build_harness(cfg, fs_root) + finally: + reset_lock_manager() + + +async def test_write_commit_show_roundtrip(git_harness): + await git_harness.vfs.write_file( + "viking://resources/a.md", + b"hello", + ctx=git_harness.ctx, + ) + + commit = git_harness.client.snapshot.commit( + message="initial", + paths=["viking://resources/a.md"], + ) + + assert commit["result"] == "created" + assert OID_RE.match(commit["commit_oid"]) + assert git_harness.client.snapshot.show( + "main", + path="viking://resources/a.md", + ) == b"hello" + + +async def test_show_metadata_without_path(git_harness): + await git_harness.vfs.write_file( + "viking://resources/meta.md", + b"metadata", + ctx=git_harness.ctx, + ) + commit = git_harness.client.snapshot.commit( + message="metadata commit", + paths=["viking://resources/meta.md"], + ) + + metadata = git_harness.client.snapshot.show("main") + + assert metadata["oid"] == commit["commit_oid"] + assert metadata["message"].startswith("metadata commit") + assert metadata["author"]["name"] == DEFAULT_AUTHOR_NAME + assert metadata["parents"] == [] + + +async def test_log_walks_parents(git_harness): + commits = [] + for idx, body in enumerate((b"v1", b"v2", b"v3"), start=1): + await git_harness.vfs.write_file( + "viking://resources/log.md", + body, + ctx=git_harness.ctx, + ) + commits.append( + 
git_harness.client.snapshot.commit( + message=f"c{idx}", + paths=["viking://resources/log.md"], + ) + ) + + history = git_harness.client.snapshot.log(limit=10) + limited = git_harness.client.snapshot.log(limit=2) + + assert [item["oid"] for item in history] == [ + commits[2]["commit_oid"], + commits[1]["commit_oid"], + commits[0]["commit_oid"], + ] + assert [item["oid"] for item in limited] == [ + commits[2]["commit_oid"], + commits[1]["commit_oid"], + ] + + +async def test_restore_reverts_file_and_advances_head(git_harness): + await git_harness.vfs.write_file( + "viking://resources/proj/a.md", + b"v1", + ctx=git_harness.ctx, + ) + v1 = git_harness.client.snapshot.commit( + message="v1", + paths=["viking://resources/proj/a.md"], + ) + + await git_harness.vfs.write_file( + "viking://resources/proj/a.md", + b"v2", + ctx=git_harness.ctx, + ) + v2 = git_harness.client.snapshot.commit( + message="v2", + paths=["viking://resources/proj/a.md"], + ) + + restore = git_harness.client.snapshot.restore( + project_dir="viking://resources/proj", + source_commit=v1["commit_oid"], + ) + + assert restore["result"] == "applied" + assert restore["source_commit"] == v1["commit_oid"] + assert restore["parent_commit"] == v2["commit_oid"] + assert restore["new_commit_oid"] != v2["commit_oid"] + assert await git_harness.vfs.read( + "viking://resources/proj/a.md", + ctx=git_harness.ctx, + ) == b"v1" + assert git_harness.client.snapshot.show("main")["parents"] == [v2["commit_oid"]] + + +async def test_restore_dry_run_does_not_mutate(git_harness): + await git_harness.vfs.write_file( + "viking://resources/proj/a.md", + b"v1", + ctx=git_harness.ctx, + ) + v1 = git_harness.client.snapshot.commit( + message="v1", + paths=["viking://resources/proj/a.md"], + ) + await git_harness.vfs.write_file( + "viking://resources/proj/a.md", + b"v2", + ctx=git_harness.ctx, + ) + git_harness.client.snapshot.commit( + message="v2", + paths=["viking://resources/proj/a.md"], + ) + before_log = 
git_harness.client.snapshot.log() + + dry_run = git_harness.client.snapshot.restore( + project_dir="viking://resources/proj", + source_commit=v1["commit_oid"], + dry_run=True, + ) + + assert dry_run["result"] == "dry_run" + assert any(item["path"] == "a.md" for item in dry_run["diff"]["to_write"]) + assert await git_harness.vfs.read( + "viking://resources/proj/a.md", + ctx=git_harness.ctx, + ) == b"v2" + assert len(git_harness.client.snapshot.log()) == len(before_log) + + +async def test_restore_internal_scope_rejected(git_harness): + await git_harness.vfs.write_file( + "viking://resources/a.md", + b"content", + ctx=git_harness.ctx, + ) + commit = git_harness.client.snapshot.commit( + message="commit", + paths=["viking://resources/a.md"], + ) + + # Client-level calls cross FSService first; its URI validator rejects + # internal scopes before VikingFS.restore can raise ValueError. + with pytest.raises(InvalidURIError): + git_harness.client.snapshot.restore( + project_dir="viking://temp/x", + source_commit=commit["commit_oid"], + ) + + +async def test_disabled_raises_not_supported(git_disabled_harness): + with pytest.raises(AGFSNotSupportedError): + git_disabled_harness.client.snapshot.commit(message="disabled") + with pytest.raises(AGFSNotSupportedError): + git_disabled_harness.client.snapshot.show("main") + with pytest.raises(AGFSNotSupportedError): + git_disabled_harness.client.snapshot.restore( + project_dir="viking://resources/proj", + source_commit="main", + ) + with pytest.raises(AGFSNotSupportedError): + git_disabled_harness.client.snapshot.log() + + +async def test_async_api_parity(git_harness): + await git_harness.vfs.write_file( + "viking://resources/async.md", + b"async hello", + ctx=git_harness.ctx, + ) + + commit = await git_harness.async_client.snapshot.commit( + message="async initial", + paths=["viking://resources/async.md"], + ) + body = await git_harness.async_client.snapshot.show( + "main", + path="viking://resources/async.md", + ) + + assert 
commit["result"] == "created" + assert OID_RE.match(commit["commit_oid"]) + assert body == b"async hello" diff --git a/tests/client/test_git_versioning_http.py b/tests/client/test_git_versioning_http.py new file mode 100644 index 0000000000..ae680496cf --- /dev/null +++ b/tests/client/test_git_versioning_http.py @@ -0,0 +1,241 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: AGPL-3.0 +"""End-to-end parity tests for client.snapshot.* over HTTP. + +These exercise the AsyncHTTPClient.snapshot namespace surface that mirrors +the LocalClient.snapshot surface covered by tests/client/test_git_versioning.py, +routed through AsyncHTTPClient -> real FastAPI server (via httpx +ASGITransport) -> real OpenVikingService -> real VikingFS. + +The full stack is genuine: real httpx response parsing, real envelope +handling, real X-Snapshot-* header round-tripping. No mocks at the +client.snapshot or AsyncHTTPClient layer. +""" + +from __future__ import annotations + +import re +import shutil +import uuid +from pathlib import Path +from typing import AsyncGenerator + +import httpx +import pytest +import pytest_asyncio + +ragfs_python = pytest.importorskip("ragfs_python") + +from openviking.models.embedder.base import DenseEmbedderBase, EmbedResult +from openviking.server.app import create_app +from openviking.server.config import ServerConfig +from openviking.server.identity import RequestContext, Role +from openviking.service.core import OpenVikingService +from openviking.storage.transaction import reset_lock_manager +from openviking_cli.client.http import AsyncHTTPClient +from openviking_cli.session.user_id import UserIdentifier +from openviking_cli.utils.config.embedding_config import EmbeddingConfig +from openviking_cli.utils.config.vlm_config import VLMConfig + + +pytestmark = pytest.mark.asyncio + +OID_RE = re.compile(r"^[0-9a-f]{40}$") + +PROJECT_ROOT = Path(__file__).parent.parent.parent +HTTP_TEST_TMP_DIR = PROJECT_ROOT / 
"test_data" / "tmp_client_git_http" + + +def _install_fake_embedder(monkeypatch): + dimension = 1024 + + class FakeEmbedder(DenseEmbedderBase): + def __init__(self): + super().__init__(model_name="test-fake-embedder") + + def embed(self, text: str, is_query: bool = False) -> EmbedResult: + return EmbedResult(dense_vector=[0.1] * dimension) + + def embed_batch(self, texts, is_query: bool = False): + return [self.embed(t, is_query=is_query) for t in texts] + + def get_dimension(self) -> int: + return dimension + + monkeypatch.setattr(EmbeddingConfig, "get_embedder", lambda self: FakeEmbedder()) + return FakeEmbedder + + +def _install_fake_vlm(monkeypatch): + async def _fake_get_completion(self, prompt, thinking=False): + return "fake summary" + + async def _fake_get_vision_completion(self, prompt, images, thinking=False): + return "fake vision" + + monkeypatch.setattr(VLMConfig, "is_available", lambda self: True) + monkeypatch.setattr(VLMConfig, "get_completion_async", _fake_get_completion) + monkeypatch.setattr(VLMConfig, "get_vision_completion_async", _fake_get_vision_completion) + + +@pytest.fixture(scope="function") +def http_temp_dir(): + unique = HTTP_TEST_TMP_DIR / uuid.uuid4().hex[:8] + unique.mkdir(parents=True, exist_ok=True) + yield unique + shutil.rmtree(unique, ignore_errors=True) + + +@pytest_asyncio.fixture(scope="function") +async def http_service(http_temp_dir: Path, monkeypatch): + """Stand up a real OpenVikingService backed by a temp data dir.""" + reset_lock_manager() + fake_embedder_cls = _install_fake_embedder(monkeypatch) + _install_fake_vlm(monkeypatch) + + svc = OpenVikingService( + path=str(http_temp_dir / "data"), + user=UserIdentifier.the_default_user("git_http_test_user"), + ) + await svc.initialize() + svc.viking_fs.query_embedder = fake_embedder_cls() + + test_ctx = RequestContext( + user=UserIdentifier("git_http_test_account", "git_http_test_user"), + role=Role.ADMIN, + ) + await svc.initialize_account_directories(test_ctx) + await 
svc.initialize_user_directories(test_ctx) + try: + yield svc + finally: + await svc.close() + reset_lock_manager() + + +@pytest_asyncio.fixture(scope="function") +async def http_app(http_service: OpenVikingService): + """FastAPI app with the test service wired in (no auth).""" + from openviking.server.auth.plugins import DevAuthPlugin + from openviking.server.auth.registry import get_registry + from openviking.server.dependencies import set_service + + config = ServerConfig() + app = create_app(config=config, service=http_service) + set_service(http_service) + # ASGITransport doesn't trigger lifespan, so wire up the auth plugin manually. + registry = get_registry() + if registry.get("dev") is None: + registry.register(DevAuthPlugin) + app.state.auth_plugin = registry.get("dev")() + return app + + +@pytest_asyncio.fixture(scope="function") +async def http_git_client(http_app) -> AsyncGenerator[AsyncHTTPClient, None]: + """Real AsyncHTTPClient whose underlying httpx talks to the ASGI app. + + The returned client exposes the production `.snapshot` namespace; the only + swap is the transport — every other layer is the real stack. 
+ """ + client = AsyncHTTPClient( + url="http://testserver", + api_key="test-key", + account="git_http_test_account", + user="git_http_test_user", + ) + transport = httpx.ASGITransport(app=http_app) + headers = { + "X-API-Key": "test-key", + "X-OpenViking-Account": "git_http_test_account", + "X-OpenViking-User": "git_http_test_user", + } + client._http = httpx.AsyncClient( + transport=transport, + base_url="http://testserver", + headers=headers, + timeout=30.0, + ) + try: + yield client + finally: + await client._http.aclose() + + +async def _write_blob(service: OpenVikingService, uri: str, body: bytes) -> None: + ctx = RequestContext( + user=UserIdentifier("git_http_test_account", "git_http_test_user"), + role=Role.ROOT, + ) + await service.viking_fs.write_file(uri, body, ctx=ctx) + + +async def test_http_commit_and_log_roundtrip(http_git_client, http_service): + client = http_git_client + + await _write_blob(http_service, "viking://resources/http_a.md", b"hello-http") + + commit = await client.snapshot.commit(message="http parity") + assert commit["result"] in ("created", "noop") + assert isinstance(commit["commit_oid"], str) + assert OID_RE.match(commit["commit_oid"]) + + log = await client.snapshot.log(limit=5) + assert isinstance(log, list) and len(log) >= 1 + assert "oid" in log[0] and "message" in log[0] + + +async def test_http_show_blob_byte_exact_roundtrip(http_git_client, http_service): + client = http_git_client + blob_uri = "viking://resources/http_show_blob.txt" + expected = b"byte exact \x00\x01\x02 payload\n" + + await _write_blob(http_service, blob_uri, expected) + commit = await client.snapshot.commit(message="with blob") + assert OID_RE.match(commit["commit_oid"]) + + result = await client.snapshot.show(commit["commit_oid"], path=blob_uri) + assert isinstance(result, dict) + assert result["bytes"] == expected + assert result["size"] == len(expected) + assert OID_RE.match(result["oid"]) + + +async def 
test_http_show_metadata_without_path(http_git_client, http_service): + client = http_git_client + + await _write_blob(http_service, "viking://resources/http_meta.md", b"metadata") + commit = await client.snapshot.commit(message="meta commit") + + meta = await client.snapshot.show(commit["commit_oid"]) + assert meta["oid"] == commit["commit_oid"] + assert meta["message"].startswith("meta commit") + assert meta["parents"] == [] + + +async def test_http_restore_dry_run_does_not_mutate(http_git_client, http_service): + client = http_git_client + + await _write_blob(http_service, "viking://resources/proj/a.md", b"v1") + v1 = await client.snapshot.commit(message="v1") + assert OID_RE.match(v1["commit_oid"]) + + await _write_blob(http_service, "viking://resources/proj/a.md", b"v2") + v2 = await client.snapshot.commit(message="v2") + assert v2["commit_oid"] != v1["commit_oid"] + + log_before = await client.snapshot.log(limit=10) + + dry = await client.snapshot.restore( + project_dir="viking://resources/proj", + source_commit=v1["commit_oid"], + dry_run=True, + ) + + assert "diff" in dry or dry.get("result") == "noop" + + blob_after = await client.snapshot.show(v2["commit_oid"], path="viking://resources/proj/a.md") + assert blob_after["bytes"] == b"v2" + + log_after = await client.snapshot.log(limit=10) + assert len(log_after) == len(log_before) diff --git a/tests/client/test_http_client_snapshot.py b/tests/client/test_http_client_snapshot.py new file mode 100644 index 0000000000..94c5769408 --- /dev/null +++ b/tests/client/test_http_client_snapshot.py @@ -0,0 +1,214 @@ +"""Unit tests for AsyncHTTPClient git_* methods that drive /api/v1/snapshot/*.""" + +from typing import Any, Dict, List, Optional + +import pytest + +from openviking_cli.client.http import AsyncHTTPClient + +pytestmark = pytest.mark.asyncio + + +class _FakeHTTPClient: + """Records the last request and returns a canned response.""" + + def __init__(self): + self.calls: List[Dict[str, Any]] = [] + 
self.next_response: Any = None + + async def get(self, path, *, params=None, headers=None): + self.calls.append({"method": "GET", "path": path, "params": params, "headers": headers}) + return self.next_response + + async def post(self, path, *, json=None, headers=None): + self.calls.append({"method": "POST", "path": path, "json": json, "headers": headers}) + return self.next_response + + +def _client_with_fake() -> tuple[AsyncHTTPClient, _FakeHTTPClient]: + client = AsyncHTTPClient(url="http://localhost:1933") + fake = _FakeHTTPClient() + client._http = fake + client._handle_response = lambda response: {"commit_oid": "a" * 40, "result": "created", "changed": 1} + return client, fake + + +async def test_git_commit_posts_to_snapshot_commit(): + client, fake = _client_with_fake() + fake.next_response = object() + + result = await client.git_commit( + message="hello", + paths=["viking://resources/a.md"], + branch="main", + author_name="bot", + author_email="bot@example.com", + ) + + assert result == {"commit_oid": "a" * 40, "result": "created", "changed": 1} + call = fake.calls[-1] + assert call["method"] == "POST" + assert call["path"] == "/api/v1/snapshot/commit" + assert call["json"] == { + "message": "hello", + "paths": ["viking://resources/a.md"], + "branch": "main", + "author_name": "bot", + "author_email": "bot@example.com", + } + + +async def test_git_commit_omits_none_fields(): + client, fake = _client_with_fake() + fake.next_response = object() + + await client.git_commit(message="hi") + + call = fake.calls[-1] + assert call["json"] == {"message": "hi", "branch": "main"} + + +async def test_git_restore_omits_project_dir_when_none(): + client, fake = _client_with_fake() + client._handle_response = lambda resp: {"result": "noop", "head": "h" * 40, "source": "s" * 40} + fake.next_response = object() + + result = await client.git_restore( + source_commit="s" * 40, + dry_run=True, + ) + + assert result["result"] == "noop" + call = fake.calls[-1] + assert 
call["method"] == "POST" + assert call["path"] == "/api/v1/snapshot/restore" + assert call["json"] == { + "source_commit": "s" * 40, + "branch": "main", + "dry_run": True, + } + + +async def test_git_restore_includes_project_dir_when_provided(): + client, fake = _client_with_fake() + client._handle_response = lambda resp: {"result": "noop", "head": "h" * 40, "source": "s" * 40} + fake.next_response = object() + + await client.git_restore( + project_dir="viking://resources", + source_commit="s" * 40, + dry_run=True, + ) + + call = fake.calls[-1] + assert call["json"] == { + "source_commit": "s" * 40, + "branch": "main", + "dry_run": True, + "project_dir": "viking://resources", + } + + +class _FakeBinaryResponse: + """Mimics enough of httpx.Response for the show-blob branch.""" + + def __init__(self, content: bytes, oid: str, size: int): + self.content = content + self.status_code = 200 + self.headers = { + "content-type": "application/octet-stream", + "x-snapshot-oid": oid, + "x-snapshot-size": str(size), + } + + +async def test_git_show_metadata_uses_handle_response(): + client, fake = _client_with_fake() + client._handle_response = lambda resp: {"oid": "c" * 40, "tree": "t" * 40, "message": "m"} + fake.next_response = type("R", (), {"headers": {"content-type": "application/json"}})() + + result = await client.git_show(target_ref="c" * 40) + + assert result == {"oid": "c" * 40, "tree": "t" * 40, "message": "m"} + call = fake.calls[-1] + assert call["method"] == "GET" + assert call["path"] == "/api/v1/snapshot/show" + assert call["params"] == {"target_ref": "c" * 40} + + +async def test_git_show_blob_returns_envelope_from_headers(): + client = AsyncHTTPClient(url="http://localhost:1933") + fake = _FakeHTTPClient() + client._http = fake + + fake.next_response = _FakeBinaryResponse(b"hello\n", oid="b" * 40, size=6) + + result = await client.git_show(target_ref="c" * 40, path="viking://resources/x.txt") + + assert result == {"oid": "b" * 40, "size": 6, "bytes": 
b"hello\n"} + call = fake.calls[-1] + assert call["params"] == {"target_ref": "c" * 40, "path": "viking://resources/x.txt"} + + +async def test_git_show_blob_handles_case_insensitive_headers(): + """A real httpx.Response uses case-insensitive headers (httpx.Headers). + Confirm the client reads x-snapshot-* correctly even when the server + capitalises them (FastAPI may send X-Snapshot-Oid as-is). + """ + import httpx + + client = AsyncHTTPClient(url="http://localhost:1933") + fake = _FakeHTTPClient() + client._http = fake + + class _CaseyResponse: + def __init__(self, content: bytes, oid: str, size: int): + self.content = content + self.status_code = 200 + self.headers = httpx.Headers( + { + "Content-Type": "application/octet-stream", + "X-Snapshot-Oid": oid, + "X-Snapshot-Size": str(size), + } + ) + + fake.next_response = _CaseyResponse(b"\x00\x01\x02", oid="e" * 40, size=3) + result = await client.git_show(target_ref="ref", path="viking://resources/bin.dat") + assert result == {"oid": "e" * 40, "size": 3, "bytes": b"\x00\x01\x02"} + + +async def test_git_show_blob_missing_headers_degrades_gracefully(): + """Server bug / proxy strip: headers absent but content-type still binary. + Client must not crash — empty oid and size=0 are acceptable fallbacks + (the byte payload is what matters for the caller). 
+ """ + client = AsyncHTTPClient(url="http://localhost:1933") + fake = _FakeHTTPClient() + client._http = fake + + class _NoHeadersResponse: + def __init__(self, content: bytes): + self.content = content + self.status_code = 200 + self.headers = {"content-type": "application/octet-stream"} + + fake.next_response = _NoHeadersResponse(b"payload") + result = await client.git_show(target_ref="r", path="viking://resources/x") + assert result["bytes"] == b"payload" + assert result["oid"] == "" + assert result["size"] == 0 + + +async def test_git_log_gets_with_params(): + client, fake = _client_with_fake() + client._handle_response = lambda resp: [{"oid": "d" * 40, "message": "x"}] + fake.next_response = object() + + result = await client.git_log(branch="main", limit=5) + + assert result == [{"oid": "d" * 40, "message": "x"}] + call = fake.calls[-1] + assert call["method"] == "GET" + assert call["path"] == "/api/v1/snapshot/log" + assert call["params"] == {"branch": "main", "limit": 5} diff --git a/tests/server/test_api_snapshot.py b/tests/server/test_api_snapshot.py new file mode 100644 index 0000000000..a83defd49b --- /dev/null +++ b/tests/server/test_api_snapshot.py @@ -0,0 +1,441 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: AGPL-3.0 +"""End-to-end tests for /api/v1/snapshot/*.""" + +import pytest +import pytest_asyncio + +import httpx + +pytestmark = pytest.mark.asyncio + + +@pytest_asyncio.fixture(scope="function") +async def client_with_no_repo(app): + """Plain in-process client with no resources or commits added. + + The conftest's ``app`` fixture wires the service into the global + dependency store without authentication, so a vanilla AsyncClient + is enough to hit ``/api/v1/snapshot/log`` against an empty repo. 
+ """ + transport = httpx.ASGITransport(app=app) + async with httpx.AsyncClient(transport=transport, base_url="http://testserver") as c: + yield c + + +async def test_commit_creates_snapshot(client_with_resource): + client, _root_uri = client_with_resource + resp = await client.post( + "/api/v1/snapshot/commit", + json={"message": "first snapshot"}, + ) + assert resp.status_code == 200, resp.text + body = resp.json() + assert body["status"] == "ok" + result = body["result"] + assert result["result"] in ("created", "noop") + assert isinstance(result["commit_oid"], str) and len(result["commit_oid"]) == 40 + + +async def test_log_returns_recent_commits(client_with_resource): + client, _ = client_with_resource + await client.post("/api/v1/snapshot/commit", json={"message": "for log"}) + + resp = await client.get("/api/v1/snapshot/log", params={"branch": "main", "limit": 5}) + assert resp.status_code == 200, resp.text + body = resp.json() + assert body["status"] == "ok" + log = body["result"] + assert isinstance(log, list) and len(log) >= 1 + assert "oid" in log[0] and "message" in log[0] + + +async def test_log_empty_repo_returns_404(client_with_no_repo): + """When the branch has no commits, /log should surface 404.""" + client = client_with_no_repo + resp = await client.get("/api/v1/snapshot/log", params={"branch": "main", "limit": 5}) + assert resp.status_code == 404 + assert resp.json()["status"] == "error" + + +@pytest_asyncio.fixture(scope="function") +async def client_with_resource_and_blob(client_with_resource, service): + """client_with_resource + a known blob written via VikingFS.write_file, then committed.""" + from openviking.server.identity import RequestContext, Role + from openviking_cli.session.user_id import UserIdentifier + + client, _root = client_with_resource + blob_uri = "viking://resources/snapshot_blob_fixture.txt" + expected_bytes = b"hello from snapshot fixture\n" + + ctx = RequestContext(user=UserIdentifier.the_default_user(), role=Role.ROOT) 
+ await service.viking_fs.write_file(blob_uri, expected_bytes, ctx=ctx) + + commit_resp = await client.post( + "/api/v1/snapshot/commit", + json={"message": "with blob"}, + ) + assert commit_resp.status_code == 200, commit_resp.text + commit_oid = commit_resp.json()["result"]["commit_oid"] + + yield client, commit_oid, blob_uri, expected_bytes + + +async def test_restore_dry_run_does_not_mutate(client_with_resource): + client, _root = client_with_resource + v1 = (await client.post("/api/v1/snapshot/commit", json={"message": "v1"})).json()["result"] + + resp = await client.post( + "/api/v1/snapshot/restore", + json={ + "project_dir": "viking://resources", + "source_commit": v1["commit_oid"], + "dry_run": True, + }, + ) + assert resp.status_code == 200, resp.text + body = resp.json() + assert body["status"] == "ok" + result = body["result"] + # Per VikingFS.restore contract, dry_run responses carry 'diff'. + assert "diff" in result or result.get("result") == "noop" + + +async def test_show_commit_metadata(client_with_resource): + client, _ = client_with_resource + commit = (await client.post("/api/v1/snapshot/commit", json={"message": "meta"})).json()["result"] + resp = await client.get( + "/api/v1/snapshot/show", + params={"target_ref": commit["commit_oid"]}, + ) + assert resp.status_code == 200 + body = resp.json() + assert body["status"] == "ok" + meta = body["result"] + assert meta["oid"] == commit["commit_oid"] + assert "tree" in meta and "message" in meta + + +async def test_show_blob_returns_binary_with_headers(client_with_resource_and_blob): + """show?path= must return raw bytes + X-Snapshot-* headers.""" + client, commit_oid, blob_uri, expected_bytes = client_with_resource_and_blob + resp = await client.get( + "/api/v1/snapshot/show", + params={"target_ref": commit_oid, "path": blob_uri}, + ) + assert resp.status_code == 200 + assert resp.headers["content-type"].startswith("application/octet-stream") + assert "x-snapshot-oid" in {k.lower() for k in 
resp.headers} + assert "x-snapshot-size" in {k.lower() for k in resp.headers} + assert int(resp.headers["x-snapshot-size"]) == len(expected_bytes) + assert resp.content == expected_bytes + + +async def test_show_path_not_found_returns_404(client_with_resource): + client, _ = client_with_resource + commit = (await client.post("/api/v1/snapshot/commit", json={"message": "for 404"})).json()["result"] + resp = await client.get( + "/api/v1/snapshot/show", + params={"target_ref": commit["commit_oid"], "path": "viking://resources/does_not_exist.txt"}, + ) + assert resp.status_code == 404 + assert resp.json()["status"] == "error" + + +# --------------------------------------------------------------------------- +# restore (apply) — forward-commit chain + reindex hook + concurrent-commit 409 +# --------------------------------------------------------------------------- + + +async def test_restore_apply_advances_head_with_forward_commit(client_with_resource_and_blob, service): + """End-to-end restore (dry_run=False) over HTTP: verify forward-commit + semantics — the new commit's parent is the previous HEAD, NOT the source + commit, and the workspace bytes match the source. + """ + from openviking.server.identity import RequestContext, Role + from openviking_cli.session.user_id import UserIdentifier + + client, c1_oid, blob_uri, v1_bytes = client_with_resource_and_blob + ctx = RequestContext(user=UserIdentifier.the_default_user(), role=Role.ROOT) + + # Overwrite the blob and commit a second snapshot (c2 becomes new HEAD). + v2_bytes = b"v2 modified content\n" + await service.viking_fs.write_file(blob_uri, v2_bytes, ctx=ctx) + c2_resp = await client.post("/api/v1/snapshot/commit", json={"message": "v2"}) + assert c2_resp.status_code == 200, c2_resp.text + c2 = c2_resp.json()["result"] + assert c2["result"] == "created" + c2_oid = c2["commit_oid"] + + # Apply restore back to c1 over the whole resources scope. 
+ restore_resp = await client.post( + "/api/v1/snapshot/restore", + json={ + "project_dir": "viking://resources", + "source_commit": c1_oid, + }, + ) + assert restore_resp.status_code == 200, restore_resp.text + body = restore_resp.json() + assert body["status"] == "ok" + result = body["result"] + assert result["result"] == "applied" + assert result["source_commit"] == c1_oid + assert result["parent_commit"] == c2_oid # forward-commit: parent = old HEAD + new_oid = result["new_commit_oid"] + assert new_oid not in (c1_oid, c2_oid) + + # HEAD must point at the new commit, whose parent[0] == c2 (NOT c1). + head_resp = await client.get("/api/v1/snapshot/show", params={"target_ref": "main"}) + assert head_resp.status_code == 200 + head = head_resp.json()["result"] + assert head["oid"] == new_oid + assert head["parents"] == [c2_oid] + + # The blob in the restored commit must equal v1. + show_resp = await client.get( + "/api/v1/snapshot/show", + params={"target_ref": new_oid, "path": blob_uri}, + ) + assert show_resp.status_code == 200 + assert show_resp.content == v1_bytes + + +async def test_restore_without_project_dir_restores_full_account_tree(client_with_resource_and_blob, service): + from openviking.server.identity import RequestContext, Role + from openviking_cli.session.user_id import UserIdentifier + + client, c1_oid, blob_uri, v1_bytes = client_with_resource_and_blob + ctx = RequestContext(user=UserIdentifier.the_default_user(), role=Role.ROOT) + extra_uri = "viking://resources/full_restore_new.txt" + + await service.viking_fs.write_file(blob_uri, b"v2 modified content\n", ctx=ctx) + await service.viking_fs.write_file(extra_uri, b"new at v2\n", ctx=ctx) + c2_resp = await client.post("/api/v1/snapshot/commit", json={"message": "v2 full restore setup"}) + assert c2_resp.status_code == 200, c2_resp.text + c2_oid = c2_resp.json()["result"]["commit_oid"] + + restore_resp = await client.post( + "/api/v1/snapshot/restore", + json={"source_commit": c1_oid}, + ) + assert 
restore_resp.status_code == 200, restore_resp.text + result = restore_resp.json()["result"] + assert result["result"] == "applied" + assert result["source_commit"] == c1_oid + assert result["parent_commit"] == c2_oid + assert blob_uri.removeprefix("viking://") in result["written_paths"] + assert extra_uri.removeprefix("viking://") in result["deleted_paths"] + + show_resp = await client.get( + "/api/v1/snapshot/show", + params={"target_ref": result["new_commit_oid"], "path": blob_uri}, + ) + assert show_resp.status_code == 200 + assert show_resp.content == v1_bytes + + missing_resp = await client.get( + "/api/v1/snapshot/show", + params={"target_ref": result["new_commit_oid"], "path": extra_uri}, + ) + assert missing_resp.status_code == 404 + + +async def test_restore_apply_triggers_reindex_hook(client_with_resource_and_blob, service, monkeypatch): + """Verify the HTTP restore path actually invokes the vector-reindex + scheduler — protects the chain router -> viking_fs.restore -> _schedule_vector_rebuild. + """ + from openviking.server.identity import RequestContext, Role + from openviking_cli.session.user_id import UserIdentifier + import openviking.service.reindex_executor as reindex_mod + + client, c1_oid, blob_uri, _v1 = client_with_resource_and_blob + ctx = RequestContext(user=UserIdentifier.the_default_user(), role=Role.ROOT) + + calls: list[str] = [] + + class _SpyExecutor: + async def execute(self, *, uri, mode, wait, ctx): + calls.append(uri) + return {"ok": True} + + monkeypatch.setattr(reindex_mod, "get_reindex_executor", lambda: _SpyExecutor()) + + # Mutate, commit v2, then restore back to c1 — must produce a reindex call. 
+ await service.viking_fs.write_file(blob_uri, b"v2-bytes\n", ctx=ctx) + await client.post("/api/v1/snapshot/commit", json={"message": "v2"}) + + restore_resp = await client.post( + "/api/v1/snapshot/restore", + json={"project_dir": "viking://resources", "source_commit": c1_oid}, + ) + assert restore_resp.status_code == 200, restore_resp.text + assert restore_resp.json()["result"]["result"] == "applied" + + # The restore reindex now runs in a tracked background task; poll a little + # to let start() + the gathered rebuild coroutines flush. + import asyncio + for _ in range(100): + if calls: + break + await asyncio.sleep(0.02) + + assert calls, "expected at least one reindex call after restore apply" + + +async def test_restore_delete_removes_orphaned_vectors(client_with_resource_and_blob, service, monkeypatch): + """Restoring to a revision that predates a file must purge that file's + vectors, not merely skip them. + + ReindexExecutor only upserts from on-disk content and never deletes, so a + file removed by the restore would otherwise leave orphaned vectors behind. + viking_fs.restore must route deleted source paths to the executor's + level-precise delete (DETAIL). 
+ """ + from openviking.server.identity import RequestContext, Role + from openviking_cli.session.user_id import UserIdentifier + import openviking.service.reindex_executor as reindex_mod + + client, c1_oid, _blob_uri, _v1 = client_with_resource_and_blob + ctx = RequestContext(user=UserIdentifier.the_default_user(), role=Role.ROOT) + + deleted_calls: list[tuple] = [] + + class _SpyExecutor: + async def execute(self, *, uri, mode, wait, ctx): + return {"ok": True} + + async def reindex_directory_marker(self, *, dir_uri, level, ctx): + return None + + async def delete_uri_level(self, *, uri, level, ctx): + deleted_calls.append((uri, int(level))) + return 0 + + monkeypatch.setattr(reindex_mod, "get_reindex_executor", lambda: _SpyExecutor()) + + # Add a brand-new file that does not exist at c1, then commit v2. + new_uri = "viking://resources/restore_delete_fixture.txt" + await service.viking_fs.write_file(new_uri, b"only-exists-at-v2\n", ctx=ctx) + await client.post("/api/v1/snapshot/commit", json={"message": "v2 add file"}) + + # Restore back to c1: the new file must be deleted, and its vectors purged. + restore_resp = await client.post( + "/api/v1/snapshot/restore", + json={"project_dir": "viking://resources", "source_commit": c1_oid}, + ) + assert restore_resp.status_code == 200, restore_resp.text + assert restore_resp.json()["result"]["result"] == "applied" + + import asyncio + for _ in range(100): + if (new_uri, 2) in deleted_calls: + break + await asyncio.sleep(0.02) + + assert (new_uri, 2) in deleted_calls, ( + f"deleted file's DETAIL vector must be purged; got {deleted_calls!r}" + ) + + + +async def test_restore_concurrent_commit_returns_409(client_with_resource_and_blob, service, monkeypatch): + """Force the underlying git CAS swap to raise GitConcurrentCommitError + and verify the HTTP layer maps it to 409 with CONFLICT code. 
+ """ + from openviking.pyagfs.exceptions import GitConcurrentCommitError + + client, c1_oid, _blob_uri, _v1 = client_with_resource_and_blob + + async def _raise_conflict(self, *, message=None, **kwargs): + raise GitConcurrentCommitError("git ref refs/heads/main changed under us") + + from openviking.storage.viking_fs import VikingFS + + monkeypatch.setattr(VikingFS, "restore", _raise_conflict) + + resp = await client.post( + "/api/v1/snapshot/restore", + json={"project_dir": "viking://resources", "source_commit": c1_oid}, + ) + assert resp.status_code == 409, resp.text + body = resp.json() + assert body["status"] == "error" + assert body["error"]["code"] == "CONFLICT" + + +async def test_restore_partial_writeback_surfaces_structured_error( + client_with_resource_and_blob, service, monkeypatch +): + """When VikingFS.restore raises GitRestoreWritebackPartialError, the + router must turn it into an OpenVikingError(code='RESTORE_WRITEBACK_PARTIAL') + whose ``details`` carry the full payload (new_commit_oid, failed paths, + task_id) — NOT a generic InternalError. 
+ """ + from openviking.pyagfs.exceptions import GitRestoreWritebackPartialError + + client, c1_oid, _blob_uri, _v1 = client_with_resource_and_blob + + async def _raise_partial(self, *, message=None, **kwargs): + payload = { + "new_commit_oid": "f" * 40, + "source_commit": c1_oid, + "parent_commit": "e" * 40, + "written": 1, + "deleted": 0, + "unchanged": 0, + "written_paths": ["resources/ok.md"], + "deleted_paths": [], + "failed_writes": [("resources/bad.md", "vfs write boom")], + "failed_deletes": [], + } + exc = GitRestoreWritebackPartialError( + "restore writeback partial: 1 write(s) and 0 delete(s) failed", + payload=payload, + ) + exc.task_id = "task-fixture-xyz" + raise exc + + from openviking.storage.viking_fs import VikingFS + + monkeypatch.setattr(VikingFS, "restore", _raise_partial) + + resp = await client.post( + "/api/v1/snapshot/restore", + json={"project_dir": "viking://resources", "source_commit": c1_oid}, + ) + assert resp.status_code == 500, resp.text + body = resp.json() + assert body["status"] == "error" + assert body["error"]["code"] == "RESTORE_WRITEBACK_PARTIAL" + details = body["error"]["details"] + assert details["new_commit_oid"] == "f" * 40 + assert details["task_id"] == "task-fixture-xyz" + assert details["written_paths"] == ["resources/ok.md"] + # failed_writes round-trips as list-of-list under JSON since to_dict + # serialises the tuples that way. + assert details["failed_writes"] == [["resources/bad.md", "vfs write boom"]] + + +async def test_restore_rejects_unknown_field_per_pydantic_forbid(client_with_resource): + """Pydantic ConfigDict(extra='forbid') on RestoreRequest must reject typo'd fields. + + The OpenViking error mapper rewrites FastAPI's default 422 into HTTP 400 + with code INVALID_ARGUMENT — that's the contract callers see. 
+ """ + client, _ = client_with_resource + commit = (await client.post("/api/v1/snapshot/commit", json={"message": "v"})).json()["result"] + resp = await client.post( + "/api/v1/snapshot/restore", + json={ + "project_dir": "viking://resources", + "source_commit": commit["commit_oid"], + "dryRun": True, # typo: should be dry_run + }, + ) + assert resp.status_code == 400, resp.text + body = resp.json() + assert body["status"] == "error" + assert body["error"]["code"] == "INVALID_ARGUMENT" + # The offending field name must surface in the error so the user can fix it. + assert "dryRun" in body["error"]["message"] diff --git a/tests/server/test_error_mapping.py b/tests/server/test_error_mapping.py index 3e1c78c63b..7129cf60e4 100644 --- a/tests/server/test_error_mapping.py +++ b/tests/server/test_error_mapping.py @@ -3,8 +3,14 @@ """Focused tests for HTTP server exception-to-error mapping.""" -from openviking.pyagfs.exceptions import AGFSClientError, AGFSHTTPError, AGFSIsADirectoryError +from openviking.pyagfs.exceptions import ( + AGFSClientError, + AGFSHTTPError, + AGFSIsADirectoryError, + GitConcurrentCommitError, +) from openviking.server.error_mapping import map_exception +from openviking.server.models import ERROR_CODE_TO_HTTP_STATUS from openviking.storage.errors import LockAcquisitionError, ResourceBusyError from openviking_cli.exceptions import ( FailedPreconditionError, @@ -198,3 +204,11 @@ def test_lock_acquisition_maps_to_structured_conflict(): "conflict_type": "path_busy", "retryable": True, } + + +def test_git_concurrent_commit_maps_to_conflict(): + err = GitConcurrentCommitError("ref moved") + mapped = map_exception(err) + assert mapped is not None + assert mapped.code == "CONFLICT" + assert ERROR_CODE_TO_HTTP_STATUS.get(mapped.code) == 409 diff --git a/tests/storage/test_viking_fs_git.py b/tests/storage/test_viking_fs_git.py new file mode 100644 index 0000000000..81ed5674ac --- /dev/null +++ b/tests/storage/test_viking_fs_git.py @@ -0,0 +1,17 @@ +# 
tests/storage/test_viking_fs_git.py +import pytest + +pytestmark = pytest.mark.asyncio + + +@pytest.mark.skip(reason="needs git-enabled VikingFS fixture") +async def test_show_blob_raw_returns_envelope(viking_fs_with_two_commits): + """show_blob_raw must return the full {oid, size, bytes} dict, not strip it.""" + vfs, _account, commit_oid, sample_path, sample_bytes = viking_fs_with_two_commits + + raw = await vfs.show_blob_raw(commit_oid, path=sample_path) + + assert isinstance(raw, dict) + assert raw["bytes"] == sample_bytes + assert raw["size"] == len(sample_bytes) + assert isinstance(raw["oid"], str) and len(raw["oid"]) == 40 diff --git a/tests/unit/test_create_agfs_client_git.py b/tests/unit/test_create_agfs_client_git.py new file mode 100644 index 0000000000..c13af0cf13 --- /dev/null +++ b/tests/unit/test_create_agfs_client_git.py @@ -0,0 +1,118 @@ +"""Tests for git wiring in create_agfs_client. + +Verifies that when GitConfig.enabled is True, create_agfs_client builds an +in-memory git config dict and injects it into the binding ``config`` under the +``git`` key (no file is written to disk). When git is disabled (or git_config is +None), no ``git`` section is added to the binding config. 
+""" +from types import SimpleNamespace + +import pytest + +from openviking_cli.utils.config import GitConfig, GitLocalConfig +from openviking.utils.agfs_utils import RagfsBindingConfig, create_agfs_client + + +class _FakeAgfsConfig: + """Minimal stand-in for StorageConfig.agfs — only what mount + binding need.""" + + def __init__(self, path): + self.path = str(path) + self.backend = "local" + self.s3 = None + # cache section consumed by RagfsBindingConfig.to_binding_dict() + self.cache = SimpleNamespace(model_dump=lambda **kwargs: {}) + # queuefs default + self.queuefs = SimpleNamespace( + backend="sqlite", recover_stale_sec=0, busy_timeout_ms=5000, db_path=None + ) + + +@pytest.fixture +def agfs_config(tmp_path): + return _FakeAgfsConfig(tmp_path / "data") + + +@pytest.fixture +def fake_binding(monkeypatch): + """Stub out RAGFSBindingClient to capture constructor kwargs.""" + instances = [] + + class _FakeClient: + def __init__(self, *args, **kwargs): + self.args = args + self.kwargs = kwargs + instances.append(self) + + def mount(self, *a, **k): + pass + + def unmount(self, *a, **k): + pass + + from openviking import pyagfs as pyagfs_mod + monkeypatch.setattr( + pyagfs_mod, "get_binding_client", lambda: (_FakeClient, None) + ) + return instances + + +def test_git_disabled_omits_git_section(agfs_config, fake_binding): + """git_config=None → binding config has no 'git' section.""" + create_agfs_client(RagfsBindingConfig(agfs=agfs_config)) + assert len(fake_binding) == 1 + assert "git" not in fake_binding[0].kwargs.get("config", {}) + assert "git_config_path" not in fake_binding[0].kwargs + + +def test_git_disabled_explicit_omits_git_section(agfs_config, fake_binding): + """An explicitly-disabled GitConfig is equivalent to None.""" + cfg = GitConfig(enabled=False) + create_agfs_client(RagfsBindingConfig(agfs=agfs_config), git_config=cfg) + assert "git" not in fake_binding[0].kwargs.get("config", {}) + + +def 
test_git_enabled_injects_git_dict_into_config(agfs_config, fake_binding, tmp_path): + """enabled=True → injects a 'git' dict into the binding config, no file written.""" + cfg = GitConfig( + enabled=True, + backend="local", + default_branch="main", + author_name="viking-bot", + author_email="bot@viking.local", + local=GitLocalConfig(base_dir=str(tmp_path / "git")), + ) + create_agfs_client(RagfsBindingConfig(agfs=agfs_config), git_config=cfg) + + kwargs = fake_binding[0].kwargs + assert "git_config_path" not in kwargs + git = kwargs["config"]["git"] + assert git["enabled"] is True + assert git["backend"] == "local" + assert git["author_name"] == "viking-bot" + assert git["local"]["base_dir"] == str(tmp_path / "git") + + +def test_git_enabled_with_empty_base_dir_defaults_to_storage_git(agfs_config, fake_binding): + """When local.base_dir is empty, the git dict should fill it with {storage_path}/.ovgit.""" + from pathlib import Path + + cfg = GitConfig(enabled=True, local=GitLocalConfig(base_dir="")) + create_agfs_client(RagfsBindingConfig(agfs=agfs_config), git_config=cfg) + git = fake_binding[0].kwargs["config"]["git"] + expected = str(Path(agfs_config.path).resolve() / ".ovgit") + assert git["local"]["base_dir"] == expected + + +def test_git_enabled_preserves_special_chars_in_strings(agfs_config, fake_binding, tmp_path): + """Strings with backslashes / quotes are preserved verbatim in the dict.""" + cfg = GitConfig( + enabled=True, + author_name='He said "hi"', + author_email='a\\b@x.com', + local=GitLocalConfig(base_dir=str(tmp_path / "git")), + ) + create_agfs_client(RagfsBindingConfig(agfs=agfs_config), git_config=cfg) + git = fake_binding[0].kwargs["config"]["git"] + assert git["author_name"] == 'He said "hi"' + assert git["author_email"] == 'a\\b@x.com' diff --git a/tests/unit/test_fs_service_git_forwarders.py b/tests/unit/test_fs_service_git_forwarders.py new file mode 100644 index 0000000000..06afc20239 --- /dev/null +++ 
b/tests/unit/test_fs_service_git_forwarders.py @@ -0,0 +1,144 @@ +"""Tests for FSService git forwarder methods. + +These tests verify FSService.{commit, restore, show, log} pass the right +args to VikingFS. They don't exercise real git — that's covered by +tests/agfs/test_viking_fs_git.py. +""" +from unittest.mock import AsyncMock, MagicMock + +import pytest + +from openviking.server.identity import RequestContext, Role +from openviking.service.fs_service import FSService +from openviking_cli.exceptions import NotInitializedError +from openviking_cli.session.user_id import UserIdentifier + + +def _ctx(): + return RequestContext( + user=UserIdentifier(account_id="acc", user_id="user"), + role=Role.ROOT, + ) + + +@pytest.fixture +def viking_fs_mock(): + m = MagicMock() + m.commit = AsyncMock(return_value={"result": "created", "commit_oid": "a" * 40, "changed": 1}) + m.restore = AsyncMock(return_value={"result": "applied", "commit_oid": "b" * 40}) + m.show = AsyncMock(return_value={"oid": "c" * 40, "message": "m", "parents": []}) + m.log = AsyncMock(return_value=[{"oid": "c" * 40, "message": "m"}]) + return m + + +@pytest.fixture +def svc(viking_fs_mock): + s = FSService(viking_fs=viking_fs_mock) + return s + + +@pytest.mark.asyncio +async def test_commit_forwards_all_kwargs(svc, viking_fs_mock): + ctx = _ctx() + out = await svc.commit( + message="snapshot", + ctx=ctx, + paths=["viking://resources/a.md"], + branch="main", + author_name="me", + author_email="me@x", + ) + viking_fs_mock.commit.assert_awaited_once_with( + message="snapshot", + paths=["viking://resources/a.md"], + branch="main", + author_name="me", + author_email="me@x", + ctx=ctx, + ) + assert out["commit_oid"] == "a" * 40 + + +@pytest.mark.asyncio +async def test_commit_defaults_paths_to_none(svc, viking_fs_mock): + ctx = _ctx() + await svc.commit(message="m", ctx=ctx) + kwargs = viking_fs_mock.commit.await_args.kwargs + assert kwargs["paths"] is None + assert kwargs["branch"] == "main" + assert 
kwargs["author_name"] is None + assert kwargs["author_email"] is None + + +@pytest.mark.asyncio +async def test_restore_forwards_all_kwargs(svc, viking_fs_mock): + ctx = _ctx() + out = await svc.restore( + project_dir="viking://resources/proj", + source_commit="d" * 40, + ctx=ctx, + branch="main", + dry_run=True, + message="rolling back", + author_name="me", + author_email="me@x", + ) + viking_fs_mock.restore.assert_awaited_once_with( + project_dir="viking://resources/proj", + source_commit="d" * 40, + branch="main", + dry_run=True, + message="rolling back", + author_name="me", + author_email="me@x", + ctx=ctx, + ) + assert out["result"] == "applied" + + +@pytest.mark.asyncio +async def test_show_metadata_without_path(svc, viking_fs_mock): + ctx = _ctx() + out = await svc.show("main", ctx=ctx) + viking_fs_mock.show.assert_awaited_once_with("main", path=None, ctx=ctx) + assert out["oid"] == "c" * 40 + + +@pytest.mark.asyncio +async def test_show_with_path_validated(svc, viking_fs_mock): + ctx = _ctx() + viking_fs_mock.show = AsyncMock(return_value=b"hello") + out = await svc.show("main", ctx=ctx, path="viking://resources/a.md") + viking_fs_mock.show.assert_awaited_once_with( + "main", path="viking://resources/a.md", ctx=ctx + ) + assert out == b"hello" + + +@pytest.mark.asyncio +async def test_log_defaults(svc, viking_fs_mock): + ctx = _ctx() + out = await svc.log(ctx=ctx) + viking_fs_mock.log.assert_awaited_once_with(branch="main", limit=20, ctx=ctx) + assert len(out) == 1 + + +@pytest.mark.asyncio +async def test_log_with_overrides(svc, viking_fs_mock): + ctx = _ctx() + await svc.log(ctx=ctx, branch="dev", limit=5) + viking_fs_mock.log.assert_awaited_once_with(branch="dev", limit=5, ctx=ctx) + + +@pytest.mark.asyncio +async def test_methods_raise_when_not_initialized(): + svc = FSService() # no viking_fs set + ctx = _ctx() + with pytest.raises(NotInitializedError): + await svc.commit(message="m", ctx=ctx) + with pytest.raises(NotInitializedError): + await 
svc.restore(project_dir="viking://x", source_commit="a", ctx=ctx) + with pytest.raises(NotInitializedError): + await svc.show("main", ctx=ctx) + with pytest.raises(NotInitializedError): + await svc.log(ctx=ctx) diff --git a/tests/unit/test_git_config.py b/tests/unit/test_git_config.py new file mode 100644 index 0000000000..0a74e9e7c1 --- /dev/null +++ b/tests/unit/test_git_config.py @@ -0,0 +1,277 @@ +"""Unit tests for GitConfig pydantic model.""" +import json +from pathlib import Path + +import pytest +from pydantic import ValidationError + +from openviking_cli.utils.config import ( + GitConfig, + GitLocalConfig, + GitS3Config, + OpenVikingConfig, +) +from openviking.utils.agfs_utils import _build_git_config_dict + + +class TestGitConfigDefaults: + def test_enabled_by_default(self): + cfg = GitConfig() + assert cfg.enabled is True + assert cfg.backend == "local" + assert cfg.default_branch == "main" + assert cfg.author_name == "viking-bot" + assert cfg.author_email == "bot@viking.local" + + def test_local_subconfig_defaults(self): + cfg = GitConfig() + assert isinstance(cfg.local, GitLocalConfig) + assert cfg.local.base_dir == "" + + +class TestGitConfigValidation: + def test_invalid_backend_rejected(self): + with pytest.raises(ValidationError): + GitConfig(backend="ftp") + + def test_unknown_field_rejected(self): + with pytest.raises(ValidationError): + GitConfig(unknown_thing=True) + + def test_enabled_with_local_backend_ok(self): + cfg = GitConfig(enabled=True, backend="local", local=GitLocalConfig(base_dir="/tmp/git")) + assert cfg.enabled is True + assert cfg.local.base_dir == "/tmp/git" + + +class TestGitConfigOnOpenVikingConfig: + def test_open_viking_config_has_git_field_with_default(self): + cfg = OpenVikingConfig(storage={"workspace": "/tmp/x"}) + assert isinstance(cfg.git, GitConfig) + assert cfg.git.enabled is True + assert cfg.git.backend == "local" + assert isinstance(cfg.git.local, GitLocalConfig) + + def 
test_open_viking_config_accepts_git_section(self): + cfg = OpenVikingConfig( + storage={"workspace": "/tmp/x"}, + git={ + "enabled": True, + "backend": "local", + "local": {"base_dir": "/tmp/g"}, + }, + ) + assert cfg.git.enabled is True + assert cfg.git.local.base_dir == "/tmp/g" + + def test_git_config_round_trip_via_config_file(self, tmp_path): + """Round-trip the new `git` section through the runtime JSON file loader.""" + from openviking_cli.utils.config.open_viking_config import ( + OpenVikingConfigSingleton, + ) + + cfg_dict = { + "storage": {"workspace": str(tmp_path / "data")}, + "git": { + "enabled": True, + "backend": "local", + "default_branch": "main", + "author_name": "viking-bot", + "author_email": "bot@viking.local", + "local": {"base_dir": str(tmp_path / "git")}, + }, + } + cfg_path = tmp_path / "ov.conf" + cfg_path.write_text(json.dumps(cfg_dict)) + + cfg = OpenVikingConfigSingleton._load_from_file(str(cfg_path)) + + assert cfg.git.enabled is True + assert cfg.git.local.base_dir == str(tmp_path / "git") + + +class TestGitInheritsFromAgfs: + """git section inherits unset defaults from storage.agfs.""" + + FULL_S3 = { + "bucket": "B", + "region": "R", + "endpoint": "E", + "access_key": "AK", + "secret_key": "SK", + } + + def test_no_git_section_local_agfs(self): + cfg = OpenVikingConfig(storage={"workspace": "/tmp/x"}) + assert cfg.git.enabled is True + assert cfg.git.backend == "local" + assert cfg.git.s3 is None + + def test_backend_inherits_s3_from_agfs(self): + cfg = OpenVikingConfig( + storage={"workspace": "/tmp/x", "agfs": {"backend": "s3", "s3": self.FULL_S3}} + ) + assert cfg.git.backend == "s3" + assert cfg.git.s3.bucket == "B" + assert cfg.git.s3.region == "R" + assert cfg.git.s3.endpoint == "E" + assert cfg.git.s3.access_key == "AK" + assert cfg.git.s3.secret_key == "SK" + + def test_memory_agfs_maps_to_local(self): + cfg = OpenVikingConfig( + storage={"workspace": "/tmp/x", "agfs": {"backend": "memory"}} + ) + assert cfg.git.backend == 
"local" + + def test_explicit_git_backend_overrides_inheritance(self): + cfg = OpenVikingConfig( + storage={"workspace": "/tmp/x", "agfs": {"backend": "s3", "s3": self.FULL_S3}}, + git={"backend": "local"}, + ) + assert cfg.git.backend == "local" + assert cfg.git.s3 is None + + def test_explicit_git_s3_field_overrides_only_that_field(self): + cfg = OpenVikingConfig( + storage={"workspace": "/tmp/x", "agfs": {"backend": "s3", "s3": self.FULL_S3}}, + git={"s3": {"bucket": "GB"}}, + ) + assert cfg.git.s3.bucket == "GB" + # remaining fields inherited from agfs.s3 + assert cfg.git.s3.region == "R" + assert cfg.git.s3.endpoint == "E" + assert cfg.git.s3.access_key == "AK" + assert cfg.git.s3.secret_key == "SK" + + def test_disabled_git_still_inherits_backend(self): + cfg = OpenVikingConfig( + storage={"workspace": "/tmp/x", "agfs": {"backend": "s3", "s3": self.FULL_S3}}, + git={"enabled": False}, + ) + assert cfg.git.enabled is False + assert cfg.git.backend == "s3" + + +class TestGitS3ConfigParsing: + """A5.1 — parsing of the s3 backend config.""" + + def test_s3_config_defaults(self): + s3 = GitS3Config() + assert s3.bucket == "" + assert s3.region == "us-east-1" + assert s3.prefix == ".ovgit" + assert s3.endpoint == "" + assert s3.access_key is None + assert s3.secret_key is None + assert s3.cas_mode == "native" + assert s3.use_path_style is True + + def test_backend_s3_with_s3_section_ok(self): + cfg = GitConfig( + enabled=True, + backend="s3", + s3=GitS3Config(bucket="b", region="cn-beijing"), + ) + assert cfg.backend == "s3" + assert cfg.s3.bucket == "b" + assert cfg.s3.region == "cn-beijing" + + def test_enabled_backend_s3_without_s3_section_rejected(self): + with pytest.raises(ValidationError): + GitConfig(enabled=True, backend="s3", s3=None) + + def test_enabled_backend_s3_missing_bucket_rejected(self): + with pytest.raises(ValidationError): + GitConfig(enabled=True, backend="s3", s3=GitS3Config(region="cn-beijing")) + + def 
test_enabled_backend_s3_missing_region_rejected(self): + # region has a non-empty default, so explicitly blank it to trigger the check. + with pytest.raises(ValidationError): + GitConfig( + enabled=True, + backend="s3", + s3=GitS3Config(bucket="b", region=""), + ) + + def test_disabled_backend_s3_skips_validation(self): + # When git is disabled the s3 section is not required. + cfg = GitConfig(enabled=False, backend="s3", s3=None) + assert cfg.s3 is None + + def test_s3_unknown_field_rejected(self): + with pytest.raises(ValidationError): + GitS3Config(bucket="b", region="r", unknown="x") + + def test_invalid_cas_mode_rejected(self): + with pytest.raises(ValidationError): + GitS3Config(cas_mode="invalid") + + def test_redis_lock_cas_mode_rejected(self): + with pytest.raises(ValidationError): + GitS3Config(cas_mode="redis_lock") + + +class TestBuildGitConfigDictS3: + """A5.2 — _build_git_config_dict output for the s3 backend.""" + + def _build_s3(self, **s3_kwargs): + s3_defaults = {"bucket": "my-bucket", "region": "cn-beijing"} + s3_defaults.update(s3_kwargs) + cfg = GitConfig(enabled=True, backend="s3", s3=GitS3Config(**s3_defaults)) + return _build_git_config_dict(cfg, Path("/tmp/storage")) + + def test_builds_header_and_s3_section(self): + out = self._build_s3() + assert out["backend"] == "s3" + assert "s3" in out + assert "local" not in out + + def test_builds_required_s3_keys(self): + out = self._build_s3(prefix="gitobj", endpoint="https://tos.example.com") + s3 = out["s3"] + assert s3["bucket"] == "my-bucket" + assert s3["region"] == "cn-beijing" + assert s3["prefix"] == "gitobj" + assert s3["endpoint"] == "https://tos.example.com" + assert s3["cas_mode"] == "native" + + def test_use_path_style_is_bool(self): + out_true = self._build_s3(use_path_style=True) + assert out_true["s3"]["use_path_style"] is True + out_false = self._build_s3(use_path_style=False) + assert out_false["s3"]["use_path_style"] is False + + def test_credentials_emitted_when_present(self): 
+ out = self._build_s3(access_key="AK", secret_key="SK") + assert out["s3"]["access_key"] == "AK" + assert out["s3"]["secret_key"] == "SK" + + def test_credentials_omitted_when_empty(self): + out = self._build_s3() + assert "access_key" not in out["s3"] + assert "secret_key" not in out["s3"] + + def test_missing_s3_section_raises(self): + # Build a disabled config (skips model validation) then force s3=None to + # exercise the builder's own guard. + cfg = GitConfig(enabled=False, backend="s3", s3=None) + with pytest.raises(ValueError): + _build_git_config_dict(cfg, Path("/tmp/storage")) + + +class TestBuildGitConfigDictLocal: + """A5.3 — regression: local backend still builds only a 'local' section.""" + + def test_local_builds_local_section_only(self): + cfg = GitConfig(enabled=True, backend="local", local=GitLocalConfig(base_dir="/tmp/g")) + out = _build_git_config_dict(cfg, Path("/tmp/storage")) + assert out["backend"] == "local" + assert "local" in out + assert "s3" not in out + assert out["local"]["base_dir"] == "/tmp/g" + + def test_local_base_dir_defaults_to_storage_git(self): + cfg = GitConfig(enabled=True, backend="local") + out = _build_git_config_dict(cfg, Path("/tmp/storage")) + assert out["local"]["base_dir"] == str(Path("/tmp/storage") / ".ovgit") diff --git a/tests/unit/test_local_client_git.py b/tests/unit/test_local_client_git.py new file mode 100644 index 0000000000..d4105f5601 --- /dev/null +++ b/tests/unit/test_local_client_git.py @@ -0,0 +1,133 @@ +"""Tests for LocalClient git version control methods. + +Verifies that LocalClient.{git_commit, git_restore, git_show, git_log} +forward the right kwargs to FSService.{commit, restore, show, log}. 
+""" +from unittest.mock import AsyncMock, MagicMock + +import pytest + +from openviking.client.local import LocalClient +from openviking.server.identity import RequestContext, Role +from openviking_cli.session.user_id import UserIdentifier + + +@pytest.fixture +def mock_fs(): + m = MagicMock() + m.commit = AsyncMock(return_value={"result": "created", "commit_oid": "a" * 40}) + m.restore = AsyncMock(return_value={"result": "applied", "commit_oid": "b" * 40}) + m.show = AsyncMock(return_value={"oid": "c" * 40, "message": "m", "parents": []}) + m.log = AsyncMock(return_value=[{"oid": "c" * 40, "message": "m"}]) + return m + + +@pytest.fixture +def local_client(mock_fs): + """Build a LocalClient with a mocked FSService, bypassing __init__.""" + ctx = RequestContext( + user=UserIdentifier(account_id="acc", user_id="u"), + role=Role.ROOT, + ) + client = object.__new__(LocalClient) + client._service = MagicMock() + client._service.fs = mock_fs + client._ctx = ctx + return client + + +@pytest.mark.asyncio +async def test_commit_forwards_kwargs(local_client, mock_fs): + out = await local_client.git_commit( + message="snapshot", + paths=["viking://resources/a.md"], + branch="main", + author_name="me", + author_email="me@x", + ) + mock_fs.commit.assert_awaited_once_with( + message="snapshot", + paths=["viking://resources/a.md"], + branch="main", + author_name="me", + author_email="me@x", + ctx=local_client._ctx, + ) + assert out["commit_oid"] == "a" * 40 + + +@pytest.mark.asyncio +async def test_commit_defaults(local_client, mock_fs): + await local_client.git_commit(message="m") + kwargs = mock_fs.commit.await_args.kwargs + assert kwargs["paths"] is None + assert kwargs["branch"] == "main" + assert kwargs["author_name"] is None + assert kwargs["author_email"] is None + + +@pytest.mark.asyncio +async def test_restore_forwards_kwargs(local_client, mock_fs): + out = await local_client.git_restore( + project_dir="viking://resources/proj", + source_commit="d" * 40, + 
branch="main", + dry_run=True, + message="rollback", + author_name="me", + author_email="me@x", + ) + mock_fs.restore.assert_awaited_once_with( + project_dir="viking://resources/proj", + source_commit="d" * 40, + branch="main", + dry_run=True, + message="rollback", + author_name="me", + author_email="me@x", + ctx=local_client._ctx, + ) + assert out["result"] == "applied" + + +@pytest.mark.asyncio +async def test_restore_defaults_project_dir_none(local_client, mock_fs): + await local_client.git_restore(source_commit="d" * 40) + mock_fs.restore.assert_awaited_once_with( + project_dir=None, + source_commit="d" * 40, + branch="main", + dry_run=False, + message=None, + author_name=None, + author_email=None, + ctx=local_client._ctx, + ) + + +@pytest.mark.asyncio +async def test_show_metadata(local_client, mock_fs): + out = await local_client.git_show("main") + mock_fs.show.assert_awaited_once_with("main", path=None, ctx=local_client._ctx) + assert out["oid"] == "c" * 40 + + +@pytest.mark.asyncio +async def test_show_with_path(local_client, mock_fs): + mock_fs.show = AsyncMock(return_value=b"blob data") + out = await local_client.git_show("main", path="viking://resources/a.md") + mock_fs.show.assert_awaited_once_with("main", path="viking://resources/a.md", ctx=local_client._ctx) + assert out == b"blob data" + + +@pytest.mark.asyncio +async def test_log_defaults(local_client, mock_fs): + out = await local_client.git_log() + mock_fs.log.assert_awaited_once_with(branch="main", limit=20, ctx=local_client._ctx) + assert len(out) == 1 + + +@pytest.mark.asyncio +async def test_log_overrides(local_client, mock_fs): + await local_client.git_log(branch="dev", limit=5) + mock_fs.log.assert_awaited_once_with(branch="dev", limit=5, ctx=local_client._ctx) diff --git a/tests/unit/test_snapshot_namespace.py b/tests/unit/test_snapshot_namespace.py new file mode 100644 index 0000000000..e24c648e08 --- /dev/null +++ b/tests/unit/test_snapshot_namespace.py @@ -0,0 +1,176 @@ +"""Unit tests 
for AsyncSnapshotNamespace and SyncSnapshotNamespace. + +These tests verify the namespace classes forward to the underlying +client's git_* methods correctly. They don't exercise real git. +""" +from unittest.mock import AsyncMock, MagicMock + +import pytest + +from openviking.snapshot_namespace import AsyncSnapshotNamespace, SyncSnapshotNamespace + + +@pytest.fixture +def fake_async_client(): + """A fake AsyncOpenViking with a mocked _client (BaseClient).""" + parent = MagicMock() + parent._ensure_initialized = AsyncMock(return_value=None) + parent._client = MagicMock() + parent._client.git_commit = AsyncMock(return_value={"result": "created", "commit_oid": "a" * 40}) + parent._client.git_restore = AsyncMock(return_value={"result": "applied", "commit_oid": "b" * 40}) + parent._client.git_show = AsyncMock(return_value={"oid": "c" * 40, "parents": []}) + parent._client.git_log = AsyncMock(return_value=[{"oid": "c" * 40}]) + return parent + + +@pytest.fixture +def async_ns(fake_async_client): + return AsyncSnapshotNamespace(fake_async_client) + + +# -------- AsyncSnapshotNamespace -------- + + +@pytest.mark.asyncio +async def test_async_commit_forwards(async_ns, fake_async_client): + out = await async_ns.commit(message="m", paths=["viking://x/a"], branch="dev", + author_name="me", author_email="me@x") + fake_async_client._ensure_initialized.assert_awaited() + fake_async_client._client.git_commit.assert_awaited_once_with( + message="m", paths=["viking://x/a"], branch="dev", + author_name="me", author_email="me@x", + ) + assert out["commit_oid"] == "a" * 40 + + +@pytest.mark.asyncio +async def test_async_commit_defaults(async_ns, fake_async_client): + await async_ns.commit(message="m") + kwargs = fake_async_client._client.git_commit.await_args.kwargs + assert kwargs == { + "message": "m", "paths": None, "branch": "main", + "author_name": None, "author_email": None, + } + + +@pytest.mark.asyncio +async def test_async_restore_forwards(async_ns, fake_async_client): + out 
= await async_ns.restore( + project_dir="viking://resources/proj", + source_commit="d" * 40, + dry_run=True, + message="rollback", + ) + fake_async_client._client.git_restore.assert_awaited_once_with( + project_dir="viking://resources/proj", + source_commit="d" * 40, + branch="main", + dry_run=True, + message="rollback", + author_name=None, + author_email=None, + ) + assert out["result"] == "applied" + + +@pytest.mark.asyncio +async def test_async_restore_defaults_project_dir_none(async_ns, fake_async_client): + await async_ns.restore(source_commit="d" * 40) + fake_async_client._client.git_restore.assert_awaited_once_with( + project_dir=None, + source_commit="d" * 40, + branch="main", + dry_run=False, + message=None, + author_name=None, + author_email=None, + ) + + +@pytest.mark.asyncio +async def test_async_show_no_path(async_ns, fake_async_client): + out = await async_ns.show("main") + fake_async_client._client.git_show.assert_awaited_once_with("main", path=None) + assert out["oid"] == "c" * 40 + + +@pytest.mark.asyncio +async def test_async_show_with_path(async_ns, fake_async_client): + fake_async_client._client.git_show = AsyncMock(return_value=b"data") + out = await async_ns.show("main", path="viking://x/a") + fake_async_client._client.git_show.assert_awaited_once_with("main", path="viking://x/a") + assert out == b"data" + + +@pytest.mark.asyncio +async def test_async_log_defaults(async_ns, fake_async_client): + out = await async_ns.log() + fake_async_client._client.git_log.assert_awaited_once_with(branch="main", limit=20) + assert len(out) == 1 + + +@pytest.mark.asyncio +async def test_async_log_overrides(async_ns, fake_async_client): + await async_ns.log(branch="dev", limit=5) + fake_async_client._client.git_log.assert_awaited_once_with(branch="dev", limit=5) + + +@pytest.mark.asyncio +async def test_async_ensures_initialized_before_every_call(async_ns, fake_async_client): + await async_ns.commit(message="m") + await async_ns.show("main") + await 
async_ns.log() + assert fake_async_client._ensure_initialized.await_count == 3 + + +# -------- SyncSnapshotNamespace -------- + + +def test_sync_namespace_delegates_through_async(monkeypatch): + """SyncSnapshotNamespace.commit() runs the async equivalent via run_async.""" + # Build a fake SyncOpenViking exposing an async_client with a snapshot namespace. + sync_parent = MagicMock() + inner_async_ns = MagicMock() + inner_async_ns.commit = AsyncMock(return_value={"commit_oid": "z" * 40}) + inner_async_ns.restore = AsyncMock(return_value={"result": "applied"}) + inner_async_ns.show = AsyncMock(return_value=b"blob") + inner_async_ns.log = AsyncMock(return_value=[]) + sync_parent._async_client.snapshot = inner_async_ns + + sync_ns = SyncSnapshotNamespace(sync_parent) + + out = sync_ns.commit(message="m") + assert out["commit_oid"] == "z" * 40 + inner_async_ns.commit.assert_awaited_once_with( + message="m", paths=None, branch="main", + author_name=None, author_email=None, + ) + + sync_ns.show("main", path="viking://x/a") + inner_async_ns.show.assert_awaited_once_with("main", path="viking://x/a") + + sync_ns.log(branch="dev", limit=3) + inner_async_ns.log.assert_awaited_once_with(branch="dev", limit=3) + + +def test_async_client_snapshot_property_is_lazy_and_cached(): + """Accessing .snapshot twice returns the same instance and doesn't construct early.""" + from openviking.async_client import AsyncOpenViking + # Avoid real construction by faking the singleton. 
+ inst = object.__new__(AsyncOpenViking) + # Patch the lazy attribute machinery + assert not hasattr(inst, "_snapshot") + ns1 = inst.snapshot + assert isinstance(ns1, AsyncSnapshotNamespace) + ns2 = inst.snapshot + assert ns1 is ns2 + + +def test_sync_client_snapshot_property_is_lazy_and_cached(): + from openviking.sync_client import SyncOpenViking + inst = object.__new__(SyncOpenViking) + assert not hasattr(inst, "_snapshot") + ns1 = inst.snapshot + assert isinstance(ns1, SyncSnapshotNamespace) + ns2 = inst.snapshot + assert ns1 is ns2