diff --git a/CHANGELOG.md b/CHANGELOG.md index 796728d..3515684 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,12 +21,18 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Crates -- **nvisy-core** - Domain types, error types, and plugin trait system -- **nvisy-engine** - DAG compiler, executor, and connection routing -- **nvisy-codec** - File-format handlers with span-based content access -- **nvisy-object** - Cloud storage providers and streaming I/O -- **nvisy-pattern** - Detection patterns, dictionaries, and validators -- **nvisy-pipeline** - Detection, redaction, generation actions, and audit trails -- **nvisy-python** - PyO3 bridge for Python NER models +- **nvisy-asr:** ASR/speech-to-text backend trait and provider integration +- **nvisy-augment:** Content augmentation actions (OCR, transcription, synthetic data) +- **nvisy-cli:** CLI entry point for the nvisy API server +- **nvisy-codec:** File-format codecs — read, edit, and write documents +- **nvisy-core:** Domain types, traits, and errors +- **nvisy-engine:** DAG compiler and executor for pipeline graphs +- **nvisy-identify:** Entity ontology types and detection layers +- **nvisy-ocr:** OCR backend trait and provider integration +- **nvisy-ontology:** Domain data types, entity taxonomy, and spatial primitives +- **nvisy-pattern:** Built-in regex patterns and dictionaries for PII/PHI detection +- **nvisy-python:** PyO3 bridge for AI NER/OCR detection via embedded Python +- **nvisy-rig:** LLM/VLM-driven detection, redaction, and OCR backends +- **nvisy-server:** HTTP server exposing the Engine pipeline via REST endpoints [Unreleased]: https://github.com/nvisycom/runtime/commits/main diff --git a/Cargo.lock b/Cargo.lock index 1daed6e..e69a15f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -622,7 +622,6 @@ checksum = "fac4744fb15ae8337dc853fee7fb3f4e48c0fbaa23d0afe49c447b4fab126118" dependencies = [ "iana-time-zone", "num-traits", - "serde", "windows-link", ] @@ 
-1696,12 +1695,6 @@ version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" -[[package]] -name = "humantime" -version = "2.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "135b12329e5e3ce057a9f972339ea52bc954fe1e9358ef27f95e89716fbc5424" - [[package]] name = "hyper" version = "1.8.1" @@ -1735,7 +1728,6 @@ dependencies = [ "hyper", "hyper-util", "rustls", - "rustls-native-certs", "rustls-pki-types", "tokio", "tokio-rustls", @@ -2647,15 +2639,28 @@ dependencies = [ "libm", ] +[[package]] +name = "nvisy-asr" +version = "0.1.0" +dependencies = [ + "async-trait", + "nvisy-core", + "nvisy-ontology", + "nvisy-python", + "serde_json", +] + [[package]] name = "nvisy-augment" version = "0.1.0" dependencies = [ "async-trait", "bytes", + "nvisy-asr", "nvisy-codec", "nvisy-core", "nvisy-ontology", + "nvisy-paddle", "nvisy-python", "nvisy-rig", "serde", @@ -2667,6 +2672,7 @@ dependencies = [ name = "nvisy-cli" version = "0.1.0" dependencies = [ + "anyhow", "axum", "clap", "nvisy-core", @@ -2756,13 +2762,12 @@ version = "0.1.0" dependencies = [ "async-trait", "jiff", + "nvisy-asr", "nvisy-codec", "nvisy-core", "nvisy-ontology", "nvisy-pattern", - "nvisy-python", "nvisy-rig", - "regex", "schemars", "semver", "serde", @@ -2773,20 +2778,6 @@ dependencies = [ "uuid", ] -[[package]] -name = "nvisy-object" -version = "0.1.0" -dependencies = [ - "async-trait", - "bytes", - "futures", - "nvisy-core", - "object_store", - "serde", - "tokio", - "tracing", -] - [[package]] name = "nvisy-ontology" version = "0.1.0" @@ -2801,6 +2792,17 @@ dependencies = [ "uuid", ] +[[package]] +name = "nvisy-paddle" +version = "0.1.0" +dependencies = [ + "async-trait", + "nvisy-core", + "nvisy-ontology", + "nvisy-python", + "serde_json", +] + [[package]] name = "nvisy-pattern" version = "0.1.0" @@ -2835,11 +2837,20 @@ name = "nvisy-rig" version = "0.1.0" dependencies = [ 
"async-trait", - "nvisy-codec", + "base64", "nvisy-core", + "nvisy-ontology", + "reqwest-middleware", + "reqwest-retry", + "reqwest-tracing", "rig-core", + "schemars", "serde", "serde_json", + "thiserror 2.0.18", + "tokio", + "tracing", + "uuid", ] [[package]] @@ -2861,43 +2872,6 @@ dependencies = [ "uuid", ] -[[package]] -name = "object_store" -version = "0.13.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c2858065e55c148d294a9f3aae3b0fa9458edadb41a108397094566f4e3c0dfb" -dependencies = [ - "async-trait", - "base64", - "bytes", - "chrono", - "form_urlencoded", - "futures", - "http", - "http-body-util", - "httparse", - "humantime", - "hyper", - "itertools", - "md-5", - "parking_lot", - "percent-encoding", - "quick-xml 0.38.4", - "rand 0.9.2", - "reqwest 0.12.28", - "ring", - "rustls-pki-types", - "serde", - "serde_json", - "serde_urlencoded", - "thiserror 2.0.18", - "tokio", - "tracing", - "url", - "wasm-bindgen-futures", - "web-time", -] - [[package]] name = "once_cell" version = "1.21.3" @@ -3362,7 +3336,6 @@ checksum = "b66c2058c55a409d601666cffe35f04333cf1013010882cec174a7467cd4e21c" dependencies = [ "encoding_rs", "memchr", - "serde", ] [[package]] @@ -3664,12 +3637,13 @@ checksum = "a96887878f22d7bad8a3b6dc5b7440e0ada9a245242924394987b21cf2210a4c" [[package]] name = "reqwest" -version = "0.12.28" +version = "0.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eddd3ca559203180a307f12d114c268abf583f59b03cb906fd0b3ff8646c1147" +checksum = "ab3f43e3283ab1488b624b44b0e988d0acea0b3214e694730a055cb6b2efa801" dependencies = [ "base64", "bytes", + "encoding_rs", "futures-core", "futures-util", "h2", @@ -3681,15 +3655,16 @@ dependencies = [ "hyper-util", "js-sys", "log", + "mime", + "mime_guess", "percent-encoding", "pin-project-lite", "quinn", "rustls", - "rustls-native-certs", "rustls-pki-types", + "rustls-platform-verifier", "serde", "serde_json", - "serde_urlencoded", "sync_wrapper", "tokio", 
"tokio-rustls", @@ -3700,52 +3675,69 @@ dependencies = [ "url", "wasm-bindgen", "wasm-bindgen-futures", - "wasm-streams 0.4.2", + "wasm-streams", "web-sys", ] [[package]] -name = "reqwest" -version = "0.13.2" +name = "reqwest-middleware" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab3f43e3283ab1488b624b44b0e988d0acea0b3214e694730a055cb6b2efa801" +checksum = "199dda04a536b532d0cc04d7979e39b1c763ea749bf91507017069c00b96056f" dependencies = [ - "base64", - "bytes", - "encoding_rs", - "futures-core", - "futures-util", - "h2", + "anyhow", + "async-trait", "http", - "http-body", - "http-body-util", - "hyper", - "hyper-rustls", - "hyper-util", - "js-sys", - "log", - "mime", - "mime_guess", - "percent-encoding", - "pin-project-lite", - "quinn", - "rustls", - "rustls-pki-types", - "rustls-platform-verifier", + "reqwest", "serde", - "serde_json", - "sync_wrapper", - "tokio", - "tokio-rustls", - "tokio-util", - "tower", - "tower-http", + "thiserror 2.0.18", "tower-service", - "url", - "wasm-bindgen", - "wasm-bindgen-futures", - "wasm-streams 0.5.0", - "web-sys", +] + +[[package]] +name = "reqwest-retry" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fe2412db2af7d2268e7a5406be0431f37d9eb67ff390f35b395716f5f06c2eaa" +dependencies = [ + "anyhow", + "async-trait", + "futures", + "getrandom 0.2.17", + "http", + "hyper", + "reqwest", + "reqwest-middleware", + "retry-policies", + "thiserror 2.0.18", + "tokio", + "tracing", + "wasmtimer", +] + +[[package]] +name = "reqwest-tracing" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d5c1a1510677d43dce9e9c0c07fc5db8772c0e5a43e4f9cef75a11affa05a578" +dependencies = [ + "anyhow", + "async-trait", + "getrandom 0.2.17", + "http", + "matchit", + "reqwest", + "reqwest-middleware", + "tracing", +] + +[[package]] +name = "retry-policies" +version = "0.5.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "46a4bd6027df676bcb752d3724db0ea3c0c5fc1dd0376fec51ac7dcaf9cc69be" +dependencies = [ + "rand 0.9.2", ] [[package]] @@ -3775,7 +3767,8 @@ dependencies = [ "nanoid", "ordered-float", "pin-project-lite", - "reqwest 0.13.2", + "reqwest", + "reqwest-middleware", "rig-derive", "schemars", "serde", @@ -3875,7 +3868,6 @@ checksum = "c665f33d38cea657d9614f766881e4d510e0eda4239891eea56b4cadcf01801b" dependencies = [ "aws-lc-rs", "once_cell", - "ring", "rustls-pki-types", "rustls-webpki", "subtle", @@ -5077,19 +5069,6 @@ dependencies = [ "wasmparser", ] -[[package]] -name = "wasm-streams" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "15053d8d85c7eccdbefef60f06769760a563c7f0a9d6902a13d35c7800b0ad65" -dependencies = [ - "futures-util", - "js-sys", - "wasm-bindgen", - "wasm-bindgen-futures", - "web-sys", -] - [[package]] name = "wasm-streams" version = "0.5.0" @@ -5115,6 +5094,20 @@ dependencies = [ "semver", ] +[[package]] +name = "wasmtimer" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1c598d6b99ea013e35844697fc4670d08339d5cda15588f193c6beedd12f644b" +dependencies = [ + "futures", + "js-sys", + "parking_lot", + "pin-utils", + "slab", + "wasm-bindgen", +] + [[package]] name = "web-sys" version = "0.3.85" diff --git a/Cargo.toml b/Cargo.toml index 88b50e4..03eb266 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,18 +3,19 @@ [workspace] resolver = "2" members = [ + "./crates/nvisy-asr", + "./crates/nvisy-augment", + "./crates/nvisy-cli", + "./crates/nvisy-codec", "./crates/nvisy-core", "./crates/nvisy-engine", - "./crates/nvisy-codec", - "./crates/nvisy-object", - "./crates/nvisy-pattern", "./crates/nvisy-identify", - "./crates/nvisy-augment", "./crates/nvisy-ontology", - "./crates/nvisy-rig", + "./crates/nvisy-paddle", + "./crates/nvisy-pattern", "./crates/nvisy-python", + "./crates/nvisy-rig", 
"./crates/nvisy-server", - "./crates/nvisy-cli", ] [workspace.package] @@ -36,57 +37,64 @@ documentation = "https://docs.rs/nvisy-runtime" # See for more details: https://github.com/rust-lang/cargo/issues/11329 # Internal crates +nvisy-asr = { path = "./crates/nvisy-asr", version = "0.1.0" } +nvisy-augment = { path = "./crates/nvisy-augment", version = "0.1.0" } +nvisy-codec = { path = "./crates/nvisy-codec", version = "0.1.0" } nvisy-core = { path = "./crates/nvisy-core", version = "0.1.0" } nvisy-engine = { path = "./crates/nvisy-engine", version = "0.1.0" } -nvisy-codec = { path = "./crates/nvisy-codec", version = "0.1.0" } -nvisy-object = { path = "./crates/nvisy-object", version = "0.1.0" } -nvisy-pattern = { path = "./crates/nvisy-pattern", version = "0.1.0" } nvisy-identify = { path = "./crates/nvisy-identify", version = "0.1.0" } -nvisy-augment = { path = "./crates/nvisy-augment", version = "0.1.0" } nvisy-ontology = { path = "./crates/nvisy-ontology", version = "0.1.0" } -nvisy-rig = { path = "./crates/nvisy-rig", version = "0.1.0" } +nvisy-paddle = { path = "./crates/nvisy-paddle", version = "0.1.0" } +nvisy-pattern = { path = "./crates/nvisy-pattern", version = "0.1.0" } nvisy-python = { path = "./crates/nvisy-python", version = "0.1.0" } +nvisy-rig = { path = "./crates/nvisy-rig", version = "0.1.0" } nvisy-server = { path = "./crates/nvisy-server", version = "0.1.0" } # LLM framework rig-core = { version = "0.31", features = [] } +# HTTP middleware +reqwest-middleware = { version = "0.5", features = [] } +reqwest-retry = { version = "0.9", features = [] } +reqwest-tracing = { version = "0.7", features = [] } + # Async runtime -tokio = { version = "1", features = [] } +tokio = { version = "1.0", features = [] } tokio-util = { version = "0.7", features = [] } futures = { version = "0.3", features = [] } async-trait = { version = "0.1", features = [] } # Observability tracing = { version = "0.1", features = ["attributes"] } +tracing-subscriber = { version 
= "0.3", features = [] } # (De)serialization serde = { version = "1.0", features = ["derive"] } serde_json = { version = "1.0", features = [] } -schemars = { version = "1", features = ["uuid1", "bytes1"] } +schemars = { version = "1.0", features = ["uuid1", "bytes1"] } +csv = { version = "1.0", features = [] } # Derive macros and error handling thiserror = { version = "2.0", features = [] } anyhow = { version = "1.0", features = [] } -derive_more = { version = "2", features = ["display", "from", "into"] } +derive_more = { version = "2.0", features = ["display", "from", "into"] } strum = { version = "0.28", features = ["derive"] } -# Encoding -base64 = { version = "0.22", features = [] } - # Primitive datatypes -uuid = { version = "1", features = ["serde", "v4", "v7"] } -bytes = { version = "1", features = ["serde"] } +uuid = { version = "1.0", features = ["serde", "v4", "v7"] } +bytes = { version = "1.0", features = ["serde"] } +hipstr = { version = "0.6", features = [] } jiff = { version = "0.2", features = ["serde"] } +semver = { version = "1.0", features = ["serde"] } + +# Encoding and hashing +base64 = { version = "0.22", features = [] } sha2 = { version = "0.10", features = [] } hex = { version = "0.4", features = [] } -# Text processing -hipstr = { version = "0.6", features = [] } +# Pattern matching regex = { version = "1.0", features = [] } -aho-corasick = { version = "1", features = [] } -csv = { version = "1", features = [] } -include_dir = { version = "0.7", features = [] } +aho-corasick = { version = "1.0", features = [] } # Graph data structures petgraph = { version = "0.8", features = [] } @@ -94,31 +102,28 @@ petgraph = { version = "0.8", features = [] } # File type detection infer = { version = "0.19", features = [] } -# Python interop -pyo3 = { version = "0.24", features = [] } -pyo3-async-runtimes = { version = "0.24", features = ["tokio-runtime"] } -pythonize = { version = "0.24", features = [] } - -# Cloud object storage (S3, Azure Blob, GCS) 
-object_store = { version = "0.13", default-features = false } - -# Image processing -image = { version = "0.25", default-features = false, features = ["png", "jpeg", "tiff"] } -imageproc = { version = "0.26", features = [] } - # Document parsing pdf-extract = { version = "0.7", features = [] } lopdf = { version = "0.39", features = [] } scraper = { version = "0.22", features = [] } calamine = { version = "0.33", features = [] } -zip = { version = "8", features = [] } +zip = { version = "8.0", features = [] } quick-xml = { version = "0.37", features = [] } -# Semantic versioning -semver = { version = "1", features = ["serde"] } +# Image processing +image = { version = "0.25", default-features = false, features = ["png", "jpeg", "tiff"] } +imageproc = { version = "0.26", features = [] } + +# Compile-time asset embedding +include_dir = { version = "0.7", features = [] } + +# Python interop +pyo3 = { version = "0.24", features = [] } +pyo3-async-runtimes = { version = "0.24", features = ["tokio-runtime"] } +pythonize = { version = "0.24", features = [] } # CLI -clap = { version = "4", features = [] } +clap = { version = "4.0", features = [] } # HTTP server axum = { version = "0.8", features = [] } @@ -127,7 +132,7 @@ tower = { version = "0.5", features = [] } tower-http = { version = "0.6", features = [] } # Testing -tempfile = { version = "3", features = [] } +tempfile = { version = "3.0", features = [] } # Randomness rand = { version = "0.10", features = [] } diff --git a/README.md b/README.md index 9c09b80..ecbad49 100644 --- a/README.md +++ b/README.md @@ -37,4 +37,3 @@ Apache 2.0 License, see [LICENSE.txt](LICENSE.txt) - **Documentation**: [docs.nvisy.com](https://docs.nvisy.com) - **Issues**: [GitHub Issues](https://github.com/nvisycom/runtime/issues) - **Email**: [support@nvisy.com](mailto:support@nvisy.com) -- **API Status**: [nvisy.openstatus.dev](https://nvisy.openstatus.dev) diff --git a/crates/nvisy-asr/Cargo.toml b/crates/nvisy-asr/Cargo.toml new file 
mode 100644 index 0000000..b8ff004 --- /dev/null +++ b/crates/nvisy-asr/Cargo.toml @@ -0,0 +1,34 @@ +# https://doc.rust-lang.org/cargo/reference/manifest.html + +[package] +name = "nvisy-asr" +description = "ASR/speech-to-text backend trait and provider integration for Nvisy" +keywords = ["nvisy", "asr", "speech", "transcription"] +categories = ["multimedia::audio"] + +version = { workspace = true } +rust-version = { workspace = true } +edition = { workspace = true } +license = { workspace = true } +publish = { workspace = true } + +authors = { workspace = true } +repository = { workspace = true } +homepage = { workspace = true } +documentation = { workspace = true } + +[package.metadata.docs.rs] +all-features = true +rustdoc-args = ["--cfg", "docsrs"] + +[dependencies] +# Internal crates +nvisy-core = { workspace = true, features = [] } +nvisy-ontology = { workspace = true, features = [] } +nvisy-python = { workspace = true, features = [] } + +# (De)serialization +serde_json = { workspace = true, features = [] } + +# Async runtime +async-trait = { workspace = true, features = [] } diff --git a/crates/nvisy-object/README.md b/crates/nvisy-asr/README.md similarity index 67% rename from crates/nvisy-object/README.md rename to crates/nvisy-asr/README.md index a0040e9..d725219 100644 --- a/crates/nvisy-object/README.md +++ b/crates/nvisy-asr/README.md @@ -1,8 +1,10 @@ -# nvisy-object +# nvisy-asr [![Build](https://img.shields.io/github/actions/workflow/status/nvisycom/runtime/build.yml?branch=main&label=build%20%26%20test&style=flat-square)](https://github.com/nvisycom/runtime/actions/workflows/build.yml) -Object store plugin for the Nvisy runtime. Provides cloud storage providers (S3, Azure Blob Storage, Google Cloud Storage) and streaming read/write interfaces for ingesting and outputting data through the processing pipeline. +ASR/speech-to-text backend trait and provider integration for the Nvisy runtime. 
+ +Defines the `TranscribeBackend` trait for automatic speech recognition providers, configuration types, result parsing from raw JSON into entity types, and a `PythonBridge` implementation that delegates to the `nvisy_ai` Python module. ## Documentation diff --git a/crates/nvisy-asr/src/backend.rs b/crates/nvisy-asr/src/backend.rs new file mode 100644 index 0000000..8ee4dc7 --- /dev/null +++ b/crates/nvisy-asr/src/backend.rs @@ -0,0 +1,34 @@ +//! Transcription backend trait and configuration. + +use serde_json::Value; + +use nvisy_core::Error; + +/// Configuration passed to a [`TranscribeBackend`] implementation. +#[derive(Debug, Clone)] +pub struct TranscribeConfig { + /// BCP-47 language tag for transcription. + pub language: String, + /// Whether to perform speaker diarization. + pub enable_speaker_diarization: bool, + /// Minimum confidence threshold for results. + pub confidence_threshold: f64, +} + +/// Backend trait for transcription providers. +/// +/// Implementations call an external speech-to-text service and return +/// raw JSON results. Entity construction is handled by the consuming crate. +#[async_trait::async_trait] +pub trait TranscribeBackend: Send + Sync + 'static { + /// Transcribe audio bytes, returning raw dicts. + /// + /// Each dict should contain: `text`, `start_time`, `end_time`, `confidence`, + /// and optionally `speaker_id`. + async fn transcribe( + &self, + audio_data: &[u8], + mime_type: &str, + config: &TranscribeConfig, + ) -> Result, Error>; +} diff --git a/crates/nvisy-asr/src/bridge.rs b/crates/nvisy-asr/src/bridge.rs new file mode 100644 index 0000000..8edb249 --- /dev/null +++ b/crates/nvisy-asr/src/bridge.rs @@ -0,0 +1,28 @@ +//! [`TranscribeBackend`] implementation for [`PythonBridge`]. 
+ +use serde_json::Value; + +use nvisy_core::Error; +use nvisy_python::bridge::PythonBridge; +use nvisy_python::transcribe::TranscribeParams; + +use crate::backend::{TranscribeBackend, TranscribeConfig}; + +/// Converts [`TranscribeConfig`] to [`TranscribeParams`] and delegates to +/// `nvisy_python::transcribe`. +#[async_trait::async_trait] +impl TranscribeBackend for PythonBridge { + async fn transcribe( + &self, + audio_data: &[u8], + mime_type: &str, + config: &TranscribeConfig, + ) -> Result, Error> { + let params = TranscribeParams { + language: config.language.clone(), + enable_speaker_diarization: config.enable_speaker_diarization, + confidence_threshold: config.confidence_threshold, + }; + nvisy_python::transcribe::transcribe(self, audio_data, mime_type, ¶ms).await + } +} diff --git a/crates/nvisy-asr/src/lib.rs b/crates/nvisy-asr/src/lib.rs new file mode 100644 index 0000000..7e3e178 --- /dev/null +++ b/crates/nvisy-asr/src/lib.rs @@ -0,0 +1,10 @@ +#![forbid(unsafe_code)] +#![cfg_attr(docsrs, feature(doc_cfg))] +#![doc = include_str!("../README.md")] + +mod backend; +mod bridge; +mod parse; + +pub use backend::{TranscribeBackend, TranscribeConfig}; +pub use parse::parse_transcribe_entities; diff --git a/crates/nvisy-asr/src/parse.rs b/crates/nvisy-asr/src/parse.rs new file mode 100644 index 0000000..b23c8b3 --- /dev/null +++ b/crates/nvisy-asr/src/parse.rs @@ -0,0 +1,67 @@ +//! Transcription result parsing. + +use serde_json::Value; + +use nvisy_core::math::TimeSpan; +use nvisy_core::Error; +use nvisy_ontology::entity::{DetectionMethod, Entity, EntityCategory, EntityKind}; +use nvisy_ontology::location::{AudioLocation, Location}; + +/// Parse raw JSON dicts from a transcription backend into [`Entity`] values. +/// +/// Expected dict keys: `text`, `start_time`, `end_time`, `confidence`, +/// and optionally `speaker_id`. 
+pub fn parse_transcribe_entities(raw: &[Value]) -> Result, Error> { + let mut entities = Vec::new(); + + for item in raw { + let obj = item.as_object().ok_or_else(|| { + Error::python("Expected JSON object in transcription results".to_string()) + })?; + + let text = obj + .get("text") + .and_then(Value::as_str) + .ok_or_else(|| Error::python("Missing 'text' in transcription result".to_string()))?; + + let start_time = obj + .get("start_time") + .and_then(Value::as_f64) + .ok_or_else(|| Error::python("Missing 'start_time'".to_string()))?; + + let end_time = obj + .get("end_time") + .and_then(Value::as_f64) + .ok_or_else(|| Error::python("Missing 'end_time'".to_string()))?; + + let confidence = obj + .get("confidence") + .and_then(Value::as_f64) + .unwrap_or(0.0); + + let speaker_id = obj + .get("speaker_id") + .and_then(Value::as_str) + .map(String::from); + + let entity = Entity::new( + EntityCategory::Pii, + EntityKind::PersonName, + text, + DetectionMethod::SpeechTranscript, + confidence, + ) + .with_location(Location::Audio(AudioLocation { + time_span: TimeSpan { + start_secs: start_time, + end_secs: end_time, + }, + speaker_id, + audio_id: None, + })); + + entities.push(entity); + } + + Ok(entities) +} diff --git a/crates/nvisy-augment/Cargo.toml b/crates/nvisy-augment/Cargo.toml index f543f92..222d382 100644 --- a/crates/nvisy-augment/Cargo.toml +++ b/crates/nvisy-augment/Cargo.toml @@ -28,6 +28,8 @@ nvisy-ontology = { workspace = true, features = [] } nvisy-codec = { workspace = true, features = [] } nvisy-python = { workspace = true, features = [] } nvisy-rig = { workspace = true, features = [] } +nvisy-paddle = { workspace = true, features = [] } +nvisy-asr = { workspace = true, features = [] } # (De)serialization serde = { workspace = true, features = ["derive"] } diff --git a/crates/nvisy-augment/README.md b/crates/nvisy-augment/README.md index 9bb6ffb..f75fcc3 100644 --- a/crates/nvisy-augment/README.md +++ b/crates/nvisy-augment/README.md @@ -4,7 +4,7 
@@ Content augmentation actions for the Nvisy runtime. -Provides OCR text extraction from images, audio transcription, and synthetic data generation for replacing redacted entities with realistic placeholder values. +Provides OCR text extraction from images (via `nvisy-rig`), audio transcription (via `nvisy-asr`), and synthetic data generation for replacing redacted entities with realistic placeholder values. ## Documentation diff --git a/crates/nvisy-augment/src/ocr.rs b/crates/nvisy-augment/src/ocr.rs index 3b5eeb8..92574e4 100644 --- a/crates/nvisy-augment/src/ocr.rs +++ b/crates/nvisy-augment/src/ocr.rs @@ -2,19 +2,14 @@ //! from image documents. use serde::Deserialize; -use serde_json::Value; use nvisy_codec::document::Document; use nvisy_codec::handler::{Handler, PngHandler, TxtHandler}; -use nvisy_ontology::entity::{EntityCategory, EntityKind}; use nvisy_core::Error; -use nvisy_core::math::BoundingBox; -use nvisy_python::bridge::PythonBridge; -use nvisy_python::ocr::OcrParams; +use nvisy_ontology::entity::Entity; -use nvisy_ontology::entity::{DetectionMethod, Entity}; -use nvisy_ontology::location::{ImageLocation, Location}; +pub use nvisy_paddle::{OcrBackend, OcrConfig, parse_ocr_entities}; fn default_language() -> String { "eng".into() @@ -28,32 +23,6 @@ fn default_confidence() -> f64 { 0.5 } -/// Configuration passed to an [`OcrBackend`] implementation. -#[derive(Debug, Clone)] -pub struct OcrConfig { - /// Language hint (e.g. `"eng"` for English). - pub language: String, - /// OCR engine to use (`"tesseract"`, `"google-vision"`, `"aws-textract"`). - pub engine: String, - /// Minimum confidence threshold for OCR results. - pub confidence_threshold: f64, -} - -/// Backend trait for OCR providers. -/// -/// Implementations call an external OCR service and return raw JSON -/// results. Entity construction is handled by [`GenerateOcrAction`]. 
-#[async_trait::async_trait] -pub trait OcrBackend: Send + Sync + 'static { - /// Run OCR on image bytes, returning raw dicts. - async fn detect_ocr( - &self, - image_data: &[u8], - mime_type: &str, - config: &OcrConfig, - ) -> Result, Error>; -} - /// Typed parameters for [`GenerateOcrAction`]. #[derive(Debug, Deserialize)] #[serde(rename_all = "camelCase")] @@ -139,64 +108,3 @@ impl GenerateOcrAction { }) } } - -/// Parse raw JSON dicts from an OCR backend into [`Entity`] values. -/// -/// Expected dict keys: `text`, `x`, `y`, `width`, `height`, `confidence`. -pub fn parse_ocr_entities(raw: &[Value]) -> Result, Error> { - let mut entities = Vec::new(); - - for item in raw { - let obj = item.as_object().ok_or_else(|| { - Error::python("Expected JSON object in OCR results".to_string()) - })?; - - let text = obj - .get("text") - .and_then(Value::as_str) - .ok_or_else(|| Error::python("Missing 'text' in OCR result".to_string()))?; - - let x = obj.get("x").and_then(Value::as_f64).unwrap_or(0.0); - let y = obj.get("y").and_then(Value::as_f64).unwrap_or(0.0); - let width = obj.get("width").and_then(Value::as_f64).unwrap_or(0.0); - let height = obj.get("height").and_then(Value::as_f64).unwrap_or(0.0); - let confidence = obj.get("confidence").and_then(Value::as_f64).unwrap_or(0.0); - - let entity = Entity::new( - EntityCategory::Pii, - EntityKind::Handwriting, - text, - DetectionMethod::Ocr, - confidence, - ) - .with_location(Location::Image(ImageLocation { - bounding_box: BoundingBox { x, y, width, height }, - image_id: None, - page_number: None, - })); - - entities.push(entity); - } - - Ok(entities) -} - -/// [`OcrBackend`] implementation for [`PythonBridge`]. -/// -/// Converts [`OcrConfig`] to [`OcrParams`] and delegates to `nvisy_python::ocr`. 
-#[async_trait::async_trait] -impl OcrBackend for PythonBridge { - async fn detect_ocr( - &self, - image_data: &[u8], - mime_type: &str, - config: &OcrConfig, - ) -> Result, Error> { - let params = OcrParams { - language: config.language.clone(), - engine: config.engine.clone(), - confidence_threshold: config.confidence_threshold, - }; - nvisy_python::ocr::detect_ocr(self, image_data, mime_type, ¶ms).await - } -} diff --git a/crates/nvisy-augment/src/transcribe.rs b/crates/nvisy-augment/src/transcribe.rs index 3eac6c9..8dee0d5 100644 --- a/crates/nvisy-augment/src/transcribe.rs +++ b/crates/nvisy-augment/src/transcribe.rs @@ -2,18 +2,14 @@ //! locations and transcript documents from audio input. use serde::Deserialize; -use serde_json::Value; use nvisy_codec::document::Document; use nvisy_codec::handler::{Handler, WavHandler, TxtHandler}; -use nvisy_core::math::TimeSpan; use nvisy_core::Error; -use nvisy_ontology::entity::{EntityCategory, EntityKind}; -use nvisy_ontology::entity::{DetectionMethod, Entity}; -use nvisy_ontology::location::{AudioLocation, Location}; -use nvisy_python::bridge::PythonBridge; -use nvisy_python::transcribe::TranscribeParams; +use nvisy_ontology::entity::Entity; + +pub use nvisy_asr::{TranscribeBackend, TranscribeConfig, parse_transcribe_entities}; fn default_language() -> String { "en".into() @@ -23,35 +19,6 @@ fn default_confidence() -> f64 { 0.5 } -/// Configuration passed to a [`TranscribeBackend`] implementation. -#[derive(Debug, Clone)] -pub struct TranscribeConfig { - /// BCP-47 language tag for transcription. - pub language: String, - /// Whether to perform speaker diarization. - pub enable_speaker_diarization: bool, - /// Minimum confidence threshold for results. - pub confidence_threshold: f64, -} - -/// Backend trait for transcription providers. -/// -/// Implementations call an external speech-to-text service and return -/// raw JSON results. Entity construction is handled by [`GenerateTranscribeAction`]. 
-#[async_trait::async_trait] -pub trait TranscribeBackend: Send + Sync + 'static { - /// Transcribe audio bytes, returning raw dicts. - /// - /// Each dict should contain: `text`, `start_time`, `end_time`, `confidence`, - /// and optionally `speaker_id`. - async fn transcribe( - &self, - audio_data: &[u8], - mime_type: &str, - config: &TranscribeConfig, - ) -> Result, Error>; -} - /// Typed parameters for [`GenerateTranscribeAction`]. #[derive(Debug, Deserialize)] #[serde(rename_all = "camelCase")] @@ -138,87 +105,11 @@ impl GenerateTranscribeAction { } } -/// Parse raw JSON dicts from a transcription backend into [`Entity`] values. -/// -/// Expected dict keys: `text`, `start_time`, `end_time`, `confidence`, -/// and optionally `speaker_id`. -pub fn parse_transcribe_entities(raw: &[Value]) -> Result, Error> { - let mut entities = Vec::new(); - - for item in raw { - let obj = item.as_object().ok_or_else(|| { - Error::python("Expected JSON object in transcription results".to_string()) - })?; - - let text = obj - .get("text") - .and_then(Value::as_str) - .ok_or_else(|| Error::python("Missing 'text' in transcription result".to_string()))?; - - let start_time = obj - .get("start_time") - .and_then(Value::as_f64) - .ok_or_else(|| Error::python("Missing 'start_time'".to_string()))?; - - let end_time = obj - .get("end_time") - .and_then(Value::as_f64) - .ok_or_else(|| Error::python("Missing 'end_time'".to_string()))?; - - let confidence = obj - .get("confidence") - .and_then(Value::as_f64) - .unwrap_or(0.0); - - let speaker_id = obj - .get("speaker_id") - .and_then(Value::as_str) - .map(String::from); - - let entity = Entity::new( - EntityCategory::Pii, - EntityKind::PersonName, - text, - DetectionMethod::SpeechTranscript, - confidence, - ) - .with_location(Location::Audio(AudioLocation { - time_span: TimeSpan { - start_secs: start_time, - end_secs: end_time, - }, - speaker_id, - audio_id: None, - })); - - entities.push(entity); - } - - Ok(entities) -} - -/// 
[`TranscribeBackend`] implementation for [`PythonBridge`]. -#[async_trait::async_trait] -impl TranscribeBackend for PythonBridge { - async fn transcribe( - &self, - audio_data: &[u8], - mime_type: &str, - config: &TranscribeConfig, - ) -> Result, Error> { - let params = TranscribeParams { - language: config.language.clone(), - enable_speaker_diarization: config.enable_speaker_diarization, - confidence_threshold: config.confidence_threshold, - }; - nvisy_python::transcribe::transcribe(self, audio_data, mime_type, ¶ms).await - } -} - #[cfg(test)] mod tests { use super::*; - use serde_json::json; + use nvisy_ontology::entity::DetectionMethod; + use serde_json::{json, Value}; #[test] fn parse_transcribe_entities_basic() { diff --git a/crates/nvisy-cli/Cargo.toml b/crates/nvisy-cli/Cargo.toml index eba4379..1b37d29 100644 --- a/crates/nvisy-cli/Cargo.toml +++ b/crates/nvisy-cli/Cargo.toml @@ -34,6 +34,9 @@ nvisy-server = { workspace = true, features = [] } # CLI clap = { workspace = true, features = ["derive", "env"] } +# Error handling +anyhow = { workspace = true, features = [] } + # HTTP framework axum = { workspace = true, features = ["tokio"] } @@ -42,4 +45,4 @@ tokio = { workspace = true, features = ["rt-multi-thread", "macros", "signal"] } # Observability tracing = { workspace = true, features = [] } -tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] } +tracing-subscriber = { workspace = true, features = ["env-filter", "json"] } diff --git a/crates/nvisy-cli/src/config/mod.rs b/crates/nvisy-cli/src/config/mod.rs index af70b7c..6aa6dbb 100644 --- a/crates/nvisy-cli/src/config/mod.rs +++ b/crates/nvisy-cli/src/config/mod.rs @@ -1,37 +1,48 @@ -//! CLI configuration parsed from command-line arguments and environment -//! variables via [`clap`]. +//! CLI configuration management. //! -//! All fields have sensible defaults and can be overridden by environment -//! variables (`HOST`, `PORT`, `RUST_LOG`, etc.) or CLI flags. +//! 
This module defines the complete CLI configuration hierarchy: +//! +//! ```text +//! Cli +//! ├── server: ServerConfig # Host, port, content directory +//! ├── body_limit_bytes: usize # Extractor body limit (default: 2 MiB) +//! ├── file_body_limit_bytes: usize # Upload body limit (default: 50 MiB) +//! └── request_timeout_secs: u64 # Per-request timeout (default: 300s) +//! ``` +//! +//! All configuration can be provided via CLI arguments or environment variables. +//! Use `--help` to see all available options. +//! +//! # Example +//! +//! ```bash +//! # Configure via CLI flags +//! nvisy-server --host 127.0.0.1 --port 3000 --request-timeout-secs 60 +//! +//! # Or via environment variables +//! HOST=127.0.0.1 PORT=3000 REQUEST_TIMEOUT_SECS=60 nvisy-server +//! ``` -use std::net::{IpAddr, Ipv4Addr, SocketAddr}; -use std::path::PathBuf; +mod server; use clap::Parser; +use tracing_subscriber::EnvFilter; use nvisy_server::middleware::{OpenApiConfig, RecoveryConfig, SecurityConfig}; -/// nvisy API server. -#[derive(Debug, Parser)] -#[command(name = "nvisy-server", version, about)] -pub struct ServerConfig { - /// Address to bind the HTTP listener to. - #[arg(long, env = "HOST", default_value_t = IpAddr::V4(Ipv4Addr::UNSPECIFIED))] - pub host: IpAddr, - - /// Port to bind the HTTP listener to. - #[arg(long, env = "PORT", default_value_t = 8080)] - pub port: u16, - - /// Directory for temporary content storage. - /// - /// Defaults to `$TMPDIR/nvisy-server-content` if not set. - #[arg(long, env = "CONTENT_DIR")] - pub content_dir: Option, +pub use server::ServerConfig; - /// Tracing filter directive (e.g. `info`, `nvisy_server=debug`). - #[arg(long, env = "RUST_LOG", default_value = "info")] - pub log_level: String, +/// Complete CLI configuration. 
+/// +/// Combines all configuration groups for the nvisy server: +/// - [`ServerConfig`]: Network binding and content directory +/// - Middleware settings: Body limits, timeouts, OpenAPI +#[derive(Debug, Parser)] +#[command(name = "nvisy-server", version, about = "nvisy API server")] +pub struct Cli { + /// Server network and lifecycle configuration. + #[command(flatten)] + pub server: ServerConfig, /// Maximum body size in bytes for axum extractors (Json, Form, etc.). #[arg(long, env = "BODY_LIMIT_BYTES", default_value_t = 2 * 1024 * 1024)] @@ -46,19 +57,7 @@ pub struct ServerConfig { pub request_timeout_secs: u64, } -impl ServerConfig { - /// Returns the socket address to bind the listener to. - pub fn socket_addr(&self) -> SocketAddr { - SocketAddr::new(self.host, self.port) - } - - /// Returns the content directory, falling back to a temp directory. - pub fn content_dir(&self) -> PathBuf { - self.content_dir - .clone() - .unwrap_or_else(|| std::env::temp_dir().join("nvisy-server-content")) - } - +impl Cli { /// Builds a [`SecurityConfig`] from the parsed CLI values. pub fn security_config(&self) -> SecurityConfig { SecurityConfig { @@ -78,4 +77,17 @@ impl ServerConfig { pub fn open_api_config(&self) -> OpenApiConfig { OpenApiConfig::default() } + + /// Initializes tracing with environment-based filtering. + /// + /// Uses `RUST_LOG` if set, otherwise defaults to `info`. + pub fn init_tracing() { + let filter = + EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info")); + + tracing_subscriber::fmt() + .with_env_filter(filter) + .json() + .init(); + } } diff --git a/crates/nvisy-cli/src/config/server.rs b/crates/nvisy-cli/src/config/server.rs new file mode 100644 index 0000000..2c688ee --- /dev/null +++ b/crates/nvisy-cli/src/config/server.rs @@ -0,0 +1,63 @@ +//! HTTP server network and lifecycle configuration. +//! +//! # Environment Variables +//! +//! - `HOST` — Server host address (default: `0.0.0.0`) +//! 
- `PORT` — Server port (default: `8080`) +//! - `CONTENT_DIR` — Temporary content storage directory +//! - `SHUTDOWN_TIMEOUT` — Graceful shutdown timeout in seconds (default: `30`) + +use std::net::{IpAddr, Ipv4Addr, SocketAddr}; +use std::path::PathBuf; +use std::time::Duration; + +use clap::Args; + +/// HTTP server network and lifecycle configuration. +/// +/// Controls how the server binds to network interfaces, where +/// temporary content is stored, and graceful shutdown behavior. +#[derive(Debug, Clone, Args)] +pub struct ServerConfig { + /// Host address to bind the server to. + /// + /// Use `127.0.0.1` for localhost only, `0.0.0.0` for all interfaces. + #[arg(long, env = "HOST", default_value_t = IpAddr::V4(Ipv4Addr::UNSPECIFIED))] + pub host: IpAddr, + + /// TCP port number for the server to listen on. + #[arg(short = 'p', long, env = "PORT", default_value_t = 8080)] + pub port: u16, + + /// Directory for temporary content storage. + /// + /// Defaults to `$TMPDIR/nvisy-server-content` if not set. + #[arg(long, env = "CONTENT_DIR")] + pub content_dir: Option, + + /// Maximum time in seconds to wait for graceful shutdown. + /// + /// During shutdown, the server stops accepting new connections and waits + /// for existing requests to complete before forcefully terminating. + #[arg(long, env = "SHUTDOWN_TIMEOUT", default_value_t = 30)] + pub shutdown_timeout: u64, +} + +impl ServerConfig { + /// Returns the socket address for server binding. + pub fn socket_addr(&self) -> SocketAddr { + SocketAddr::new(self.host, self.port) + } + + /// Returns the content directory, falling back to a temp directory. + pub fn content_dir(&self) -> PathBuf { + self.content_dir + .clone() + .unwrap_or_else(|| std::env::temp_dir().join("nvisy-server-content")) + } + + /// Returns the graceful shutdown timeout as a [`Duration`]. 
+ pub fn shutdown_timeout(&self) -> Duration { + Duration::from_secs(self.shutdown_timeout) + } +} diff --git a/crates/nvisy-cli/src/main.rs b/crates/nvisy-cli/src/main.rs index d1e8c0c..ffef9db 100644 --- a/crates/nvisy-cli/src/main.rs +++ b/crates/nvisy-cli/src/main.rs @@ -1,32 +1,55 @@ -//! nvisy API server entry point. -//! -//! Parses CLI arguments, initialises tracing, constructs application state, -//! and starts the HTTP server with graceful shutdown support. - -use clap::Parser; -use nvisy_core::fs::ContentRegistry; -use tracing_subscriber::EnvFilter; +#![forbid(unsafe_code)] +#![cfg_attr(docsrs, feature(doc_cfg))] +#![doc = include_str!("../README.md")] mod config; mod server; -use config::ServerConfig; +use std::process; + +use axum::Router; +use clap::Parser; +use nvisy_core::fs::ContentRegistry; +use nvisy_server::middleware::*; +use nvisy_server::service::ServiceState; + +use crate::config::Cli; #[tokio::main] async fn main() { - let config = ServerConfig::parse(); + let Err(error) = run().await else { + process::exit(0); + }; + + if tracing::enabled!(tracing::Level::ERROR) { + tracing::error!(error = %error, "application terminated with error"); + } else { + eprintln!("Error: {error:#}"); + } + + process::exit(1); +} + +/// Main application entry point. 
+async fn run() -> anyhow::Result<()> { + let cli = Cli::parse(); + Cli::init_tracing(); - tracing_subscriber::fmt() - .with_env_filter( - EnvFilter::try_from_default_env() - .unwrap_or_else(|_| EnvFilter::new(&config.log_level)), - ) - .json() - .init(); + // Initialize application state + let content_registry = ContentRegistry::new(cli.server.content_dir()); + let state = ServiceState::new(content_registry); - let content_registry = ContentRegistry::new(config.content_dir()); - let state = nvisy_server::ServiceState::new(content_registry); - let app = server::build_router(&config, state); + // Build and run + let router = create_router(&cli, state); + server::run(&cli.server, router).await +} - server::run(&config, app).await; +/// Creates the router with all middleware layers applied. +fn create_router(cli: &Cli, state: ServiceState) -> Router { + nvisy_server::handler::routes() + .with_open_api(&cli.open_api_config()) + .with_recovery(&cli.recovery_config()) + .with_observability() + .with_security(&cli.security_config()) + .with_state(state) } diff --git a/crates/nvisy-cli/src/server/listen.rs b/crates/nvisy-cli/src/server/listen.rs new file mode 100644 index 0000000..207a9f6 --- /dev/null +++ b/crates/nvisy-cli/src/server/listen.rs @@ -0,0 +1,40 @@ +//! TCP listener binding and graceful server lifecycle. + +use std::path::Path; + +use tokio::net::TcpListener; + +use crate::config::ServerConfig; + +use super::shutdown; + +/// Binds a TCP listener, serves the application, and cleans up on shutdown. +/// +/// Blocks until a shutdown signal (SIGINT or SIGTERM) is received. After the +/// server stops, it removes the temporary content directory if one was created. 
+pub async fn run(config: &ServerConfig, app: axum::Router) -> anyhow::Result<()> { + let addr = config.socket_addr(); + let listener = TcpListener::bind(addr).await?; + + tracing::info!(%addr, "listening"); + + let shutdown = shutdown::shutdown_signal(config.shutdown_timeout()); + + axum::serve(listener, app) + .with_graceful_shutdown(shutdown) + .await?; + + cleanup_content_dir(&config.content_dir()); + Ok(()) +} + +/// Removes the temporary content directory after graceful shutdown. +fn cleanup_content_dir(path: &Path) { + if !path.exists() { + return; + } + match std::fs::remove_dir_all(path) { + Ok(()) => tracing::info!(path = %path.display(), "content directory cleaned up"), + Err(e) => tracing::warn!(path = %path.display(), "failed to clean up content directory: {e}"), + } +} diff --git a/crates/nvisy-cli/src/server/mod.rs b/crates/nvisy-cli/src/server/mod.rs index 4bc4527..b0c9e2c 100644 --- a/crates/nvisy-cli/src/server/mod.rs +++ b/crates/nvisy-cli/src/server/mod.rs @@ -1,83 +1,9 @@ -//! Server lifecycle: router construction, TCP listener, and graceful shutdown. +//! Server lifecycle: TCP listener and graceful shutdown. -use std::path::Path; +mod listen; +mod shutdown; -use tokio::net::TcpListener; +/// Tracing target for shutdown events. +pub const TRACING_TARGET_SHUTDOWN: &str = "nvisy_cli::server::shutdown"; -use nvisy_server::middleware::{ - RouterObservabilityExt, RouterOpenApiExt, RouterRecoveryExt, RouterSecurityExt, -}; -use nvisy_server::ServiceState; - -use crate::config::ServerConfig; - -/// Builds the application router with all middleware layers applied. -pub fn build_router(config: &ServerConfig, state: ServiceState) -> axum::Router { - nvisy_server::routes() - .with_open_api(&config.open_api_config()) - .with_recovery(&config.recovery_config()) - .with_observability() - .with_security(&config.security_config()) - .with_state(state) -} - -/// Binds a TCP listener, serves the application, and cleans up on shutdown. 
-/// -/// Blocks until a shutdown signal (SIGINT or SIGTERM) is received. After the -/// server stops, it removes the temporary content directory if one was created. -pub async fn run(config: &ServerConfig, app: axum::Router) { - let addr = config.socket_addr(); - - let listener = TcpListener::bind(addr).await.unwrap_or_else(|e| { - panic!("failed to bind to {addr}: {e}"); - }); - - tracing::info!(%addr, "listening"); - - axum::serve(listener, app) - .with_graceful_shutdown(shutdown_signal()) - .await - .unwrap_or_else(|e| { - panic!("server error: {e}"); - }); - - cleanup_content_dir(&config.content_dir()); -} - -/// Waits for SIGINT (Ctrl+C) or SIGTERM to initiate graceful shutdown. -async fn shutdown_signal() { - let ctrl_c = async { - tokio::signal::ctrl_c() - .await - .expect("failed to install Ctrl+C handler"); - }; - - #[cfg(unix)] - let terminate = async { - tokio::signal::unix::signal(tokio::signal::unix::SignalKind::terminate()) - .expect("failed to install SIGTERM handler") - .recv() - .await; - }; - - #[cfg(not(unix))] - let terminate = std::future::pending::<()>(); - - tokio::select! { - () = ctrl_c => {}, - () = terminate => {}, - } - - tracing::info!("shutdown signal received"); -} - -/// Removes the temporary content directory after graceful shutdown. -fn cleanup_content_dir(path: &Path) { - if !path.exists() { - return; - } - match std::fs::remove_dir_all(path) { - Ok(()) => tracing::info!(path = %path.display(), "content directory cleaned up"), - Err(e) => tracing::warn!(path = %path.display(), "failed to clean up content directory: {e}"), - } -} +pub use listen::run; diff --git a/crates/nvisy-cli/src/server/shutdown.rs b/crates/nvisy-cli/src/server/shutdown.rs new file mode 100644 index 0000000..a6bb00a --- /dev/null +++ b/crates/nvisy-cli/src/server/shutdown.rs @@ -0,0 +1,65 @@ +//! Graceful shutdown signal handling. 
+ +use std::time::Duration; + +use tokio::signal::ctrl_c; +#[cfg(unix)] +use tokio::signal::unix; + +use super::TRACING_TARGET_SHUTDOWN; + +/// Waits for a shutdown signal (SIGTERM or SIGINT/Ctrl+C). +/// +/// Listens for OS termination signals and returns when one is received. +/// The `shutdown_timeout` is logged to inform operators how long cleanup +/// will wait before the process is forcefully terminated. +pub async fn shutdown_signal(shutdown_timeout: Duration) { + let ctrl_c = async { + if let Err(e) = ctrl_c().await { + tracing::error!( + target: TRACING_TARGET_SHUTDOWN, + error = %e, + "failed to install Ctrl+C handler" + ); + } else { + tracing::info!( + target: TRACING_TARGET_SHUTDOWN, + "received Ctrl+C signal, initiating graceful shutdown" + ); + } + }; + + #[cfg(unix)] + let terminate = async { + match unix::signal(unix::SignalKind::terminate()) { + Ok(mut signal) => { + signal.recv().await; + tracing::info!( + target: TRACING_TARGET_SHUTDOWN, + "received SIGTERM signal, initiating graceful shutdown" + ); + } + Err(e) => { + tracing::error!( + target: TRACING_TARGET_SHUTDOWN, + error = %e, + "failed to install SIGTERM handler" + ); + } + } + }; + + #[cfg(not(unix))] + let terminate = std::future::pending::<()>(); + + tokio::select! { + () = ctrl_c => {}, + () = terminate => {}, + } + + tracing::info!( + target: TRACING_TARGET_SHUTDOWN, + timeout_secs = shutdown_timeout.as_secs(), + "graceful shutdown initiated" + ); +} diff --git a/crates/nvisy-core/src/fs/content_kind.rs b/crates/nvisy-core/src/fs/content_kind.rs index 288f488..8811f40 100644 --- a/crates/nvisy-core/src/fs/content_kind.rs +++ b/crates/nvisy-core/src/fs/content_kind.rs @@ -12,9 +12,7 @@ use strum::{AsRefStr, Display, EnumIter, EnumString}; /// This enum represents high-level content categories without knowledge /// of specific file extensions or MIME types. The engine's format registry /// handles the mapping from extensions/MIME types to content kinds. 
-#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Hash)] -#[derive(AsRefStr, Display, EnumString, EnumIter)] -#[derive(Serialize, Deserialize)] +#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Hash, AsRefStr, Display, EnumString, EnumIter, Serialize, Deserialize)] #[strum(serialize_all = "lowercase")] #[serde(rename_all = "lowercase")] pub enum ContentKind { diff --git a/crates/nvisy-core/src/fs/document_type.rs b/crates/nvisy-core/src/fs/document_type.rs index ba0cec7..172e4a9 100644 --- a/crates/nvisy-core/src/fs/document_type.rs +++ b/crates/nvisy-core/src/fs/document_type.rs @@ -1,11 +1,12 @@ //! Document format classification. use serde::{Deserialize, Serialize}; +use strum::{Display, EnumString}; /// Document format that content can be classified as. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] -#[derive(schemars::JsonSchema)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Display, EnumString, Serialize, Deserialize, schemars::JsonSchema)] #[serde(rename_all = "snake_case")] +#[strum(serialize_all = "snake_case")] pub enum DocumentType { /// Plain text (`.txt`, `.log`, etc.). Txt, diff --git a/crates/nvisy-engine/src/apply/image.rs b/crates/nvisy-engine/src/apply/image.rs index 9a516f3..e39bf87 100644 --- a/crates/nvisy-engine/src/apply/image.rs +++ b/crates/nvisy-engine/src/apply/image.rs @@ -9,7 +9,7 @@ use nvisy_codec::transform::{ImageRedaction, ImageRedactionOutput, ImageHandler} use nvisy_ontology::entity::Entity; use nvisy_ontology::location::Location; use nvisy_ontology::record::Redaction; -use nvisy_ontology::spec::{ImageRedactionInput, RedactionInput}; +use nvisy_ontology::specification::{ImageRedactionInput, RedactionInput}; use nvisy_core::Error; /// Convert a `RedactionInput::Image` into a codec [`ImageRedactionOutput`]. 
@@ -71,7 +71,7 @@ pub(crate) async fn apply_image_doc( #[cfg(test)] mod tests { use super::*; - use nvisy_ontology::spec::TextRedactionInput; + use nvisy_ontology::specification::TextRedactionInput; #[test] fn image_output_blur() { @@ -123,7 +123,7 @@ mod tests { #[test] fn image_output_audio_spec_returns_none() { - let spec = RedactionInput::Audio(nvisy_ontology::spec::AudioRedactionInput::Silence); + let spec = RedactionInput::Audio(nvisy_ontology::specification::AudioRedactionInput::Silence); assert_eq!(image_output_from_spec(&spec), None); } } diff --git a/crates/nvisy-engine/src/apply/tabular.rs b/crates/nvisy-engine/src/apply/tabular.rs index 3cb8b2d..5525480 100644 --- a/crates/nvisy-engine/src/apply/tabular.rs +++ b/crates/nvisy-engine/src/apply/tabular.rs @@ -8,7 +8,7 @@ use nvisy_codec::document::Document; use nvisy_ontology::entity::Entity; use nvisy_ontology::location::Location; use nvisy_ontology::record::Redaction; -use nvisy_ontology::spec::{RedactionInput, TextRedactionInput}; +use nvisy_ontology::specification::{RedactionInput, TextRedactionInput}; use nvisy_core::Error; pub(crate) async fn apply_tabular_doc( diff --git a/crates/nvisy-engine/src/apply/text.rs b/crates/nvisy-engine/src/apply/text.rs index d02a97d..c5678b6 100644 --- a/crates/nvisy-engine/src/apply/text.rs +++ b/crates/nvisy-engine/src/apply/text.rs @@ -9,7 +9,7 @@ use nvisy_codec::transform::{TextRedaction, TextRedactionOutput, TextHandler}; use nvisy_ontology::entity::Entity; use nvisy_ontology::location::Location; use nvisy_ontology::record::Redaction; -use nvisy_ontology::spec::{RedactionInput, TextRedactionInput}; +use nvisy_ontology::specification::{RedactionInput, TextRedactionInput}; use nvisy_core::Error; /// Convert a `RedactionInput::Text` + replacement string into a codec @@ -129,7 +129,7 @@ pub(crate) async fn apply_text_doc( #[cfg(test)] mod tests { use super::*; - use nvisy_ontology::spec::ImageRedactionInput; + use nvisy_ontology::specification::ImageRedactionInput; 
#[test] fn text_output_remove_empty_replacement() { diff --git a/crates/nvisy-identify/Cargo.toml b/crates/nvisy-identify/Cargo.toml index 112facb..e51ce04 100644 --- a/crates/nvisy-identify/Cargo.toml +++ b/crates/nvisy-identify/Cargo.toml @@ -31,8 +31,8 @@ nvisy-core = { workspace = true, features = [] } nvisy-ontology = { workspace = true, features = [] } nvisy-codec = { workspace = true, features = [] } nvisy-pattern = { workspace = true, features = [] } -nvisy-python = { workspace = true, features = [] } nvisy-rig = { workspace = true, features = [] } +nvisy-asr = { workspace = true, features = [] } # (De)serialization serde = { workspace = true, features = ["derive"] } @@ -51,13 +51,9 @@ semver = { workspace = true, features = [] } # Derive macros and error handling strum = { workspace = true, features = ["derive"] } -# Text processing -regex = { workspace = true, features = [] } - # Observability tracing = { workspace = true, features = [] } [dev-dependencies] -regex = { workspace = true, features = [] } serde_json = { workspace = true, features = [] } tokio = { workspace = true, features = ["macros", "rt"] } diff --git a/crates/nvisy-identify/README.md b/crates/nvisy-identify/README.md index aac2569..b746fd3 100644 --- a/crates/nvisy-identify/README.md +++ b/crates/nvisy-identify/README.md @@ -2,9 +2,17 @@ [![Build](https://img.shields.io/github/actions/workflow/status/nvisycom/runtime/build.yml?branch=main&label=build%20%26%20test&style=flat-square)](https://github.com/nvisycom/runtime/actions/workflows/build.yml) -Entity ontology types, detection layers, and pattern/dictionary infrastructure for the Nvisy runtime. +Detection orchestration, entity ontology, and policy evaluation for the Nvisy runtime. 
-Defines the core entity model (`Entity`, `DetectionMethod`, locations), detection traits (`DetectionLayer`, `Detect`), and concrete detection layers for text (regex patterns, Aho-Corasick dictionaries, NER), tabular data (column rules), and documents (checksum validation, manual annotations). +Organized by **detection method** rather than content modality: + +- **pattern/** — Deterministic regex and dictionary matching via `PatternEngine` +- **ner/** — Statistical NLP named-entity recognition (text and image) +- **llm/** — LLM-based contextual entity detection via `LlmBackend` +- **vision/** — Computer vision layers (face, object, OCR detection) +- **audio/** — Audio detection via transcription + NER pipeline +- **fusion/** — Post-detection entity merging, deduplication, and ensemble scoring +- **policy/** — Policy evaluation, governance rules, and audit trails ## Documentation diff --git a/crates/nvisy-identify/src/action/dedup.rs b/crates/nvisy-identify/src/fusion/dedup.rs similarity index 100% rename from crates/nvisy-identify/src/action/dedup.rs rename to crates/nvisy-identify/src/fusion/dedup.rs diff --git a/crates/nvisy-identify/src/fusion/ensemble.rs b/crates/nvisy-identify/src/fusion/ensemble.rs new file mode 100644 index 0000000..97ba413 --- /dev/null +++ b/crates/nvisy-identify/src/fusion/ensemble.rs @@ -0,0 +1,209 @@ +//! Ensemble entity fusion — merges entities from multiple detectors +//! using configurable confidence-combination strategies. + +use std::collections::HashMap; + +use crate::{DetectionMethod, Entity, Location}; + +/// Strategy for combining confidence scores from multiple detectors. +#[derive(Debug, Clone)] +pub enum FusionStrategy { + /// Take the maximum confidence across all detectors. + MaxConfidence, + /// Weighted average by detection method. + WeightedAverage { + weights: HashMap, + }, + /// Noisy-OR: `P = 1 − ∏(1 − pᵢ)` for independent detectors. 
+ NoisyOr, +} + +/// Ensemble merge — groups entities by `(kind, value, overlapping location)` +/// then fuses confidence using the configured [`FusionStrategy`]. +pub struct EnsembleMerge { + strategy: FusionStrategy, +} + +impl EnsembleMerge { + /// Create a new ensemble merge with the given strategy. + pub fn new(strategy: FusionStrategy) -> Self { + Self { strategy } + } + + /// Group entities by `(kind, value, overlapping location)` then fuse + /// confidence according to the strategy. + pub fn merge(&self, entities: Vec) -> Vec { + if entities.len() <= 1 { + return entities; + } + + let mut groups: Vec> = Vec::new(); + + for entity in entities { + let group = groups.iter_mut().find(|group| { + let representative = &group[0]; + representative.entity_kind == entity.entity_kind + && representative.value == entity.value + && locations_overlap(&representative.location, &entity.location) + }); + + match group { + Some(g) => g.push(entity), + None => groups.push(vec![entity]), + } + } + + groups + .into_iter() + .map(|group| self.fuse_group(group)) + .collect() + } + + /// Fuse a group of matching entities into a single entity. 
+ fn fuse_group(&self, group: Vec) -> Entity { + debug_assert!(!group.is_empty()); + + if group.len() == 1 { + return group.into_iter().next().unwrap(); + } + + let fused_confidence = match &self.strategy { + FusionStrategy::MaxConfidence => { + group.iter().map(|e| e.confidence).fold(0.0_f64, f64::max) + } + FusionStrategy::WeightedAverage { weights } => { + let mut total_weight = 0.0; + let mut weighted_sum = 0.0; + for e in &group { + let w = weights.get(&e.detection_method).copied().unwrap_or(1.0); + weighted_sum += e.confidence * w; + total_weight += w; + } + if total_weight > 0.0 { + weighted_sum / total_weight + } else { + 0.0 + } + } + FusionStrategy::NoisyOr => { + // P = 1 − ∏(1 − pᵢ) + let product: f64 = group.iter().map(|e| 1.0 - e.confidence).product(); + 1.0 - product + } + }; + + // Use the first entity as the base and update confidence/method. + let mut result = group.into_iter().next().unwrap(); + result.confidence = fused_confidence; + result.detection_method = DetectionMethod::Composite; + result + } +} + +/// Check whether two optional locations overlap. 
+fn locations_overlap(a: &Option, b: &Option) -> bool { + match (a, b) { + (None, None) => true, + (Some(Location::Text(a_loc)), Some(Location::Text(b_loc))) => a_loc.overlaps(b_loc), + _ => false, + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::TextLocation; + use nvisy_ontology::entity::{EntityCategory, EntityKind}; + + fn text_entity( + value: &str, + method: DetectionMethod, + confidence: f64, + start: usize, + end: usize, + ) -> Entity { + Entity::new( + EntityCategory::Pii, + EntityKind::PersonName, + value, + method, + confidence, + ) + .with_location(Location::Text(TextLocation { + start_offset: start, + end_offset: end, + ..Default::default() + })) + } + + #[test] + fn max_confidence_strategy() { + let merge = EnsembleMerge::new(FusionStrategy::MaxConfidence); + let entities = vec![ + text_entity("John", DetectionMethod::Regex, 0.7, 0, 4), + text_entity("John", DetectionMethod::Ner, 0.85, 0, 4), + ]; + let result = merge.merge(entities); + assert_eq!(result.len(), 1); + assert!((result[0].confidence - 0.85).abs() < f64::EPSILON); + assert_eq!(result[0].detection_method, DetectionMethod::Composite); + } + + #[test] + fn noisy_or_strategy() { + let merge = EnsembleMerge::new(FusionStrategy::NoisyOr); + let entities = vec![ + text_entity("John", DetectionMethod::Regex, 0.7, 0, 4), + text_entity("John", DetectionMethod::Ner, 0.8, 0, 4), + ]; + let result = merge.merge(entities); + assert_eq!(result.len(), 1); + // P = 1 − (1 − 0.7)(1 − 0.8) = 1 − (0.3)(0.2) = 0.94 + assert!((result[0].confidence - 0.94).abs() < 0.001); + } + + #[test] + fn weighted_average_strategy() { + let mut weights = HashMap::new(); + weights.insert(DetectionMethod::Regex, 1.0); + weights.insert(DetectionMethod::Ner, 2.0); + + let merge = EnsembleMerge::new(FusionStrategy::WeightedAverage { weights }); + let entities = vec![ + text_entity("John", DetectionMethod::Regex, 0.6, 0, 4), + text_entity("John", DetectionMethod::Ner, 0.9, 0, 4), + ]; + let result = 
merge.merge(entities); + assert_eq!(result.len(), 1); + // (0.6 * 1.0 + 0.9 * 2.0) / (1.0 + 2.0) = 2.4 / 3.0 = 0.8 + assert!((result[0].confidence - 0.8).abs() < 0.001); + } + + #[test] + fn non_overlapping_not_merged() { + let merge = EnsembleMerge::new(FusionStrategy::NoisyOr); + let entities = vec![ + text_entity("John", DetectionMethod::Regex, 0.7, 0, 4), + text_entity("John", DetectionMethod::Ner, 0.8, 10, 14), + ]; + let result = merge.merge(entities); + assert_eq!(result.len(), 2); + } + + #[test] + fn single_entity_unchanged() { + let merge = EnsembleMerge::new(FusionStrategy::NoisyOr); + let entities = vec![text_entity("John", DetectionMethod::Regex, 0.7, 0, 4)]; + let result = merge.merge(entities); + assert_eq!(result.len(), 1); + assert!((result[0].confidence - 0.7).abs() < f64::EPSILON); + assert_eq!(result[0].detection_method, DetectionMethod::Regex); + } + + #[test] + fn empty_input() { + let merge = EnsembleMerge::new(FusionStrategy::MaxConfidence); + let result = merge.merge(Vec::new()); + assert!(result.is_empty()); + } +} diff --git a/crates/nvisy-identify/src/action/manual.rs b/crates/nvisy-identify/src/fusion/manual.rs similarity index 100% rename from crates/nvisy-identify/src/action/manual.rs rename to crates/nvisy-identify/src/fusion/manual.rs diff --git a/crates/nvisy-identify/src/action/mod.rs b/crates/nvisy-identify/src/fusion/mod.rs similarity index 53% rename from crates/nvisy-identify/src/action/mod.rs rename to crates/nvisy-identify/src/fusion/mod.rs index 988fbc2..4bf91a2 100644 --- a/crates/nvisy-identify/src/action/mod.rs +++ b/crates/nvisy-identify/src/fusion/mod.rs @@ -1,7 +1,9 @@ -//! Post-detection actions. +//! Post-detection entity merging, deduplication, and manual annotations. 
pub mod dedup; +pub mod ensemble; pub mod manual; pub use dedup::DeduplicateAction; +pub use ensemble::{EnsembleMerge, FusionStrategy}; pub use manual::{DetectManualAction, DetectManualParams, Exclusion, ManualOutput, is_excluded}; diff --git a/crates/nvisy-identify/src/image/face.rs b/crates/nvisy-identify/src/image/face.rs deleted file mode 100644 index 71209ad..0000000 --- a/crates/nvisy-identify/src/image/face.rs +++ /dev/null @@ -1,126 +0,0 @@ -//! Face detection layer for images. -//! -//! Delegates to a [`FaceBackend`] to detect human faces in images, -//! producing entities with [`ImageLocation`] bounding boxes. - -use serde_json::Value; - -use nvisy_codec::handler::{ImageData, Span}; -use nvisy_core::math::BoundingBox; -use nvisy_core::Error; - -use nvisy_ontology::entity::{EntityCategory, EntityKind}; - -use crate::{DetectionMethod, Entity, ImageLocation, Location}; -use crate::{ParallelContext, DetectionService}; - -/// Backend trait for face detection providers. -#[async_trait::async_trait] -pub trait FaceBackend: Send + Sync + 'static { - /// Detect faces in an image, returning raw JSON dicts. - /// - /// Each dict should contain: `confidence`, `x`, `y`, `width`, `height`. - async fn detect_faces( - &self, - image_data: &[u8], - mime_type: &str, - ) -> Result, Error>; -} - -/// Face detection layer — delegates to a [`FaceBackend`] at runtime. -pub struct FaceDetection { - backend: B, -} - -impl FaceDetection { - /// Create a new face detection layer with the given backend. 
- pub fn new(backend: B) -> Self { - Self { backend } - } -} - -#[async_trait::async_trait] -impl DetectionService<(), ImageData> for FaceDetection { - type Context = ParallelContext; - - async fn detect( - &self, - spans: Vec>, - ) -> Result, Error> { - let mut entities = Vec::new(); - - for span in &spans { - let png_bytes = span.data.encode_png()?; - - let raw = self.backend.detect_faces(&png_bytes, "image/png").await?; - - for item in &raw { - let obj = item.as_object().ok_or_else(|| { - Error::python("Expected JSON object in face detection results".to_string()) - })?; - - let confidence = obj.get("confidence").and_then(Value::as_f64).unwrap_or(0.0); - let x = obj.get("x").and_then(Value::as_f64).unwrap_or(0.0); - let y = obj.get("y").and_then(Value::as_f64).unwrap_or(0.0); - let width = obj.get("width").and_then(Value::as_f64).unwrap_or(0.0); - let height = obj.get("height").and_then(Value::as_f64).unwrap_or(0.0); - - let entity = Entity::new( - EntityCategory::Biometric, - EntityKind::Face, - "face", - DetectionMethod::FaceDetection, - confidence, - ) - .with_location(Location::Image(ImageLocation { - bounding_box: BoundingBox { x, y, width, height }, - image_id: None, - page_number: None, - })) - .with_parent(&span.source); - - entities.push(entity); - } - } - - Ok(entities) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use serde_json::json; - - struct MockFaceBackend; - - #[async_trait::async_trait] - impl FaceBackend for MockFaceBackend { - async fn detect_faces(&self, _: &[u8], _: &str) -> Result, Error> { - Ok(vec![json!({ - "confidence": 0.98, - "x": 50.0, - "y": 30.0, - "width": 120.0, - "height": 150.0 - })]) - } - } - - #[tokio::test] - async fn detect_face_produces_image_location() { - let layer = FaceDetection::new(MockFaceBackend); - - let img = ImageData::new_rgb(200, 200); - let spans = vec![Span::new((), img)]; - - let entities = layer.detect(spans).await.unwrap(); - assert_eq!(entities.len(), 1); - assert_eq!(entities[0].entity_kind, 
EntityKind::Face); - assert_eq!(entities[0].detection_method, DetectionMethod::FaceDetection); - - let loc = entities[0].location.as_ref().unwrap().as_image().unwrap(); - assert!((loc.bounding_box.x - 50.0).abs() < f64::EPSILON); - assert!((loc.bounding_box.width - 120.0).abs() < f64::EPSILON); - } -} diff --git a/crates/nvisy-identify/src/image/mod.rs b/crates/nvisy-identify/src/image/mod.rs deleted file mode 100644 index ca564d5..0000000 --- a/crates/nvisy-identify/src/image/mod.rs +++ /dev/null @@ -1,9 +0,0 @@ -//! Image detection layers. - -pub mod ner; -pub mod face; -pub mod object; - -pub use ner::ImageNerDetection; -pub use face::{FaceBackend, FaceDetection}; -pub use object::{ObjectBackend, ObjectDetection}; diff --git a/crates/nvisy-identify/src/image/ner.rs b/crates/nvisy-identify/src/image/ner.rs deleted file mode 100644 index 54526af..0000000 --- a/crates/nvisy-identify/src/image/ner.rs +++ /dev/null @@ -1,112 +0,0 @@ -//! NER detection on images via [`NerBackend::detect_image`]. -//! -//! Encodes the image to PNG, sends it to the NER backend, and returns -//! entities with [`ImageLocation`] bounding boxes. - -use nvisy_codec::handler::{ImageData, Span}; -use nvisy_core::Error; - -use crate::Entity; -use crate::{ParallelContext, DetectionService}; -use crate::ner::{NerBackend, NerConfig, parse_image_ner_entity}; - -/// NER detection layer for images. -/// -/// Encodes each image span to PNG and delegates to an [`NerBackend`] -/// for entity recognition. -pub struct ImageNerDetection { - backend: B, - config: NerConfig, -} - -impl ImageNerDetection { - /// Create a new image NER detection layer. 
- pub fn new(backend: B, config: NerConfig) -> Self { - Self { backend, config } - } -} - -#[async_trait::async_trait] -impl DetectionService<(), ImageData> for ImageNerDetection { - type Context = ParallelContext; - - async fn detect( - &self, - spans: Vec>, - ) -> Result, Error> { - let mut entities = Vec::new(); - - for span in &spans { - let png_bytes = span.data.encode_png()?; - - let raw = self - .backend - .detect_image(&png_bytes, "image/png", &self.config) - .await?; - - for item in &raw { - if let Some(entity) = parse_image_ner_entity(item)? { - entities.push(entity.with_parent(&span.source)); - } - } - } - - Ok(entities) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::DetectionMethod; - use serde_json::{json, Value}; - - struct MockImageNerBackend; - - #[async_trait::async_trait] - impl NerBackend for MockImageNerBackend { - async fn detect_text(&self, _: &str, _: &NerConfig) -> Result, Error> { - Ok(Vec::new()) - } - - async fn detect_image( - &self, - _image_data: &[u8], - _mime_type: &str, - _config: &NerConfig, - ) -> Result, Error> { - Ok(vec![json!({ - "category": "pii", - "entity_type": "person_name", - "value": "John Doe", - "confidence": 0.92, - "x": 10.0, - "y": 20.0, - "width": 100.0, - "height": 30.0 - })]) - } - } - - #[tokio::test] - async fn detect_image_produces_image_location() { - let config = NerConfig { - entity_types: vec![], - confidence_threshold: 0.0, - }; - let layer = ImageNerDetection::new(MockImageNerBackend, config); - - // Create a tiny 1x1 image. 
- let img = ImageData::new_rgb(1, 1); - let spans = vec![Span::new((), img)]; - - let entities = layer.detect(spans).await.unwrap(); - assert_eq!(entities.len(), 1); - assert_eq!(entities[0].value, "John Doe"); - assert_eq!(entities[0].detection_method, DetectionMethod::Ner); - - let loc = entities[0].location.as_ref().unwrap().as_image().unwrap(); - assert!((loc.bounding_box.x - 10.0).abs() < f64::EPSILON); - assert!((loc.bounding_box.y - 20.0).abs() < f64::EPSILON); - } -} diff --git a/crates/nvisy-identify/src/image/object.rs b/crates/nvisy-identify/src/image/object.rs deleted file mode 100644 index e21e41c..0000000 --- a/crates/nvisy-identify/src/image/object.rs +++ /dev/null @@ -1,153 +0,0 @@ -//! Object detection layer for images. -//! -//! Delegates to an [`ObjectBackend`] to detect objects in images, -//! producing entities with [`ImageLocation`] bounding boxes. - -use std::str::FromStr; - -use serde_json::Value; - -use nvisy_codec::handler::{ImageData, Span}; -use nvisy_ontology::entity::{EntityCategory, EntityKind}; -use nvisy_core::math::BoundingBox; -use nvisy_core::Error; - -use crate::{DetectionMethod, Entity, ImageLocation, Location}; -use crate::{ParallelContext, DetectionService}; - -/// Backend trait for object detection providers. -#[async_trait::async_trait] -pub trait ObjectBackend: Send + Sync + 'static { - /// Detect objects in an image, returning raw JSON dicts. - /// - /// Each dict should contain: `label`, `confidence`, `x`, `y`, `width`, `height`, - /// and optionally `category` and `entity_type`. - async fn detect_objects( - &self, - image_data: &[u8], - mime_type: &str, - ) -> Result, Error>; -} - -/// Object detection layer — delegates to an [`ObjectBackend`] at runtime. -pub struct ObjectDetection { - backend: B, -} - -impl ObjectDetection { - /// Create a new object detection layer with the given backend. 
- pub fn new(backend: B) -> Self { - Self { backend } - } -} - -#[async_trait::async_trait] -impl DetectionService<(), ImageData> for ObjectDetection { - type Context = ParallelContext; - - async fn detect( - &self, - spans: Vec>, - ) -> Result, Error> { - let mut entities = Vec::new(); - - for span in &spans { - let png_bytes = span.data.encode_png()?; - - let raw = self.backend.detect_objects(&png_bytes, "image/png").await?; - - for item in &raw { - let obj = item.as_object().ok_or_else(|| { - Error::python("Expected JSON object in object detection results".to_string()) - })?; - - let label = obj - .get("label") - .and_then(Value::as_str) - .unwrap_or("unknown"); - - let entity_kind = obj - .get("entity_type") - .and_then(Value::as_str) - .and_then(|s| EntityKind::from_str(s).ok()) - .unwrap_or(EntityKind::Logo); - - let category = obj - .get("category") - .and_then(Value::as_str) - .map(|s| match s { - "pii" => EntityCategory::Pii, - "phi" => EntityCategory::Phi, - "biometric" => EntityCategory::Biometric, - other => EntityCategory::Custom(other.to_string()), - }) - .unwrap_or(EntityCategory::Pii); - - let confidence = obj.get("confidence").and_then(Value::as_f64).unwrap_or(0.0); - let x = obj.get("x").and_then(Value::as_f64).unwrap_or(0.0); - let y = obj.get("y").and_then(Value::as_f64).unwrap_or(0.0); - let width = obj.get("width").and_then(Value::as_f64).unwrap_or(0.0); - let height = obj.get("height").and_then(Value::as_f64).unwrap_or(0.0); - - let entity = Entity::new( - category, - entity_kind, - label, - DetectionMethod::ObjectDetection, - confidence, - ) - .with_location(Location::Image(ImageLocation { - bounding_box: BoundingBox { x, y, width, height }, - image_id: None, - page_number: None, - })) - .with_parent(&span.source); - - entities.push(entity); - } - } - - Ok(entities) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use serde_json::json; - - struct MockObjectBackend; - - #[async_trait::async_trait] - impl ObjectBackend for 
MockObjectBackend { - async fn detect_objects(&self, _: &[u8], _: &str) -> Result, Error> { - Ok(vec![json!({ - "label": "license_plate", - "entity_type": "license_plate", - "category": "pii", - "confidence": 0.88, - "x": 100.0, - "y": 200.0, - "width": 80.0, - "height": 30.0 - })]) - } - } - - #[tokio::test] - async fn detect_object_produces_image_location() { - let layer = ObjectDetection::new(MockObjectBackend); - - let img = ImageData::new_rgb(400, 300); - let spans = vec![Span::new((), img)]; - - let entities = layer.detect(spans).await.unwrap(); - assert_eq!(entities.len(), 1); - assert_eq!(entities[0].entity_kind, EntityKind::LicensePlate); - assert_eq!(entities[0].detection_method, DetectionMethod::ObjectDetection); - assert_eq!(entities[0].value, "license_plate"); - - let loc = entities[0].location.as_ref().unwrap().as_image().unwrap(); - assert!((loc.bounding_box.x - 100.0).abs() < f64::EPSILON); - } -} diff --git a/crates/nvisy-identify/src/lib.rs b/crates/nvisy-identify/src/lib.rs index 6b092ca..f7b4905 100644 --- a/crates/nvisy-identify/src/lib.rs +++ b/crates/nvisy-identify/src/lib.rs @@ -2,33 +2,32 @@ #![cfg_attr(docsrs, feature(doc_cfg))] #![doc = include_str!("../README.md")] -mod ontology; mod layer; -mod ner; -mod text; -mod image; -mod action; +mod method; +mod fusion; mod policy; pub mod prelude; -// --- Domain types --- -pub use ontology::*; +// --- Domain types (re-exported from nvisy-ontology) --- +pub use nvisy_ontology::entity::{ + Annotation, AnnotationKind, AnnotationLabel, AnnotationScope, + DetectionMethod, DetectionOutput, Entity, EntitySelector, ModelInfo, ModelKind, +}; +pub use nvisy_ontology::location::{ + AudioLocation, ImageLocation, Location, TabularLocation, TextLocation, VideoLocation, +}; // --- Layer traits --- pub use layer::*; -// --- NER backend --- -pub use ner::{NerBackend, NerConfig}; - -// --- Detection layers --- -pub use text::{PatternDetection, PatternDetectionParams}; -pub use text::{NerDetection, 
NerDetectionParams}; -pub use image::{ImageNerDetection, FaceBackend, FaceDetection, ObjectBackend, ObjectDetection}; +// --- Detection methods --- +pub use method::{NerMethod, NerMethodParams, CvMethod, PatternDetection, PatternDetectionParams}; // --- Post-detection actions --- -pub use action::{DetectManualAction, DetectManualParams, Exclusion, ManualOutput, is_excluded}; -pub use action::DeduplicateAction; +pub use fusion::{DetectManualAction, DetectManualParams, Exclusion, ManualOutput, is_excluded}; +pub use fusion::DeduplicateAction; +pub use fusion::{EnsembleMerge, FusionStrategy}; // --- Policy & governance --- pub use policy::{ diff --git a/crates/nvisy-identify/src/method/cv.rs b/crates/nvisy-identify/src/method/cv.rs new file mode 100644 index 0000000..3d9a697 --- /dev/null +++ b/crates/nvisy-identify/src/method/cv.rs @@ -0,0 +1,75 @@ +//! Computer-vision detection adapter wrapping [`CvAgent`] from `nvisy-rig`. +//! +//! Detects entities in image spans by delegating to the CvAgent's +//! object-detection + LLM-classification pipeline. + +use nvisy_codec::handler::{ImageData, Span}; +use nvisy_core::Error; +use nvisy_rig::{CvAgent, CvEntity, DetectionConfig}; + +use crate::{DetectionMethod, Entity, ImageLocation, Location}; +use crate::{ParallelContext, DetectionService}; +use nvisy_core::math::BoundingBox; + +/// Computer-vision detection method — thin adapter around [`CvAgent`]. +pub struct CvMethod { + agent: CvAgent, + config: DetectionConfig, +} + +impl CvMethod { + /// Create a new CV method from a pre-built agent and detection config. 
+ pub fn from_agent(agent: CvAgent, config: DetectionConfig) -> Self { + Self { agent, config } + } +} + +#[async_trait::async_trait] +impl DetectionService<(), ImageData> for CvMethod { + type Context = ParallelContext; + + async fn detect( + &self, + spans: Vec>, + ) -> Result, Error> { + let mut entities = Vec::new(); + + for span in &spans { + let png_bytes = span.data.encode_png()?; + + let cv_entities = self + .agent + .detect(&png_bytes, &self.config) + .await + .map_err(|e| Error::runtime(e.to_string(), "cv-agent", e.is_retryable()))?; + + for cv_entity in &cv_entities { + let entity = map_cv_entity(cv_entity); + entities.push(entity.with_parent(&span.source)); + } + } + + Ok(entities) + } +} + +/// Convert a [`CvEntity`] to an [`Entity`] with [`ImageLocation`]. +fn map_cv_entity(cv: &CvEntity) -> Entity { + Entity::new( + cv.category.clone(), + cv.entity_type, + &cv.label, + DetectionMethod::ObjectDetection, + cv.confidence, + ) + .with_location(Location::Image(ImageLocation { + bounding_box: BoundingBox { + x: cv.bbox[0], + y: cv.bbox[1], + width: cv.bbox[2], + height: cv.bbox[3], + }, + image_id: None, + page_number: None, + })) +} diff --git a/crates/nvisy-identify/src/method/mod.rs b/crates/nvisy-identify/src/method/mod.rs new file mode 100644 index 0000000..e3d5197 --- /dev/null +++ b/crates/nvisy-identify/src/method/mod.rs @@ -0,0 +1,14 @@ +//! Detection method adapters wrapping external crates. +//! +//! Each sub-module provides a thin struct that holds an agent or engine +//! from `nvisy-rig` / `nvisy-pattern` and implements the +//! [`DetectionLayer`](crate::DetectionLayer) / +//! [`DetectionService`](crate::DetectionService) traits. 
+ +mod ner; +mod cv; +mod pattern; + +pub use ner::{NerMethod, NerMethodParams}; +pub use cv::CvMethod; +pub use pattern::{PatternDetection, PatternDetectionParams}; diff --git a/crates/nvisy-identify/src/method/ner.rs b/crates/nvisy-identify/src/method/ner.rs new file mode 100644 index 0000000..8bd7b5d --- /dev/null +++ b/crates/nvisy-identify/src/method/ner.rs @@ -0,0 +1,176 @@ +//! NER detection adapter wrapping [`NerAgent`] from `nvisy-rig`. +//! +//! Uses a [`SequentialContext`] so the orchestrator feeds one span at +//! a time, allowing the adapter to accumulate known entities between +//! spans for coreference resolution. + +use serde::Deserialize; +use tokio::sync::Mutex; + +use nvisy_codec::handler::{Span, TxtSpan}; +use nvisy_ontology::entity::EntityKind; +use nvisy_core::Error; +use nvisy_ontology::entity::EntityCategory; +use nvisy_rig::{ + BaseAgentConfig, DetectionConfig, KnownNerEntity, NerAgent, NerContext, Provider, +}; + +use crate::{DetectionMethod, Entity, Location, TextLocation}; +use crate::{SequentialContext, DetectionLayer, DetectionService}; + +fn default_confidence() -> f64 { + 0.5 +} + +/// Typed parameters for [`NerMethod`]. +#[derive(Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct NerMethodParams { + /// Entity kinds to detect (empty = all). + #[serde(rename = "entityTypes", default)] + pub entity_kinds: Vec, + /// Minimum confidence score for returned entities. + #[serde(default = "default_confidence")] + pub confidence_threshold: f64, + /// Provider configuration for the NER agent. + #[serde(skip)] + pub provider: Option, + /// Optional agent config overrides. + #[serde(skip)] + pub agent_config: Option, +} + +/// Accumulated state between sequential span calls. +struct NerState { + /// Known entities from prior detection calls (for coreference). + known_entities: Vec, +} + +/// NER detection method — thin adapter around [`NerAgent`]. 
+/// +/// Uses [`SequentialContext`]: the orchestrator feeds one span at a +/// time so the adapter can carry known-entity context between spans. +pub struct NerMethod { + agent: NerAgent, + config: DetectionConfig, + state: Mutex, +} + +impl NerMethod { + /// Create a new NER method from a pre-built agent and detection config. + pub fn from_agent(agent: NerAgent, config: DetectionConfig) -> Self { + Self { + agent, + config, + state: Mutex::new(NerState { + known_entities: Vec::new(), + }), + } + } + + /// Clear accumulated state between documents. + pub async fn reset(&self) { + let mut state = self.state.lock().await; + state.known_entities.clear(); + } +} + +#[async_trait::async_trait] +impl DetectionLayer for NerMethod { + type Params = NerMethodParams; + + async fn connect(params: Self::Params) -> Result { + let provider = params.provider.ok_or_else(|| { + Error::validation("NerMethod requires a provider", "ner-method") + })?; + let agent_config = params.agent_config.unwrap_or_default(); + let agent = NerAgent::new(&provider, agent_config).map_err(|e| { + Error::validation(e.to_string(), "ner-method") + })?; + let config = DetectionConfig { + entity_kinds: params.entity_kinds, + confidence_threshold: params.confidence_threshold, + system_prompt: None, + }; + Ok(Self::from_agent(agent, config)) + } +} + +#[async_trait::async_trait] +impl DetectionService for NerMethod { + type Context = SequentialContext; + + async fn detect( + &self, + spans: Vec>, + ) -> Result, Error> { + let mut entities = Vec::new(); + + for span in &spans { + // Build NER context with accumulated known entities. + let known = { + let state = self.state.lock().await; + state.known_entities.clone() + }; + let ctx = NerContext::with_known(&span.data, known); + + let ner_entities = self + .agent + .detect(&ctx, &self.config) + .await + .map_err(|e| Error::runtime(e.to_string(), "ner-agent", e.is_retryable()))?; + + // Convert NerEntity → Entity with resolved offsets. 
+ for ner_entity in &ner_entities { + let category: EntityCategory = match ner_entity.category { + Some(ref c) => c.clone(), + None => continue, + }; + let entity_kind = match ner_entity.entity_type { + Some(ek) => ek, + None => continue, + }; + let confidence = ner_entity.confidence.unwrap_or(0.0); + if confidence < self.config.confidence_threshold { + continue; + } + + let mut entity = Entity::new( + category, + entity_kind, + &ner_entity.value, + DetectionMethod::Ner, + confidence, + ); + + // Resolve offsets within the current span text. + if let Some(offsets) = ner_entity.resolve_offsets(&ctx) { + entity = entity.with_location(Location::Text(TextLocation { + start_offset: offsets.start, + end_offset: offsets.end, + element_id: Some(span.id.0.to_string()), + ..Default::default() + })); + } else { + entity = entity.with_location(Location::Text(TextLocation { + element_id: Some(span.id.0.to_string()), + ..Default::default() + })); + } + + entities.push(entity.with_parent(&span.source)); + } + + // Accumulate known entities for coreference across spans. + let mut state = self.state.lock().await; + let mut merge_ctx = NerContext::with_known( + &span.data, + std::mem::take(&mut state.known_entities), + ); + merge_ctx.merge(ner_entities); + state.known_entities = merge_ctx.known_entities; + } + + Ok(entities) + } +} diff --git a/crates/nvisy-identify/src/text/pattern.rs b/crates/nvisy-identify/src/method/pattern.rs similarity index 100% rename from crates/nvisy-identify/src/text/pattern.rs rename to crates/nvisy-identify/src/method/pattern.rs diff --git a/crates/nvisy-identify/src/ner/backend.rs b/crates/nvisy-identify/src/ner/backend.rs deleted file mode 100644 index 4dbc4bf..0000000 --- a/crates/nvisy-identify/src/ner/backend.rs +++ /dev/null @@ -1,42 +0,0 @@ -//! NER backend trait and configuration. - -use serde_json::Value; - -use nvisy_core::Error; - -/// Configuration passed to an [`NerBackend`] implementation. 
-/// -/// Contains only the model-agnostic parameters that every backend needs. -/// Provider-specific fields (API key, model name, etc.) belong in the -/// action's [`NerDetectionParams`](super::super::text::ner::NerDetectionParams) -/// or the provider's credentials. -#[derive(Debug, Clone)] -pub struct NerConfig { - /// Entity type labels to detect (e.g., `["PERSON", "SSN"]`). - pub entity_types: Vec, - /// Minimum confidence score to include a detection (0.0 -- 1.0). - pub confidence_threshold: f64, -} - -/// Backend trait for NER providers. -/// -/// Implementations call an external NER service (e.g. via Python, HTTP) -/// and return raw JSON results. Entity construction from the raw dicts -/// is handled by the detection layers. -#[async_trait::async_trait] -pub trait NerBackend: Send + Sync + 'static { - /// Detect entities in text, returning raw dicts. - async fn detect_text( - &self, - text: &str, - config: &NerConfig, - ) -> Result, Error>; - - /// Detect entities in an image, returning raw dicts. - async fn detect_image( - &self, - image_data: &[u8], - mime_type: &str, - config: &NerConfig, - ) -> Result, Error>; -} diff --git a/crates/nvisy-identify/src/ner/bridge.rs b/crates/nvisy-identify/src/ner/bridge.rs deleted file mode 100644 index a4ef000..0000000 --- a/crates/nvisy-identify/src/ner/bridge.rs +++ /dev/null @@ -1,38 +0,0 @@ -//! [`NerBackend`] implementation for [`PythonBridge`]. - -use serde_json::Value; - -use nvisy_core::Error; -use nvisy_python::bridge::PythonBridge; -use nvisy_python::ner::NerParams; - -use super::backend::{NerBackend, NerConfig}; - -/// Converts [`NerConfig`] to [`NerParams`] and delegates to `nvisy_python::ner`. 
-#[async_trait::async_trait] -impl NerBackend for PythonBridge { - async fn detect_text( - &self, - text: &str, - config: &NerConfig, - ) -> Result, Error> { - let params = NerParams { - entity_types: config.entity_types.clone(), - confidence_threshold: config.confidence_threshold, - }; - nvisy_python::ner::detect_ner(self, text, ¶ms).await - } - - async fn detect_image( - &self, - image_data: &[u8], - mime_type: &str, - config: &NerConfig, - ) -> Result, Error> { - let params = NerParams { - entity_types: config.entity_types.clone(), - confidence_threshold: config.confidence_threshold, - }; - nvisy_python::ner::detect_ner_image(self, image_data, mime_type, ¶ms).await - } -} diff --git a/crates/nvisy-identify/src/ner/mod.rs b/crates/nvisy-identify/src/ner/mod.rs deleted file mode 100644 index 4a82676..0000000 --- a/crates/nvisy-identify/src/ner/mod.rs +++ /dev/null @@ -1,8 +0,0 @@ -//! Cross-modal NER backend trait, configuration, and result parsing. - -mod backend; -mod bridge; -mod parse; - -pub use backend::{NerBackend, NerConfig}; -pub use parse::{parse_image_ner_entity, parse_ner_entities}; diff --git a/crates/nvisy-identify/src/ner/parse.rs b/crates/nvisy-identify/src/ner/parse.rs deleted file mode 100644 index e37fd1b..0000000 --- a/crates/nvisy-identify/src/ner/parse.rs +++ /dev/null @@ -1,150 +0,0 @@ -//! NER result parsing for text and image modalities. - -use std::str::FromStr; - -use serde_json::Value; - -use nvisy_ontology::entity::{EntityCategory, EntityKind}; -use nvisy_core::math::BoundingBox; -use nvisy_core::Error; - -use crate::{DetectionMethod, Entity, ImageLocation, Location, TextLocation}; - -/// Parse raw JSON dicts from an NER backend into [`Entity`] values. -/// -/// Expected dict keys: `category`, `entity_type`, `value`, `confidence`, -/// and optionally `start_offset` / `end_offset`. 
-pub fn parse_ner_entities(raw: &[Value]) -> Result, Error> { - let mut entities = Vec::new(); - - for item in raw { - let obj = item.as_object().ok_or_else(|| { - Error::python("Expected JSON object in NER results".to_string()) - })?; - - let category_str = obj - .get("category") - .and_then(Value::as_str) - .ok_or_else(|| Error::python("Missing 'category'".to_string()))?; - - let category = match category_str { - "pii" => EntityCategory::Pii, - "phi" => EntityCategory::Phi, - "financial" => EntityCategory::Financial, - "credentials" => EntityCategory::Credentials, - other => EntityCategory::Custom(other.to_string()), - }; - - let entity_type_str = obj - .get("entity_type") - .and_then(Value::as_str) - .ok_or_else(|| Error::python("Missing 'entity_type'".to_string()))?; - - let entity_kind = match EntityKind::from_str(entity_type_str) { - Ok(ek) => ek, - Err(_) => { - tracing::warn!(entity_type = entity_type_str, "unknown entity type from NER, dropping"); - continue; - } - }; - - let value = obj - .get("value") - .and_then(Value::as_str) - .ok_or_else(|| Error::python("Missing 'value'".to_string()))?; - - let confidence = obj - .get("confidence") - .and_then(Value::as_f64) - .ok_or_else(|| Error::python("Missing 'confidence'".to_string()))?; - - let start_offset = obj - .get("start_offset") - .and_then(Value::as_u64) - .map(|v| v as usize) - .unwrap_or(0); - - let end_offset = obj - .get("end_offset") - .and_then(Value::as_u64) - .map(|v| v as usize) - .unwrap_or(0); - - let entity = Entity::new( - category, - entity_kind, - value, - DetectionMethod::Ner, - confidence, - ) - .with_location(Location::Text(TextLocation { - start_offset, - end_offset, - ..Default::default() - })); - - entities.push(entity); - } - - Ok(entities) -} - -/// Parse a single NER result dict into an [`Entity`] with [`ImageLocation`]. -/// -/// Expected keys: `category`, `entity_type`, `value`, `confidence`, -/// and optionally bounding box fields `x`, `y`, `width`, `height`. 
-pub fn parse_image_ner_entity(item: &Value) -> Result, Error> { - let obj = item.as_object().ok_or_else(|| { - Error::python("Expected JSON object in image NER results".to_string()) - })?; - - let category_str = obj - .get("category") - .and_then(Value::as_str) - .ok_or_else(|| Error::python("Missing 'category'".to_string()))?; - - let category = match category_str { - "pii" => EntityCategory::Pii, - "phi" => EntityCategory::Phi, - "financial" => EntityCategory::Financial, - "credentials" => EntityCategory::Credentials, - other => EntityCategory::Custom(other.to_string()), - }; - - let entity_type_str = obj - .get("entity_type") - .and_then(Value::as_str) - .ok_or_else(|| Error::python("Missing 'entity_type'".to_string()))?; - - let entity_kind = match EntityKind::from_str(entity_type_str) { - Ok(ek) => ek, - Err(_) => { - tracing::warn!(entity_type = entity_type_str, "unknown entity type from image NER, dropping"); - return Ok(None); - } - }; - - let value = obj - .get("value") - .and_then(Value::as_str) - .ok_or_else(|| Error::python("Missing 'value'".to_string()))?; - - let confidence = obj - .get("confidence") - .and_then(Value::as_f64) - .ok_or_else(|| Error::python("Missing 'confidence'".to_string()))?; - - let x = obj.get("x").and_then(Value::as_f64).unwrap_or(0.0); - let y = obj.get("y").and_then(Value::as_f64).unwrap_or(0.0); - let width = obj.get("width").and_then(Value::as_f64).unwrap_or(0.0); - let height = obj.get("height").and_then(Value::as_f64).unwrap_or(0.0); - - let entity = Entity::new(category, entity_kind, value, DetectionMethod::Ner, confidence) - .with_location(Location::Image(ImageLocation { - bounding_box: BoundingBox { x, y, width, height }, - image_id: None, - page_number: None, - })); - - Ok(Some(entity)) -} diff --git a/crates/nvisy-identify/src/ontology/mod.rs b/crates/nvisy-identify/src/ontology/mod.rs deleted file mode 100644 index 8145ee5..0000000 --- a/crates/nvisy-identify/src/ontology/mod.rs +++ /dev/null @@ -1,7 +0,0 @@ -pub 
use nvisy_ontology::entity::{ - Annotation, AnnotationKind, AnnotationLabel, AnnotationScope, - DetectionMethod, DetectionOutput, Entity, EntitySelector, ModelInfo, ModelKind, -}; -pub use nvisy_ontology::location::{ - AudioLocation, ImageLocation, Location, TabularLocation, TextLocation, VideoLocation, -}; diff --git a/crates/nvisy-identify/src/policy/audit.rs b/crates/nvisy-identify/src/policy/audit.rs index 99f6cf1..0ce4bf9 100644 --- a/crates/nvisy-identify/src/policy/audit.rs +++ b/crates/nvisy-identify/src/policy/audit.rs @@ -5,14 +5,13 @@ use jiff::Timestamp; use serde::{Deserialize, Serialize}; -use strum::Display; +use strum::{Display, EnumString}; use uuid::Uuid; use nvisy_core::path::ContentSource; /// Kind of auditable action recorded in an [`Audit`] entry. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Display, Serialize, Deserialize)] -#[derive(schemars::JsonSchema)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Display, EnumString, Serialize, Deserialize, schemars::JsonSchema)] #[serde(rename_all = "snake_case")] #[strum(serialize_all = "snake_case")] pub enum AuditAction { @@ -28,8 +27,7 @@ pub enum AuditAction { /// /// Audit entries are emitted by pipeline actions and form a tamper-evident /// log of all detection, redaction, and policy decisions. -#[derive(Debug, Clone, Serialize, Deserialize)] -#[derive(schemars::JsonSchema)] +#[derive(Debug, Clone, Serialize, Deserialize, schemars::JsonSchema)] pub struct Audit { /// Content source identity and lineage. 
#[serde(flatten)] diff --git a/crates/nvisy-identify/src/policy/evaluate.rs b/crates/nvisy-identify/src/policy/evaluate.rs index 07d624b..6330498 100644 --- a/crates/nvisy-identify/src/policy/evaluate.rs +++ b/crates/nvisy-identify/src/policy/evaluate.rs @@ -4,7 +4,7 @@ use serde::Deserialize; use crate::Entity; use nvisy_ontology::record::Redaction; -use nvisy_ontology::spec::{RedactionInput, TextRedactionInput}; +use nvisy_ontology::specification::{RedactionInput, TextRedactionInput}; use super::rule::PolicyRule; use nvisy_core::Error; diff --git a/crates/nvisy-identify/src/policy/mod.rs b/crates/nvisy-identify/src/policy/mod.rs index 484c517..df7ad1c 100644 --- a/crates/nvisy-identify/src/policy/mod.rs +++ b/crates/nvisy-identify/src/policy/mod.rs @@ -21,7 +21,7 @@ pub use summary::RedactionSummary; // Re-export data types from nvisy-ontology pub use nvisy_ontology::record::Redaction; pub use nvisy_ontology::record::{ReviewDecision, ReviewStatus}; -pub use nvisy_ontology::spec::{ +pub use nvisy_ontology::specification::{ AudioRedactionInput, ImageRedactionInput, RedactionInput, TextRedactionInput, DEFAULT_BLOCK_COLOR, DEFAULT_BLUR_SIGMA, DEFAULT_MASK_CHAR, DEFAULT_PIXELATE_BLOCK_SIZE, }; diff --git a/crates/nvisy-identify/src/policy/retention.rs b/crates/nvisy-identify/src/policy/retention.rs index 808ecd3..d7cc21d 100644 --- a/crates/nvisy-identify/src/policy/retention.rs +++ b/crates/nvisy-identify/src/policy/retention.rs @@ -3,11 +3,12 @@ use std::time::Duration; use serde::{Deserialize, Serialize}; +use strum::{Display, EnumString}; /// What class of data a retention policy applies to. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] -#[derive(schemars::JsonSchema)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Display, EnumString, Serialize, Deserialize, schemars::JsonSchema)] #[serde(rename_all = "snake_case")] +#[strum(serialize_all = "snake_case")] pub enum RetentionScope { /// Original ingested content before redaction. 
OriginalContent, @@ -18,8 +19,7 @@ pub enum RetentionScope { } /// A retention policy governing how long data is kept. -#[derive(Debug, Clone, Serialize, Deserialize)] -#[derive(schemars::JsonSchema)] +#[derive(Debug, Clone, Serialize, Deserialize, schemars::JsonSchema)] pub struct RetentionPolicy { /// What class of data this policy applies to. pub scope: RetentionScope, diff --git a/crates/nvisy-identify/src/policy/rule.rs b/crates/nvisy-identify/src/policy/rule.rs index 5d8b88e..8daa801 100644 --- a/crates/nvisy-identify/src/policy/rule.rs +++ b/crates/nvisy-identify/src/policy/rule.rs @@ -5,7 +5,7 @@ use strum::Display; use uuid::Uuid; use nvisy_core::fs::DocumentType; -use nvisy_ontology::spec::RedactionInput; +use nvisy_ontology::specification::RedactionInput; use crate::EntitySelector; diff --git a/crates/nvisy-identify/src/policy/types.rs b/crates/nvisy-identify/src/policy/types.rs index db36101..e959eb6 100644 --- a/crates/nvisy-identify/src/policy/types.rs +++ b/crates/nvisy-identify/src/policy/types.rs @@ -6,7 +6,7 @@ use uuid::Uuid; use super::rule::PolicyRule; use super::regulation::RegulationKind; -use nvisy_ontology::spec::RedactionInput; +use nvisy_ontology::specification::RedactionInput; /// A named redaction policy containing an ordered set of rules. 
#[derive(Debug, Clone, Serialize, Deserialize)] diff --git a/crates/nvisy-identify/src/prelude.rs b/crates/nvisy-identify/src/prelude.rs index 3a6e834..aa4f302 100644 --- a/crates/nvisy-identify/src/prelude.rs +++ b/crates/nvisy-identify/src/prelude.rs @@ -7,5 +7,7 @@ pub use crate::{ Annotation, AnnotationKind, AnnotationLabel, AnnotationScope, DetectionContext, ParallelContext, SequentialContext, DetectionLayer, DetectionService, - NerBackend, NerConfig, + NerMethod, NerMethodParams, + CvMethod, + PatternDetection, PatternDetectionParams, }; diff --git a/crates/nvisy-identify/src/text/mod.rs b/crates/nvisy-identify/src/text/mod.rs deleted file mode 100644 index b55134b..0000000 --- a/crates/nvisy-identify/src/text/mod.rs +++ /dev/null @@ -1,7 +0,0 @@ -//! Text detection layers. - -pub mod pattern; -pub mod ner; - -pub use pattern::{PatternDetection, PatternDetectionParams}; -pub use ner::{NerDetection, NerDetectionParams}; diff --git a/crates/nvisy-identify/src/text/ner.rs b/crates/nvisy-identify/src/text/ner.rs deleted file mode 100644 index 7bd3b4b..0000000 --- a/crates/nvisy-identify/src/text/ner.rs +++ /dev/null @@ -1,339 +0,0 @@ -//! AI-powered named-entity recognition (NER) detection layer. -//! -//! Uses a [`SequentialContext`] so the orchestrator feeds one span at -//! a time, allowing the layer to accumulate prior text/entities -//! between spans via interior mutability. - -use serde::Deserialize; -use tokio::sync::Mutex; - -use nvisy_codec::handler::{Span, TxtSpan}; -use nvisy_ontology::entity::EntityKind; -use nvisy_core::Error; - -use crate::ner::{NerBackend, NerConfig, parse_ner_entities}; -use crate::{Entity, Location, ModelInfo, TextLocation}; -use crate::{SequentialContext, DetectionService}; - -fn default_confidence() -> f64 { - 0.5 -} - -/// Typed parameters for [`NerDetection`]. -#[derive(Debug, Deserialize)] -#[serde(rename_all = "camelCase")] -pub struct NerDetectionParams { - /// Entity kinds to detect (empty = all). 
- #[serde(rename = "entityTypes", default)] - pub entity_kinds: Vec, - /// Minimum confidence score for returned entities. - #[serde(default = "default_confidence")] - pub confidence_threshold: f64, - /// Optional model info to attach to every NER-produced entity. - #[serde(skip)] - pub model_info: Option, -} - -/// Accumulated state between sequential span calls. -struct NerState { - /// Text from previously processed spans (for sliding context). - prior_text: String, -} - -/// AI NER detection layer — delegates to an [`NerBackend`] at runtime. -/// -/// Uses [`SequentialContext`]: the orchestrator feeds one span at a -/// time so the layer can carry sliding context between spans. -pub struct NerDetection { - backend: B, - config: NerConfig, - model_info: Option, - state: Mutex, -} - -impl NerDetection { - /// Create a new detection layer with the given backend and params. - pub fn new(backend: B, params: NerDetectionParams) -> Self { - let config = NerConfig { - entity_types: params.entity_kinds.iter().map(|ek| ek.to_string()).collect(), - confidence_threshold: params.confidence_threshold, - }; - Self { - backend, - config, - model_info: params.model_info, - state: Mutex::new(NerState { - prior_text: String::new(), - }), - } - } - - /// Clear accumulated state between documents. - pub async fn reset(&self) { - let mut state = self.state.lock().await; - state.prior_text.clear(); - } -} - -#[async_trait::async_trait] -impl DetectionService for NerDetection { - type Context = SequentialContext; - - async fn detect( - &self, - spans: Vec>, - ) -> Result, Error> { - let mut entities = Vec::new(); - - for span in &spans { - // Build the full text with prior context prepended. 
- let (full_text, context_len) = { - let state = self.state.lock().await; - if state.prior_text.is_empty() { - (span.data.clone(), 0) - } else { - let sep = "\n"; - let context_len = state.prior_text.len() + sep.len(); - let full = format!("{}{}{}", state.prior_text, sep, span.data); - (full, context_len) - } - }; - - let raw = self - .backend - .detect_text(&full_text, &self.config) - .await?; - - // Filter entities to the current span and adjust offsets. - let span_len = span.data.len(); - for mut e in parse_ner_entities(&raw)? { - if let Some(Location::Text(ref loc)) = e.location { - // Skip entities that fall entirely within the prior context. - if loc.end_offset <= context_len { - continue; - } - // Skip entities that start before the current span. - if loc.start_offset < context_len { - continue; - } - // Skip entities that extend beyond the current span. - if loc.start_offset - context_len >= span_len { - continue; - } - // Adjust offsets to be relative to the current span. - e.location = Some(Location::Text(TextLocation { - start_offset: loc.start_offset - context_len, - end_offset: loc.end_offset - context_len, - element_id: Some(span.id.0.to_string()), - ..Default::default() - })); - } else { - // Non-text entity: set element_id via a new text location. - e.location = Some(Location::Text(TextLocation { - element_id: Some(span.id.0.to_string()), - ..Default::default() - })); - } - - // Attach model info if provided. - if let Some(ref model) = self.model_info { - e.model = Some(model.clone()); - } - - entities.push(e.with_parent(&span.source)); - } - - // Accumulate text for sliding context. 
- let mut state = self.state.lock().await; - if !state.prior_text.is_empty() { - state.prior_text.push('\n'); - } - state.prior_text.push_str(&span.data); - } - - Ok(entities) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use serde_json::{json, Value}; - - #[test] - fn parse_ner_entities_basic() { - let raw = vec![json!({ - "category": "pii", - "entity_type": "person_name", - "value": "John Doe", - "confidence": 0.95, - "start_offset": 10, - "end_offset": 18 - })]; - let entities = parse_ner_entities(&raw).unwrap(); - assert_eq!(entities.len(), 1); - assert_eq!(entities[0].value, "John Doe"); - assert_eq!(entities[0].entity_kind, EntityKind::PersonName); - let loc = entities[0].location.as_ref().unwrap().as_text().unwrap(); - assert_eq!(loc.start_offset, 10); - assert_eq!(loc.end_offset, 18); - } - - #[test] - fn parse_ner_entities_sets_element_id_none_by_default() { - let raw = vec![json!({ - "category": "pii", - "entity_type": "email_address", - "value": "a@b.com", - "confidence": 0.9, - "start_offset": 0, - "end_offset": 7 - })]; - let entities = parse_ner_entities(&raw).unwrap(); - let loc = entities[0].location.as_ref().unwrap().as_text().unwrap(); - assert!(loc.element_id.is_none()); - } - - /// Mock NER backend that returns entities relative to the full text it receives. - struct MockNerBackend; - - #[async_trait::async_trait] - impl NerBackend for MockNerBackend { - async fn detect_text( - &self, - text: &str, - _config: &NerConfig, - ) -> Result, Error> { - // Find "ENTITY" in the text and report its position. 
- let mut results = Vec::new(); - if let Some(pos) = text.find("ENTITY") { - results.push(json!({ - "category": "pii", - "entity_type": "person_name", - "value": "ENTITY", - "confidence": 0.95, - "start_offset": pos, - "end_offset": pos + 6 - })); - } - Ok(results) - } - - async fn detect_image( - &self, - _image_data: &[u8], - _mime_type: &str, - _config: &NerConfig, - ) -> Result, Error> { - Ok(Vec::new()) - } - } - - #[tokio::test] - async fn sliding_context_prepended_and_offsets_adjusted() { - let params = NerDetectionParams { - entity_kinds: vec![], - confidence_threshold: 0.0, - model_info: None, - }; - let ner = NerDetection::new(MockNerBackend, params); - - // First span: no entity, just context. - let span1 = vec![Span::new(TxtSpan(0), "some context text".into())]; - let result1 = ner.detect(span1).await.unwrap(); - assert!(result1.is_empty()); - - // Second span: entity in current span. Backend sees prior + current. - let span2 = vec![Span::new(TxtSpan(1), "has ENTITY here".into())]; - let result2 = ner.detect(span2).await.unwrap(); - assert_eq!(result2.len(), 1); - - // Offsets should be adjusted to current span (relative). 
- let loc = result2[0].location.as_ref().unwrap().as_text().unwrap(); - assert_eq!(loc.start_offset, 4); // "has " = 4 chars - assert_eq!(loc.end_offset, 10); // "has ENTITY" = 10 chars - assert_eq!(loc.element_id.as_deref(), Some("1")); - } - - #[tokio::test] - async fn element_id_set_from_span() { - let params = NerDetectionParams { - entity_kinds: vec![], - confidence_threshold: 0.0, - model_info: None, - }; - let ner = NerDetection::new(MockNerBackend, params); - - let spans = vec![Span::new(TxtSpan(42), "ENTITY".into())]; - let entities = ner.detect(spans).await.unwrap(); - assert_eq!(entities.len(), 1); - let loc = entities[0].location.as_ref().unwrap().as_text().unwrap(); - assert_eq!(loc.element_id.as_deref(), Some("42")); - } - - #[tokio::test] - async fn model_info_attached_when_provided() { - let model = ModelInfo { - name: "test-model".into(), - kind: crate::ModelKind::OpenSource, - version: "1.0".into(), - }; - let params = NerDetectionParams { - entity_kinds: vec![], - confidence_threshold: 0.0, - model_info: Some(model.clone()), - }; - let ner = NerDetection::new(MockNerBackend, params); - - let spans = vec![Span::new(TxtSpan(0), "ENTITY".into())]; - let entities = ner.detect(spans).await.unwrap(); - assert_eq!(entities.len(), 1); - assert_eq!(entities[0].model.as_ref().unwrap().name, "test-model"); - } - - #[tokio::test] - async fn entities_in_prior_context_are_filtered_out() { - // Backend that always returns an entity at position 0..6. 
- struct AlwaysFirstBackend; - - #[async_trait::async_trait] - impl NerBackend for AlwaysFirstBackend { - async fn detect_text( - &self, - _text: &str, - _config: &NerConfig, - ) -> Result, Error> { - Ok(vec![json!({ - "category": "pii", - "entity_type": "person_name", - "value": "ENTITY", - "confidence": 0.95, - "start_offset": 0, - "end_offset": 6 - })]) - } - - async fn detect_image( - &self, - _: &[u8], _: &str, _: &NerConfig, - ) -> Result, Error> { - Ok(Vec::new()) - } - } - - let params = NerDetectionParams { - entity_kinds: vec![], - confidence_threshold: 0.0, - model_info: None, - }; - let ner = NerDetection::new(AlwaysFirstBackend, params); - - // First span — entity at 0..6 in current span: should be included. - let result1 = ner.detect(vec![Span::new(TxtSpan(0), "ENTITY here".into())]).await.unwrap(); - assert_eq!(result1.len(), 1); - - // Second span — entity at 0..6 is now in the prior context, should be filtered. - let result2 = ner.detect(vec![Span::new(TxtSpan(1), "no entity".into())]).await.unwrap(); - assert!(result2.is_empty()); - } -} diff --git a/crates/nvisy-object/Cargo.toml b/crates/nvisy-object/Cargo.toml deleted file mode 100644 index c642375..0000000 --- a/crates/nvisy-object/Cargo.toml +++ /dev/null @@ -1,46 +0,0 @@ -# https://doc.rust-lang.org/cargo/reference/manifest.html - -[package] -name = "nvisy-object" -description = "Object store providers and streams (S3, Azure, GCS) for Nvisy" -keywords = ["nvisy", "object-store", "s3", "storage"] -categories = ["filesystem"] - -version = { workspace = true } -rust-version = { workspace = true } -edition = { workspace = true } -license = { workspace = true } -publish = { workspace = true } - -authors = { workspace = true } -repository = { workspace = true } -homepage = { workspace = true } -documentation = { workspace = true } - -[package.metadata.docs.rs] -all-features = true -rustdoc-args = ["--cfg", "docsrs"] - -[dependencies] -# Internal crates -nvisy-core = { workspace = true, features = 
[] } - -# (De)serialization -serde = { workspace = true, features = ["derive"] } - -# Async runtime -tokio = { workspace = true, features = ["sync"] } -async-trait = { workspace = true, features = [] } -futures = { workspace = true, features = [] } - -# Primitive datatypes -bytes = { workspace = true, features = [] } - -# Cloud object storage (S3, Azure Blob, GCS) -object_store = { workspace = true, features = ["aws", "azure", "gcp"] } - -# Observability -tracing = { workspace = true, features = [] } - -[dev-dependencies] -tokio = { workspace = true, features = ["macros", "rt"] } diff --git a/crates/nvisy-object/src/client/get_output.rs b/crates/nvisy-object/src/client/get_output.rs deleted file mode 100644 index 6546f2a..0000000 --- a/crates/nvisy-object/src/client/get_output.rs +++ /dev/null @@ -1,15 +0,0 @@ -//! Result type for [`ObjectStoreClient::get`](super::ObjectStoreClient::get). - -use bytes::Bytes; -use object_store::ObjectMeta; - -/// Result of a successful [`ObjectStoreClient::get`](super::ObjectStoreClient::get) call. -#[derive(Debug)] -pub struct GetOutput { - /// Raw bytes of the retrieved object. - pub data: Bytes, - /// MIME content-type, if the backend provides one. - pub content_type: Option, - /// Object metadata (size, etag, last_modified, location). - pub meta: ObjectMeta, -} diff --git a/crates/nvisy-object/src/client/mod.rs b/crates/nvisy-object/src/client/mod.rs deleted file mode 100644 index 7419197..0000000 --- a/crates/nvisy-object/src/client/mod.rs +++ /dev/null @@ -1,340 +0,0 @@ -//! Unified object-store client backed by [`object_store::ObjectStore`]. -//! -//! [`ObjectStoreClient`] is a thin, cloneable wrapper around -//! `Arc` that provides convenience methods for the most -//! common operations. Every public method is instrumented with -//! [`tracing`] for observability. 
- -use std::sync::Arc; - -use bytes::Bytes; -use futures::stream::BoxStream; -use futures::TryStreamExt; -use object_store::path::Path; -use object_store::{ObjectMeta, ObjectStore, ObjectStoreExt, PutMode, PutOptions, PutPayload}; - -use nvisy_core::Error; - -mod get_output; -mod put_output; - -pub use get_output::GetOutput; -pub use put_output::PutOutput; - -/// Cloneable handle to any [`ObjectStore`] backend (S3, Azure, GCS, ...). -/// -/// All methods accept human-readable string keys and convert them to -/// [`object_store::path::Path`] internally. -#[derive(Clone, Debug)] -pub struct ObjectStoreClient(pub Arc); - -impl ObjectStoreClient { - /// Wrap a concrete [`ObjectStore`] implementation. - pub fn new(store: impl ObjectStore) -> Self { - Self(Arc::new(store)) - } - - /// Verify that the backing store is reachable. - /// - /// Issues a HEAD for a probe key — a not-found response is treated as - /// success (the bucket/container exists), any other error is propagated. - #[tracing::instrument(name = "object.verify", skip(self))] - pub async fn verify_reachable(&self) -> Result<(), Error> { - let path = Path::from("_nvisy_verify_probe"); - match self.0.head(&path).await { - Ok(_) => Ok(()), - Err(object_store::Error::NotFound { .. }) => Ok(()), - Err(e) => Err(from_object_store(e)), - } - } - - /// List object keys under `prefix`. - /// - /// Returns all matching keys in a single `Vec`. For lazy iteration, - /// use [`list_stream`](Self::list_stream) instead. - #[tracing::instrument(name = "object.list", skip(self), fields(prefix))] - pub async fn list( - &self, - prefix: &str, - ) -> Result, Error> { - let prefix = if prefix.is_empty() { - None - } else { - Some(Path::from(prefix)) - }; - self.0 - .list(prefix.as_ref()) - .try_collect() - .await - .map_err(from_object_store) - } - - /// Lazily stream object metadata under `prefix`. 
- #[tracing::instrument(name = "object.list_stream", skip(self), fields(prefix))] - pub fn list_stream( - &self, - prefix: &str, - ) -> BoxStream<'_, Result> { - let prefix = if prefix.is_empty() { - None - } else { - Some(Path::from(prefix)) - }; - Box::pin(self.0.list(prefix.as_ref()).map_err(from_object_store)) - } - - /// Retrieve the raw bytes, content-type, and metadata stored at `key`. - #[tracing::instrument(name = "object.get", skip(self), fields(key))] - pub async fn get(&self, key: &str) -> Result { - let path = Path::from(key); - let result = self.0.get(&path).await.map_err(from_object_store)?; - let meta = result.meta.clone(); - let content_type = result - .attributes - .get(&object_store::Attribute::ContentType) - .map(|v| v.to_string()); - let data = result.bytes().await.map_err(from_object_store)?; - Ok(GetOutput { - data, - content_type, - meta, - }) - } - - /// Upload `data` to `key`, optionally setting the content-type. - pub async fn put( - &self, - key: &str, - data: Bytes, - content_type: Option<&str>, - ) -> Result { - self.put_opts(key, data, PutMode::Overwrite, content_type).await - } - - /// Upload `data` to `key` with the specified [`PutMode`]. - #[tracing::instrument(name = "object.put_opts", skip(self, data), fields(key, size = data.len()))] - pub async fn put_opts( - &self, - key: &str, - data: Bytes, - mode: PutMode, - content_type: Option<&str>, - ) -> Result { - let path = Path::from(key); - let payload = PutPayload::from(data); - let mut opts = PutOptions { - mode, - ..Default::default() - }; - if let Some(ct) = content_type { - opts.attributes.insert( - object_store::Attribute::ContentType, - ct.to_string().into(), - ); - } - let result = self - .0 - .put_opts(&path, payload, opts) - .await - .map_err(from_object_store)?; - Ok(result.into()) - } - - /// Get object metadata without downloading the body. 
- #[tracing::instrument(name = "object.head", skip(self), fields(key))] - pub async fn head(&self, key: &str) -> Result { - let path = Path::from(key); - self.0.head(&path).await.map_err(from_object_store) - } - - /// Delete the object at `key`. - #[tracing::instrument(name = "object.delete", skip(self), fields(key))] - pub async fn delete(&self, key: &str) -> Result<(), Error> { - let path = Path::from(key); - self.0.delete(&path).await.map_err(from_object_store) - } - - /// Copy an object from `src` to `dst` within the same store. - #[tracing::instrument(name = "object.copy", skip(self), fields(src, dst))] - pub async fn copy(&self, src: &str, dst: &str) -> Result<(), Error> { - let from = Path::from(src); - let to = Path::from(dst); - self.0.copy(&from, &to).await.map_err(from_object_store) - } -} - -/// Convert an [`object_store::Error`] into a [`nvisy_core::Error`]. -fn from_object_store(err: object_store::Error) -> Error { - let retryable = !matches!( - err, - object_store::Error::NotFound { .. } - | object_store::Error::PermissionDenied { .. } - | object_store::Error::Unauthenticated { .. } - | object_store::Error::AlreadyExists { .. } - | object_store::Error::Precondition { .. 
} - ); - Error::runtime(err.to_string(), "object-store", retryable) - .with_source(err) -} - -#[cfg(test)] -mod tests { - use super::*; - use object_store::memory::InMemory; - - fn test_client() -> ObjectStoreClient { - ObjectStoreClient::new(InMemory::new()) - } - - #[tokio::test] - async fn put_and_get() { - let client = test_client(); - let data = Bytes::from("hello world"); - client - .put("test.txt", data.clone(), Some("text/plain")) - .await - .unwrap(); - - let result = client.get("test.txt").await.unwrap(); - assert_eq!(result.data, data); - assert_eq!(result.content_type.as_deref(), Some("text/plain")); - } - - #[tokio::test] - async fn get_returns_meta() { - let client = test_client(); - let data = Bytes::from("abc"); - client.put("meta.bin", data, None).await.unwrap(); - - let result = client.get("meta.bin").await.unwrap(); - assert_eq!(result.meta.size as usize, 3); - assert_eq!(result.meta.location, Path::from("meta.bin")); - } - - #[tokio::test] - async fn put_returns_result() { - let client = test_client(); - let result = client - .put("etag.bin", Bytes::from("x"), None) - .await - .unwrap(); - assert!(result.e_tag.is_some()); - } - - #[tokio::test] - async fn head() { - let client = test_client(); - client - .put("head.bin", Bytes::from("data"), None) - .await - .unwrap(); - - let meta = client.head("head.bin").await.unwrap(); - assert_eq!(meta.size, 4); - assert_eq!(meta.location, Path::from("head.bin")); - } - - #[tokio::test] - async fn head_not_found() { - let client = test_client(); - let err = client.head("missing").await.unwrap_err(); - assert!(!err.is_retryable()); - } - - #[tokio::test] - async fn delete() { - let client = test_client(); - client - .put("del.bin", Bytes::from("x"), None) - .await - .unwrap(); - client.delete("del.bin").await.unwrap(); - - assert!(client.get("del.bin").await.is_err()); - } - - #[tokio::test] - async fn copy() { - let client = test_client(); - let data = Bytes::from("copy me"); - client.put("src.bin", 
data.clone(), None).await.unwrap(); - client.copy("src.bin", "dst.bin").await.unwrap(); - - let result = client.get("dst.bin").await.unwrap(); - assert_eq!(result.data, data); - } - - #[tokio::test] - async fn list() { - let client = test_client(); - for i in 0..3 { - client - .put( - &format!("dir/file{i}.txt"), - Bytes::from(format!("{i}")), - None, - ) - .await - .unwrap(); - } - - let items = client.list("dir/").await.unwrap(); - assert_eq!(items.len(), 3); - } - - #[tokio::test] - async fn list_stream() { - use futures::StreamExt; - let client = test_client(); - for i in 0..3 { - client - .put( - &format!("stream/f{i}.bin"), - Bytes::from(format!("{i}")), - None, - ) - .await - .unwrap(); - } - - let items: Vec<_> = client - .list_stream("stream/") - .collect::>() - .await - .into_iter() - .collect::, _>>() - .unwrap(); - assert_eq!(items.len(), 3); - } - - #[tokio::test] - async fn put_create_only() { - let client = test_client(); - client - .put_opts( - "unique.bin", - Bytes::from("first"), - PutMode::Create, - None, - ) - .await - .unwrap(); - - let err = client - .put_opts( - "unique.bin", - Bytes::from("second"), - PutMode::Create, - None, - ) - .await - .unwrap_err(); - assert!(!err.is_retryable()); - } - - #[tokio::test] - async fn verify_reachable() { - let client = test_client(); - client.verify_reachable().await.unwrap(); - } -} diff --git a/crates/nvisy-object/src/client/put_output.rs b/crates/nvisy-object/src/client/put_output.rs deleted file mode 100644 index 2550eee..0000000 --- a/crates/nvisy-object/src/client/put_output.rs +++ /dev/null @@ -1,20 +0,0 @@ -//! Result type for [`ObjectStoreClient::put`](super::ObjectStoreClient::put) and -//! [`ObjectStoreClient::put_opts`](super::ObjectStoreClient::put_opts). - -/// Result of a successful put operation. -#[derive(Debug)] -pub struct PutOutput { - /// Unique identifier for the newly created object, if the backend provides one. 
- pub e_tag: Option, - /// A version indicator for the newly created object, if the backend provides one. - pub version: Option, -} - -impl From for PutOutput { - fn from(r: object_store::PutResult) -> Self { - Self { - e_tag: r.e_tag, - version: r.version, - } - } -} diff --git a/crates/nvisy-object/src/lib.rs b/crates/nvisy-object/src/lib.rs deleted file mode 100644 index fb0a72c..0000000 --- a/crates/nvisy-object/src/lib.rs +++ /dev/null @@ -1,12 +0,0 @@ -#![forbid(unsafe_code)] -#![cfg_attr(docsrs, feature(doc_cfg))] -#![doc = include_str!("../README.md")] - -pub mod client; -/// Provider trait and object storage provider factories. -pub mod providers; -/// Streaming traits and object store adapters. -pub mod streams; - -#[doc(hidden)] -pub mod prelude; diff --git a/crates/nvisy-object/src/prelude.rs b/crates/nvisy-object/src/prelude.rs deleted file mode 100644 index f2936a3..0000000 --- a/crates/nvisy-object/src/prelude.rs +++ /dev/null @@ -1,8 +0,0 @@ -//! Convenience re-exports. - -pub use crate::providers::Provider; -pub use crate::streams::{StreamSource, StreamTarget}; - -pub use crate::client::{GetOutput, ObjectStoreClient, PutOutput}; -pub use crate::providers::{AzureProvider, GcsProvider, S3Provider}; -pub use crate::streams::{ObjectReadStream, ObjectWriteStream}; diff --git a/crates/nvisy-object/src/providers/azure.rs b/crates/nvisy-object/src/providers/azure.rs deleted file mode 100644 index dfe4a89..0000000 --- a/crates/nvisy-object/src/providers/azure.rs +++ /dev/null @@ -1,76 +0,0 @@ -//! Azure Blob Storage provider using [`object_store::azure::MicrosoftAzureBuilder`]. - -use object_store::azure::MicrosoftAzureBuilder; -use serde::Deserialize; - -use nvisy_core::Error; -use super::Provider; - -use crate::client::ObjectStoreClient; - -/// Typed credentials for Azure Blob Storage. -#[derive(Debug, Deserialize)] -#[serde(rename_all = "camelCase")] -pub struct AzureCredentials { - /// Azure storage container name. 
- pub container: String, - /// Azure storage account name. - pub account_name: String, - /// Storage account access key. - #[serde(default)] - pub access_key: Option, - /// Shared Access Signature token. - #[serde(default)] - pub sas_token: Option, - /// Custom endpoint URL (for Azure Stack or Azurite). - #[serde(default)] - pub endpoint: Option, -} - -/// Factory that creates [`ObjectStoreClient`] instances backed by Azure Blob Storage. -pub struct AzureProvider; - -#[async_trait::async_trait] -impl Provider for AzureProvider { - type Credentials = AzureCredentials; - type Client = ObjectStoreClient; - - const ID: &str = "azure"; - - async fn verify(creds: &Self::Credentials) -> Result<(), Error> { - let client = Self::connect(creds).await?; - client.verify_reachable().await - } - - async fn connect(creds: &Self::Credentials) -> Result { - let mut builder = MicrosoftAzureBuilder::new() - .with_container_name(&creds.container) - .with_account(&creds.account_name); - - if let Some(key) = &creds.access_key { - builder = builder.with_access_key(key); - } - - if let Some(sas) = &creds.sas_token { - let pairs: Vec<(String, String)> = sas - .trim_start_matches('?') - .split('&') - .filter_map(|pair| { - let mut parts = pair.splitn(2, '='); - Some((parts.next()?.to_string(), parts.next().unwrap_or("").to_string())) - }) - .collect(); - builder = builder.with_sas_authorization(pairs); - } - - if let Some(endpoint) = &creds.endpoint { - builder = builder.with_endpoint(endpoint.clone()); - } - - let store = builder - .build() - .map_err(|e| Error::connection(e.to_string(), "azure", true))?; - - Ok(ObjectStoreClient::new(store)) - } -} diff --git a/crates/nvisy-object/src/providers/gcs.rs b/crates/nvisy-object/src/providers/gcs.rs deleted file mode 100644 index 8002931..0000000 --- a/crates/nvisy-object/src/providers/gcs.rs +++ /dev/null @@ -1,58 +0,0 @@ -//! Google Cloud Storage provider using [`object_store::gcp::GoogleCloudStorageBuilder`]. 
- -use object_store::gcp::GoogleCloudStorageBuilder; -use serde::Deserialize; - -use nvisy_core::Error; -use super::Provider; - -use crate::client::ObjectStoreClient; - -/// Typed credentials for Google Cloud Storage. -#[derive(Debug, Deserialize)] -#[serde(rename_all = "camelCase")] -pub struct GcsCredentials { - /// GCS bucket name. - pub bucket: String, - /// Path to a JSON service account key file. - #[serde(default)] - pub service_account_key: Option, - /// Custom endpoint URL (for testing with a fake GCS server). - #[serde(default)] - pub endpoint: Option, -} - -/// Factory that creates [`ObjectStoreClient`] instances backed by Google Cloud Storage. -pub struct GcsProvider; - -#[async_trait::async_trait] -impl Provider for GcsProvider { - type Credentials = GcsCredentials; - type Client = ObjectStoreClient; - - const ID: &str = "gcs"; - - async fn verify(creds: &Self::Credentials) -> Result<(), Error> { - let client = Self::connect(creds).await?; - client.verify_reachable().await - } - - async fn connect(creds: &Self::Credentials) -> Result { - let mut builder = - GoogleCloudStorageBuilder::new().with_bucket_name(&creds.bucket); - - if let Some(key_path) = &creds.service_account_key { - builder = builder.with_service_account_key(key_path); - } - - if let Some(endpoint) = &creds.endpoint { - builder = builder.with_url(endpoint); - } - - let store = builder - .build() - .map_err(|e| Error::connection(e.to_string(), "gcs", true))?; - - Ok(ObjectStoreClient::new(store)) - } -} diff --git a/crates/nvisy-object/src/providers/mod.rs b/crates/nvisy-object/src/providers/mod.rs deleted file mode 100644 index fe7d0e6..0000000 --- a/crates/nvisy-object/src/providers/mod.rs +++ /dev/null @@ -1,11 +0,0 @@ -//! Provider trait and object storage provider factories. 
- -mod provider; -mod azure; -mod gcs; -mod s3; - -pub use provider::Provider; -pub use azure::{AzureCredentials, AzureProvider}; -pub use gcs::{GcsCredentials, GcsProvider}; -pub use s3::{S3Credentials, S3Provider}; diff --git a/crates/nvisy-object/src/providers/provider.rs b/crates/nvisy-object/src/providers/provider.rs deleted file mode 100644 index 3ed9441..0000000 --- a/crates/nvisy-object/src/providers/provider.rs +++ /dev/null @@ -1,38 +0,0 @@ -//! Provider trait for creating authenticated client connections. - -use std::future::Future; -use std::pin::Pin; - -use serde::de::DeserializeOwned; - -use nvisy_core::Error; - -/// Factory for creating authenticated connections to an external service. -/// -/// Implementations handle credential validation, connectivity verification, -/// and client construction for a specific provider (e.g. S3, OpenAI). -#[async_trait::async_trait] -pub trait Provider: Send + Sync + 'static { - /// Strongly-typed credentials for this provider. - type Credentials: DeserializeOwned + Send; - /// The client type produced by [`connect`](Self::connect). - type Client: Send + 'static; - - /// Unique identifier (e.g. "s3", "openai"). - const ID: &str; - - /// Verify credentials by attempting a lightweight connection. - async fn verify(creds: &Self::Credentials) -> Result<(), Error>; - - /// Create a connected client instance. - async fn connect(creds: &Self::Credentials) -> Result; - - /// Optional async cleanup when the connection is released. - /// - /// Return `None` if no cleanup is needed. The default implementation - /// returns `None`. - #[allow(clippy::type_complexity)] - fn disconnect(_client: Self::Client) -> Option + Send>>> { - None - } -} diff --git a/crates/nvisy-object/src/providers/s3.rs b/crates/nvisy-object/src/providers/s3.rs deleted file mode 100644 index 66ab78d..0000000 --- a/crates/nvisy-object/src/providers/s3.rs +++ /dev/null @@ -1,86 +0,0 @@ -//! S3-compatible provider using [`object_store::aws::AmazonS3Builder`]. 
-//! -//! Works with AWS S3, MinIO, and any S3-compatible service. - -use object_store::aws::AmazonS3Builder; -use serde::Deserialize; - -use nvisy_core::Error; -use super::Provider; - -use crate::client::ObjectStoreClient; - -/// Typed credentials for S3-compatible provider. -#[derive(Debug, Deserialize)] -#[serde(rename_all = "camelCase")] -pub struct S3Credentials { - /// S3 bucket name. - pub bucket: String, - /// AWS region (defaults to `us-east-1`). - #[serde(default = "default_region")] - pub region: String, - /// Endpoint URL (e.g. `http://localhost:9000` for MinIO). - /// Required for non-AWS S3-compatible services. - #[serde(default)] - pub endpoint: Option, - /// Access key ID for static credentials. - #[serde(default)] - pub access_key_id: Option, - /// Secret access key for static credentials. - #[serde(default)] - pub secret_access_key: Option, - /// Session token for temporary credentials. - #[serde(default)] - pub session_token: Option, -} - -fn default_region() -> String { - "us-east-1".to_string() -} - -/// Factory that creates [`ObjectStoreClient`] instances backed by S3. 
-pub struct S3Provider; - -#[async_trait::async_trait] -impl Provider for S3Provider { - type Credentials = S3Credentials; - type Client = ObjectStoreClient; - - const ID: &str = "s3"; - - async fn verify(creds: &Self::Credentials) -> Result<(), Error> { - let client = Self::connect(creds).await?; - client.verify_reachable().await - } - - async fn connect(creds: &Self::Credentials) -> Result { - let mut builder = AmazonS3Builder::new() - .with_bucket_name(&creds.bucket) - .with_region(&creds.region); - - if let Some(endpoint) = &creds.endpoint { - builder = builder.with_endpoint(endpoint); - if endpoint.starts_with("http://") { - builder = builder.with_allow_http(true); - } - } - - if let Some(access_key) = &creds.access_key_id { - builder = builder.with_access_key_id(access_key); - } - - if let Some(secret_key) = &creds.secret_access_key { - builder = builder.with_secret_access_key(secret_key); - } - - if let Some(token) = &creds.session_token { - builder = builder.with_token(token); - } - - let store = builder - .build() - .map_err(|e| Error::connection(e.to_string(), "s3", true))?; - - Ok(ObjectStoreClient::new(store)) - } -} diff --git a/crates/nvisy-object/src/streams/mod.rs b/crates/nvisy-object/src/streams/mod.rs deleted file mode 100644 index 706d836..0000000 --- a/crates/nvisy-object/src/streams/mod.rs +++ /dev/null @@ -1,11 +0,0 @@ -//! Streaming traits and object store adapters. - -mod source_stream; -mod target_stream; -mod read_object; -mod write_object; - -pub use source_stream::StreamSource; -pub use target_stream::StreamTarget; -pub use read_object::{ObjectReadStream, ObjectReadParams}; -pub use write_object::{ObjectWriteStream, ObjectWriteParams}; diff --git a/crates/nvisy-object/src/streams/read_object.rs b/crates/nvisy-object/src/streams/read_object.rs deleted file mode 100644 index 46bae18..0000000 --- a/crates/nvisy-object/src/streams/read_object.rs +++ /dev/null @@ -1,147 +0,0 @@ -//! 
Streaming reader that pulls objects from a cloud object store. - -use futures::StreamExt; -use serde::Deserialize; -use tokio::sync::mpsc; - -use nvisy_core::Error; -use nvisy_core::io::ContentData; -use nvisy_core::path::ContentSource; - -use super::StreamSource; - -use crate::client::ObjectStoreClient; - -/// Typed parameters for [`ObjectReadStream`]. -#[derive(Debug, Deserialize)] -#[serde(rename_all = "camelCase")] -pub struct ObjectReadParams { - /// Object key prefix to filter by. - #[serde(default)] - pub prefix: String, - /// Skip objects whose size exceeds this limit (in bytes). - #[serde(default)] - pub max_size: Option, -} - -/// A [`StreamSource`] that lists and fetches objects from a cloud object store, -/// emitting each object as a [`ContentData`] onto the output channel. -pub struct ObjectReadStream; - -#[async_trait::async_trait] -impl StreamSource for ObjectReadStream { - type Params = ObjectReadParams; - type Client = ObjectStoreClient; - - fn id(&self) -> &str { "read" } - - #[tracing::instrument(name = "object.read", skip_all, fields(prefix = %params.prefix, count))] - async fn read( - &self, - output: mpsc::Sender, - params: Self::Params, - client: Self::Client, - ) -> Result { - let mut stream = client.list_stream(¶ms.prefix); - let mut total = 0u64; - - while let Some(result) = stream.next().await { - let meta = result?; - let key = meta.location.as_ref(); - - if let Some(max) = params.max_size - && meta.size > max - { - tracing::debug!(key, size = meta.size, max_size = max, "skipping oversized object"); - continue; - } - - let source = ContentSource::new(); - tracing::debug!(key, source_id = %source, "fetching object"); - - let result = client.get(key).await?; - - let mut content = ContentData::new(source, result.data); - if let Some(ct) = result.content_type { - content = content.with_content_type(ct); - } - - total += 1; - if output.send(content).await.is_err() { - break; - } - } - - tracing::Span::current().record("count", total); - 
Ok(total) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use bytes::Bytes; - use object_store::memory::InMemory; - - fn test_client() -> ObjectStoreClient { - ObjectStoreClient::new(InMemory::new()) - } - - #[tokio::test] - async fn read_emits_all_objects() { - let client = test_client(); - for i in 0..3 { - client - .put( - &format!("data/file{i}.txt"), - Bytes::from(format!("content-{i}")), - Some("text/plain"), - ) - .await - .unwrap(); - } - - let (tx, mut rx) = mpsc::channel(16); - let stream = ObjectReadStream; - let params = ObjectReadParams { - prefix: "data/".to_string(), - max_size: None, - }; - - let count = stream.read(tx, params, client).await.unwrap(); - assert_eq!(count, 3); - - let mut items = Vec::new(); - while let Some(item) = rx.recv().await { - items.push(item); - } - assert_eq!(items.len(), 3); - } - - #[tokio::test] - async fn read_max_size_filter() { - let client = test_client(); - client - .put("filter/small.bin", Bytes::from("hi"), None) - .await - .unwrap(); - client - .put("filter/big.bin", Bytes::from("this is a much bigger payload"), None) - .await - .unwrap(); - - let (tx, mut rx) = mpsc::channel(16); - let stream = ObjectReadStream; - let params = ObjectReadParams { - prefix: "filter/".to_string(), - max_size: Some(10), - }; - - let count = stream.read(tx, params, client).await.unwrap(); - assert_eq!(count, 1); - - let item = rx.recv().await.unwrap(); - assert_eq!(item.as_bytes(), b"hi"); - assert!(rx.recv().await.is_none()); - } -} diff --git a/crates/nvisy-object/src/streams/source_stream.rs b/crates/nvisy-object/src/streams/source_stream.rs deleted file mode 100644 index 51fe26c..0000000 --- a/crates/nvisy-object/src/streams/source_stream.rs +++ /dev/null @@ -1,34 +0,0 @@ -//! Streaming source trait for pipeline input. -//! -//! [`StreamSource`] reads content from an external system into the pipeline. 
- -use serde::de::DeserializeOwned; -use tokio::sync::mpsc; - -use nvisy_core::Error; -use nvisy_core::io::ContentData; - -/// A source stream that reads content from an external system into the pipeline. -/// -/// Implementations connect to a storage backend (e.g. S3, local filesystem) -/// and emit content data into the pipeline's input channel. -#[async_trait::async_trait] -pub trait StreamSource: Send + Sync + 'static { - /// Strongly-typed parameters for this stream source. - type Params: DeserializeOwned + Send; - /// The client type this stream requires. - type Client: Send + 'static; - - /// Unique identifier for this stream source (e.g. `"read"`). - fn id(&self) -> &str; - - /// Read content from the external system and send it to `output`. - /// - /// Returns the number of items read. - async fn read( - &self, - output: mpsc::Sender, - params: Self::Params, - client: Self::Client, - ) -> Result; -} diff --git a/crates/nvisy-object/src/streams/target_stream.rs b/crates/nvisy-object/src/streams/target_stream.rs deleted file mode 100644 index efc92d0..0000000 --- a/crates/nvisy-object/src/streams/target_stream.rs +++ /dev/null @@ -1,34 +0,0 @@ -//! Streaming target trait for pipeline output. -//! -//! [`StreamTarget`] writes processed content back to an external system. - -use serde::de::DeserializeOwned; -use tokio::sync::mpsc; - -use nvisy_core::Error; -use nvisy_core::io::ContentData; - -/// A target stream that writes content from the pipeline to an external system. -/// -/// Implementations receive processed content data from the pipeline and persist -/// it to a storage backend. -#[async_trait::async_trait] -pub trait StreamTarget: Send + Sync + 'static { - /// Strongly-typed parameters for this stream target. - type Params: DeserializeOwned + Send; - /// The client type this stream requires. - type Client: Send + 'static; - - /// Unique identifier for this stream target (e.g. `"write"`). 
- fn id(&self) -> &str; - - /// Receive content from `input` and write it to the external system. - /// - /// Returns the number of items written. - async fn write( - &self, - input: mpsc::Receiver, - params: Self::Params, - client: Self::Client, - ) -> Result; -} diff --git a/crates/nvisy-object/src/streams/write_object.rs b/crates/nvisy-object/src/streams/write_object.rs deleted file mode 100644 index 409c31b..0000000 --- a/crates/nvisy-object/src/streams/write_object.rs +++ /dev/null @@ -1,138 +0,0 @@ -//! Streaming writer that uploads content to a cloud object store. - -use object_store::PutMode; -use serde::Deserialize; -use tokio::sync::mpsc; - -use nvisy_core::Error; -use nvisy_core::io::ContentData; - -use super::StreamTarget; - -use crate::client::ObjectStoreClient; - -/// Typed parameters for [`ObjectWriteStream`]. -#[derive(Debug, Deserialize)] -#[serde(rename_all = "camelCase")] -pub struct ObjectWriteParams { - /// Key prefix prepended to each content source UUID. - #[serde(default)] - pub prefix: String, - /// When `true`, uses `PutMode::Create` so that writing to an existing - /// key fails with an error. - #[serde(default)] - pub create_only: bool, -} - -/// A [`StreamTarget`] that receives [`ContentData`] from the input channel and -/// uploads each one to a cloud object store. 
-pub struct ObjectWriteStream; - -#[async_trait::async_trait] -impl StreamTarget for ObjectWriteStream { - type Params = ObjectWriteParams; - type Client = ObjectStoreClient; - - fn id(&self) -> &str { "write" } - - #[tracing::instrument(name = "object.write", skip_all, fields(prefix = %params.prefix, count))] - async fn write( - &self, - mut input: mpsc::Receiver, - params: Self::Params, - client: Self::Client, - ) -> Result { - let prefix = ¶ms.prefix; - let mut total = 0u64; - - while let Some(content) = input.recv().await { - let source_id = content.content_source.to_string(); - let key = if prefix.is_empty() { - source_id - } else { - format!("{prefix}{source_id}") - }; - - let mode = if params.create_only { - PutMode::Create - } else { - PutMode::Overwrite - }; - client - .put_opts(&key, content.to_bytes(), mode, content.content_type()) - .await?; - - total += 1; - } - - tracing::Span::current().record("count", total); - Ok(total) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use bytes::Bytes; - use nvisy_core::io::ContentData; - use nvisy_core::path::ContentSource; - use object_store::memory::InMemory; - - fn test_client() -> ObjectStoreClient { - ObjectStoreClient::new(InMemory::new()) - } - - #[tokio::test] - async fn write_uploads_all() { - let client = test_client(); - let (tx, rx) = mpsc::channel(16); - - let sources: Vec = (0..3).map(|_| ContentSource::new()).collect(); - for (i, src) in sources.iter().enumerate() { - let content = ContentData::new(*src, Bytes::from(format!("payload-{i}"))); - tx.send(content).await.unwrap(); - } - drop(tx); - - let stream = ObjectWriteStream; - let params = ObjectWriteParams { - prefix: "out/".to_string(), - create_only: false, - }; - - let count = stream.write(rx, params, client.clone()).await.unwrap(); - assert_eq!(count, 3); - - // Verify all objects were stored - let items = client.list("out/").await.unwrap(); - assert_eq!(items.len(), 3); - } - - #[tokio::test] - async fn write_create_only() { - let 
client = test_client(); - - // Pre-populate an object at a known key - let source = ContentSource::new(); - let key = format!("prefix/{source}"); - client - .put(&key, Bytes::from("existing"), None) - .await - .unwrap(); - - // Try to write the same key with create_only - let (tx, rx) = mpsc::channel(1); - let content = ContentData::new(source, Bytes::from("new")); - tx.send(content).await.unwrap(); - drop(tx); - - let stream = ObjectWriteStream; - let params = ObjectWriteParams { - prefix: "prefix/".to_string(), - create_only: true, - }; - - let result = stream.write(rx, params, client).await; - assert!(result.is_err()); - } -} diff --git a/crates/nvisy-ontology/src/entity/annotation.rs b/crates/nvisy-ontology/src/entity/annotation.rs index 53f6f90..0b0f226 100644 --- a/crates/nvisy-ontology/src/entity/annotation.rs +++ b/crates/nvisy-ontology/src/entity/annotation.rs @@ -1,14 +1,14 @@ //! Annotation types for pre-identified regions and classification labels. use serde::{Deserialize, Serialize}; -use strum::Display; +use strum::{Display, EnumString}; use super::{EntityCategory, EntityKind}; use crate::location::Location; /// The kind of annotation applied to a content region. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Display, Serialize, Deserialize)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Display, EnumString, Serialize, Deserialize)] #[serde(rename_all = "snake_case")] #[strum(serialize_all = "snake_case")] pub enum AnnotationKind { @@ -21,7 +21,7 @@ pub enum AnnotationKind { } /// The scope to which an annotation label applies. 
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Display, Serialize, Deserialize)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Display, EnumString, Serialize, Deserialize)] #[serde(rename_all = "snake_case")] #[strum(serialize_all = "snake_case")] pub enum AnnotationScope { diff --git a/crates/nvisy-ontology/src/entity/mod.rs b/crates/nvisy-ontology/src/entity/mod.rs index 701deea..3661f87 100644 --- a/crates/nvisy-ontology/src/entity/mod.rs +++ b/crates/nvisy-ontology/src/entity/mod.rs @@ -20,7 +20,7 @@ pub use sensitivity::EntitySensitivity; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; use serde_json::{Map, Value}; -use strum::Display; +use strum::{Display, EnumString}; use uuid::Uuid; use nvisy_core::path::ContentSource; @@ -28,8 +28,7 @@ use nvisy_core::path::ContentSource; use crate::location::Location; /// Method used to detect a sensitive entity. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Display, Serialize, Deserialize)] -#[derive(JsonSchema)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Display, EnumString, Serialize, Deserialize, JsonSchema)] #[serde(rename_all = "snake_case")] #[strum(serialize_all = "snake_case")] pub enum DetectionMethod { diff --git a/crates/nvisy-ontology/src/entity/model.rs b/crates/nvisy-ontology/src/entity/model.rs index c003b32..524ba58 100644 --- a/crates/nvisy-ontology/src/entity/model.rs +++ b/crates/nvisy-ontology/src/entity/model.rs @@ -2,11 +2,12 @@ use schemars::JsonSchema; use serde::{Deserialize, Serialize}; +use strum::{Display, EnumString}; /// Provenance or licensing classification of a detection model. -#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)] -#[derive(JsonSchema)] +#[derive(Debug, Clone, PartialEq, Eq, Hash, Display, EnumString, Serialize, Deserialize, JsonSchema)] #[serde(rename_all = "snake_case")] +#[strum(serialize_all = "snake_case")] pub enum ModelKind { /// Open-source model (e.g. spaCy, Hugging Face community models). 
OpenSource, @@ -19,8 +20,7 @@ pub enum ModelKind { } /// Identity and version of the model used for detection. -#[derive(Debug, Clone, Serialize, Deserialize)] -#[derive(JsonSchema)] +#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)] pub struct ModelInfo { /// Model name (e.g. `"spacy-en-core-web-lg"`, `"gpt-4"`). pub name: String, diff --git a/crates/nvisy-ontology/src/entity/sensitivity.rs b/crates/nvisy-ontology/src/entity/sensitivity.rs index 066b509..310615e 100644 --- a/crates/nvisy-ontology/src/entity/sensitivity.rs +++ b/crates/nvisy-ontology/src/entity/sensitivity.rs @@ -14,8 +14,7 @@ use strum::{Display, EnumString}; /// directly (`Critical > High > Medium > Low`). #[derive( Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord, - Display, EnumString, - Serialize, Deserialize, JsonSchema, + Display, EnumString, Serialize, Deserialize, JsonSchema, )] #[serde(rename_all = "snake_case")] #[strum(serialize_all = "snake_case")] diff --git a/crates/nvisy-ontology/src/lib.rs b/crates/nvisy-ontology/src/lib.rs index 309b3bc..0d84f4c 100644 --- a/crates/nvisy-ontology/src/lib.rs +++ b/crates/nvisy-ontology/src/lib.rs @@ -5,4 +5,4 @@ pub mod entity; pub mod location; pub mod record; -pub mod spec; +pub mod specification; diff --git a/crates/nvisy-ontology/src/record/mod.rs b/crates/nvisy-ontology/src/record/mod.rs index a0d8a15..6e0c50a 100644 --- a/crates/nvisy-ontology/src/record/mod.rs +++ b/crates/nvisy-ontology/src/record/mod.rs @@ -10,7 +10,7 @@ use uuid::Uuid; use nvisy_core::path::ContentSource; -use crate::spec::RedactionInput; +use crate::specification::RedactionInput; /// A redaction decision recording how a specific entity was (or will be) redacted. 
/// diff --git a/crates/nvisy-ontology/src/record/review.rs b/crates/nvisy-ontology/src/record/review.rs index c624478..732e523 100644 --- a/crates/nvisy-ontology/src/record/review.rs +++ b/crates/nvisy-ontology/src/record/review.rs @@ -3,11 +3,12 @@ use jiff::Timestamp; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; +use strum::{Display, EnumString}; /// Status of a human review on a redaction decision. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] -#[derive(JsonSchema)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Display, EnumString, Serialize, Deserialize, JsonSchema)] #[serde(rename_all = "snake_case")] +#[strum(serialize_all = "snake_case")] pub enum ReviewStatus { /// Awaiting human review. Pending, @@ -20,8 +21,7 @@ pub enum ReviewStatus { } /// A review decision recorded against a redaction. -#[derive(Debug, Clone, Serialize, Deserialize)] -#[derive(JsonSchema)] +#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)] pub struct ReviewDecision { /// Outcome of the review. pub status: ReviewStatus, diff --git a/crates/nvisy-ontology/src/spec/mod.rs b/crates/nvisy-ontology/src/specification/input.rs similarity index 63% rename from crates/nvisy-ontology/src/spec/mod.rs rename to crates/nvisy-ontology/src/specification/input.rs index 9d9c0c0..1a6eb71 100644 --- a/crates/nvisy-ontology/src/spec/mod.rs +++ b/crates/nvisy-ontology/src/specification/input.rs @@ -1,9 +1,15 @@ -//! Redaction specifications for all modalities. +//! Redaction input types: configuration-carrying specifications submitted +//! to the redaction engine, and the [`RedactorInput`] context struct +//! passed to LLM agents. use derive_more::From; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; +use crate::entity::{EntityCategory, EntityKind}; + +// ── defaults ──────────────────────────────────────────────────────────── + /// Default mask character for text redaction. 
pub const DEFAULT_MASK_CHAR: char = '*'; @@ -29,6 +35,8 @@ fn default_block_size() -> u32 { DEFAULT_PIXELATE_BLOCK_SIZE } +// ── text / tabular ────────────────────────────────────────────────────── + /// Text redaction specification with method-specific configuration. #[derive(Debug, Clone, PartialEq, Serialize, Deserialize, JsonSchema)] #[serde(tag = "method", rename_all = "snake_case")] @@ -80,6 +88,8 @@ pub enum TextRedactionInput { }, } +// ── image / video ─────────────────────────────────────────────────────── + /// Image redaction specification with method-specific configuration. #[derive(Debug, Clone, PartialEq, Serialize, Deserialize, JsonSchema)] #[serde(tag = "method", rename_all = "snake_case")] @@ -106,6 +116,8 @@ pub enum ImageRedactionInput { Synthesize, } +// ── audio ─────────────────────────────────────────────────────────────── + /// Audio redaction specification. #[derive(Debug, Clone, PartialEq, Serialize, Deserialize, JsonSchema)] #[serde(tag = "method", rename_all = "snake_case")] @@ -118,6 +130,8 @@ pub enum AudioRedactionInput { Synthesize, } +// ── unified ───────────────────────────────────────────────────────────── + /// Unified redaction specification submitted to the engine. /// /// Carries the method to apply and its configuration parameters. @@ -131,3 +145,26 @@ pub enum RedactionInput { /// Audio redaction specification. Audio(AudioRedactionInput), } + +// ── agent input ───────────────────────────────────────────────────────── + +/// Entity passed to a redactor agent for decision-making. +/// +/// Contains the detected entity's classification, matched value, confidence, +/// and byte offsets in the source text. The redactor uses this context to +/// choose an appropriate [`RedactionMethod`](super::RedactionMethod). +#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)] +pub struct RedactorInput { + /// Specific entity type (e.g. `EmailAddress`, `GovernmentId`). + pub entity_type: EntityKind, + /// Broad classification (e.g. 
`Pii`, `Financial`). + pub category: EntityCategory, + /// The matched text value. + pub value: String, + /// Detection confidence (0.0 -- 1.0). + pub confidence: f64, + /// Start byte offset in the input text. + pub start_offset: usize, + /// End byte offset in the input text. + pub end_offset: usize, +} diff --git a/crates/nvisy-ontology/src/specification/method.rs b/crates/nvisy-ontology/src/specification/method.rs new file mode 100644 index 0000000..90a8f09 --- /dev/null +++ b/crates/nvisy-ontology/src/specification/method.rs @@ -0,0 +1,114 @@ +//! Flat redaction method identifiers (no configuration payload). +//! +//! Each [`TextRedactionMethod`], [`ImageRedactionMethod`], and +//! [`AudioRedactionMethod`] names the *kind* of redaction to apply +//! without carrying method-specific parameters. These are the types an +//! LLM agent returns when recommending a redaction strategy; downstream +//! code maps them into the full [`TextRedactionInput`](super::TextRedactionInput), +//! [`ImageRedactionInput`](super::ImageRedactionInput), or +//! [`AudioRedactionInput`](super::AudioRedactionInput) with appropriate +//! defaults. + +use derive_more::From; +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; +use strum::Display; + +/// Text/tabular redaction method. 
+/// +/// | Variant | Effect | +/// |---|---| +/// | `Mask` | Replace characters with a fixed mask character | +/// | `Replace` | Substitute with a type-appropriate placeholder | +/// | `Hash` | Replace with a one-way hash | +/// | `Encrypt` | Encrypt the value (recoverable with key) | +/// | `Remove` | Delete the value entirely | +/// | `Synthesize` | Replace with a realistic synthetic value | +/// | `Pseudonymize` | Replace with a consistent pseudonym | +/// | `Tokenize` | Replace with a vault-backed reversible token | +/// | `Aggregate` | Aggregate into a range or bucket | +/// | `Generalize` | Generalize to a less precise value | +/// | `DateShift` | Shift dates by a consistent offset | +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Display, Serialize, Deserialize, JsonSchema)] +#[serde(rename_all = "snake_case")] +#[strum(serialize_all = "snake_case")] +pub enum TextRedactionMethod { + /// Replace characters with a mask character (e.g. `***`). + Mask, + /// Substitute with a fixed placeholder (e.g. `[EMAIL]`). + Replace, + /// Replace with a one-way hash. + Hash, + /// Encrypt the value; recoverable with a referenced key. + Encrypt, + /// Remove the value entirely. + Remove, + /// Replace with a synthetically generated value. + Synthesize, + /// Replace with a consistent pseudonym. + Pseudonymize, + /// Replace with a vault-backed reversible token. + Tokenize, + /// Aggregate into a range or bucket. + Aggregate, + /// Generalize to a less precise value. + Generalize, + /// Shift dates by a consistent offset. + DateShift, +} + +/// Image/video redaction method. 
+/// +/// | Variant | Effect | +/// |---|---| +/// | `Blur` | Apply a gaussian blur over the region | +/// | `Block` | Overlay an opaque rectangle | +/// | `Pixelate` | Apply pixelation / mosaic effect | +/// | `Synthesize` | Replace with a synthetic region | +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Display, Serialize, Deserialize, JsonSchema)] +#[serde(rename_all = "snake_case")] +#[strum(serialize_all = "snake_case")] +pub enum ImageRedactionMethod { + /// Apply a gaussian blur over the region. + Blur, + /// Overlay an opaque rectangle. + Block, + /// Apply pixelation / mosaic effect. + Pixelate, + /// Replace with a synthetic region. + Synthesize, +} + +/// Audio redaction method. +/// +/// | Variant | Effect | +/// |---|---| +/// | `Silence` | Replace audio segment with silence | +/// | `Remove` | Remove the segment entirely | +/// | `Synthesize` | Replace with synthetic audio | +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Display, Serialize, Deserialize, JsonSchema)] +#[serde(rename_all = "snake_case")] +#[strum(serialize_all = "snake_case")] +pub enum AudioRedactionMethod { + /// Replace audio segment with silence. + Silence, + /// Remove the segment entirely. + Remove, + /// Replace with synthetic audio. + Synthesize, +} + +/// Unified redaction method across all modalities. +/// +/// Mirrors the structure of [`RedactionInput`](super::RedactionInput) but +/// carries only the method name — no configuration payload. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, From, Serialize, Deserialize, JsonSchema)] +#[serde(rename_all = "snake_case")] +pub enum RedactionMethod { + /// Text/tabular redaction method. + Text(TextRedactionMethod), + /// Image/video redaction method. + Image(ImageRedactionMethod), + /// Audio redaction method. 
+ Audio(AudioRedactionMethod), +} diff --git a/crates/nvisy-ontology/src/specification/mod.rs b/crates/nvisy-ontology/src/specification/mod.rs new file mode 100644 index 0000000..451398f --- /dev/null +++ b/crates/nvisy-ontology/src/specification/mod.rs @@ -0,0 +1,27 @@ +//! Redaction specifications for all modalities. +//! +//! This module contains two layers: +//! +//! - **Methods** ([`TextRedactionMethod`], [`ImageRedactionMethod`], +//! [`AudioRedactionMethod`], [`RedactionMethod`]) — flat enums naming +//! *what kind* of redaction to apply, without configuration. These are +//! returned by LLM agents when recommending a strategy. +//! +//! - **Inputs** ([`TextRedactionInput`], [`ImageRedactionInput`], +//! [`AudioRedactionInput`], [`RedactionInput`]) — tagged enums carrying +//! method-specific configuration (mask char, blur sigma, etc.). These +//! are submitted to the redaction engine for execution. +//! +//! The [`RedactorInput`] struct carries entity context passed *into* a +//! redactor agent so it can choose the right method. 
+ +mod input; +mod method; + +pub use input::{ + AudioRedactionInput, ImageRedactionInput, RedactionInput, RedactorInput, TextRedactionInput, + DEFAULT_BLOCK_COLOR, DEFAULT_BLUR_SIGMA, DEFAULT_MASK_CHAR, DEFAULT_PIXELATE_BLOCK_SIZE, +}; +pub use method::{ + AudioRedactionMethod, ImageRedactionMethod, RedactionMethod, TextRedactionMethod, +}; diff --git a/crates/nvisy-paddle/Cargo.toml b/crates/nvisy-paddle/Cargo.toml new file mode 100644 index 0000000..a570333 --- /dev/null +++ b/crates/nvisy-paddle/Cargo.toml @@ -0,0 +1,34 @@ +# https://doc.rust-lang.org/cargo/reference/manifest.html + +[package] +name = "nvisy-paddle" +description = "PaddleOCR backend trait and provider integration for Nvisy" +keywords = ["nvisy", "ocr", "paddle", "text-extraction"] +categories = ["text-processing"] + +version = { workspace = true } +rust-version = { workspace = true } +edition = { workspace = true } +license = { workspace = true } +publish = { workspace = true } + +authors = { workspace = true } +repository = { workspace = true } +homepage = { workspace = true } +documentation = { workspace = true } + +[package.metadata.docs.rs] +all-features = true +rustdoc-args = ["--cfg", "docsrs"] + +[dependencies] +# Internal crates +nvisy-core = { workspace = true, features = [] } +nvisy-ontology = { workspace = true, features = [] } +nvisy-python = { workspace = true, features = [] } + +# (De)serialization +serde_json = { workspace = true, features = [] } + +# Async runtime +async-trait = { workspace = true, features = [] } diff --git a/crates/nvisy-paddle/README.md b/crates/nvisy-paddle/README.md new file mode 100644 index 0000000..bd19cf5 --- /dev/null +++ b/crates/nvisy-paddle/README.md @@ -0,0 +1,3 @@ +# nvisy-paddle + +PaddleOCR backend trait and provider integration for the Nvisy runtime. 
diff --git a/crates/nvisy-paddle/src/backend.rs b/crates/nvisy-paddle/src/backend.rs new file mode 100644 index 0000000..c0c2f32 --- /dev/null +++ b/crates/nvisy-paddle/src/backend.rs @@ -0,0 +1,31 @@ +//! OCR backend trait and configuration. + +use serde_json::Value; + +use nvisy_core::Error; + +/// Configuration passed to an [`OcrBackend`] implementation. +#[derive(Debug, Clone)] +pub struct OcrConfig { + /// Language hint (e.g. `"eng"` for English). + pub language: String, + /// OCR engine to use (`"tesseract"`, `"google-vision"`, `"aws-textract"`). + pub engine: String, + /// Minimum confidence threshold for OCR results. + pub confidence_threshold: f64, +} + +/// Backend trait for OCR providers. +/// +/// Implementations call an external OCR service and return raw JSON +/// results. Entity construction is handled by the consuming crate. +#[async_trait::async_trait] +pub trait OcrBackend: Send + Sync + 'static { + /// Run OCR on image bytes, returning raw dicts. + async fn detect_ocr( + &self, + image_data: &[u8], + mime_type: &str, + config: &OcrConfig, + ) -> Result, Error>; +} diff --git a/crates/nvisy-paddle/src/bridge.rs b/crates/nvisy-paddle/src/bridge.rs new file mode 100644 index 0000000..9ea3e5d --- /dev/null +++ b/crates/nvisy-paddle/src/bridge.rs @@ -0,0 +1,27 @@ +//! [`OcrBackend`] implementation for [`PythonBridge`]. + +use serde_json::Value; + +use nvisy_core::Error; +use nvisy_python::bridge::PythonBridge; +use nvisy_python::ocr::OcrParams; + +use crate::backend::{OcrBackend, OcrConfig}; + +/// Converts [`OcrConfig`] to [`OcrParams`] and delegates to `nvisy_python::ocr`. 
+#[async_trait::async_trait] +impl OcrBackend for PythonBridge { + async fn detect_ocr( + &self, + image_data: &[u8], + mime_type: &str, + config: &OcrConfig, + ) -> Result, Error> { + let params = OcrParams { + language: config.language.clone(), + engine: config.engine.clone(), + confidence_threshold: config.confidence_threshold, + }; + nvisy_python::ocr::detect_ocr(self, image_data, mime_type, ¶ms).await + } +} diff --git a/crates/nvisy-paddle/src/lib.rs b/crates/nvisy-paddle/src/lib.rs new file mode 100644 index 0000000..ae2b5a9 --- /dev/null +++ b/crates/nvisy-paddle/src/lib.rs @@ -0,0 +1,10 @@ +#![forbid(unsafe_code)] +#![cfg_attr(docsrs, feature(doc_cfg))] +#![doc = include_str!("../README.md")] + +mod backend; +mod bridge; +mod parse; + +pub use backend::{OcrBackend, OcrConfig}; +pub use parse::parse_ocr_entities; diff --git a/crates/nvisy-paddle/src/parse.rs b/crates/nvisy-paddle/src/parse.rs new file mode 100644 index 0000000..1c6d0dc --- /dev/null +++ b/crates/nvisy-paddle/src/parse.rs @@ -0,0 +1,49 @@ +//! OCR result parsing. + +use serde_json::Value; + +use nvisy_core::math::BoundingBox; +use nvisy_core::Error; +use nvisy_ontology::entity::{DetectionMethod, Entity, EntityCategory, EntityKind}; +use nvisy_ontology::location::{ImageLocation, Location}; + +/// Parse raw JSON dicts from an OCR backend into [`Entity`] values. +/// +/// Expected dict keys: `text`, `x`, `y`, `width`, `height`, `confidence`. 
+pub fn parse_ocr_entities(raw: &[Value]) -> Result, Error> { + let mut entities = Vec::new(); + + for item in raw { + let obj = item.as_object().ok_or_else(|| { + Error::python("Expected JSON object in OCR results".to_string()) + })?; + + let text = obj + .get("text") + .and_then(Value::as_str) + .ok_or_else(|| Error::python("Missing 'text' in OCR result".to_string()))?; + + let x = obj.get("x").and_then(Value::as_f64).unwrap_or(0.0); + let y = obj.get("y").and_then(Value::as_f64).unwrap_or(0.0); + let width = obj.get("width").and_then(Value::as_f64).unwrap_or(0.0); + let height = obj.get("height").and_then(Value::as_f64).unwrap_or(0.0); + let confidence = obj.get("confidence").and_then(Value::as_f64).unwrap_or(0.0); + + let entity = Entity::new( + EntityCategory::Pii, + EntityKind::Handwriting, + text, + DetectionMethod::Ocr, + confidence, + ) + .with_location(Location::Image(ImageLocation { + bounding_box: BoundingBox { x, y, width, height }, + image_id: None, + page_number: None, + })); + + entities.push(entity); + } + + Ok(entities) +} diff --git a/crates/nvisy-pattern/assets/patterns/aws_key.json b/crates/nvisy-pattern/assets/patterns/aws_key.json index d3fa046..96ec047 100644 --- a/crates/nvisy-pattern/assets/patterns/aws_key.json +++ b/crates/nvisy-pattern/assets/patterns/aws_key.json @@ -3,7 +3,7 @@ "category": "credentials", "entity_type": "api_key", "pattern": { - "regex": "\\bAKIA[0-9A-Z]{16}\\b" - }, - "confidence": 0.95 + "regex": "\\bAKIA[0-9A-Z]{16}\\b", + "confidence": 0.95 + } } diff --git a/crates/nvisy-pattern/assets/patterns/bitcoin_address.json b/crates/nvisy-pattern/assets/patterns/bitcoin_address.json index 3d7289a..409b0ef 100644 --- a/crates/nvisy-pattern/assets/patterns/bitcoin_address.json +++ b/crates/nvisy-pattern/assets/patterns/bitcoin_address.json @@ -3,7 +3,7 @@ "category": "financial", "entity_type": "crypto_address", "pattern": { - "regex": "\\b(?:bc1[a-z0-9]{25,39}|[13][a-km-zA-HJ-NP-Z1-9]{25,34})\\b" - }, - "confidence": 0.85 + 
"regex": "\\b(?:bc1[a-z0-9]{25,39}|[13][a-km-zA-HJ-NP-Z1-9]{25,34})\\b", + "confidence": 0.85 + } } diff --git a/crates/nvisy-pattern/assets/patterns/credit_card.json b/crates/nvisy-pattern/assets/patterns/credit_card.json index 4d9d2ce..0e3f59f 100644 --- a/crates/nvisy-pattern/assets/patterns/credit_card.json +++ b/crates/nvisy-pattern/assets/patterns/credit_card.json @@ -4,9 +4,9 @@ "entity_type": "payment_card", "pattern": { "regex": "\\b(?:\\d[ \\-]*?){13,19}\\b", - "validator": "luhn" + "validator": "luhn", + "confidence": 0.85 }, - "confidence": 0.85, "context": { "keywords": ["card", "credit", "debit", "payment", "visa", "mastercard", "amex"], "window": 3, diff --git a/crates/nvisy-pattern/assets/patterns/cryptocurrencies.json b/crates/nvisy-pattern/assets/patterns/cryptocurrencies.json index 78bf468..140cc51 100644 --- a/crates/nvisy-pattern/assets/patterns/cryptocurrencies.json +++ b/crates/nvisy-pattern/assets/patterns/cryptocurrencies.json @@ -3,7 +3,7 @@ "category": "financial", "entity_type": "amount", "dictionary": { - "name": "cryptocurrencies" - }, - "confidence": 0.85 + "name": "cryptocurrencies", + "confidence": [0.85, 0.55] + } } diff --git a/crates/nvisy-pattern/assets/patterns/currencies.json b/crates/nvisy-pattern/assets/patterns/currencies.json index 4805155..ab0d016 100644 --- a/crates/nvisy-pattern/assets/patterns/currencies.json +++ b/crates/nvisy-pattern/assets/patterns/currencies.json @@ -3,7 +3,7 @@ "category": "financial", "entity_type": "amount", "dictionary": { - "name": "currencies" - }, - "confidence": 0.85 + "name": "currencies", + "confidence": [0.85, 0.55] + } } diff --git a/crates/nvisy-pattern/assets/patterns/date_of_birth.json b/crates/nvisy-pattern/assets/patterns/date_of_birth.json index fcb8cae..26ecd52 100644 --- a/crates/nvisy-pattern/assets/patterns/date_of_birth.json +++ b/crates/nvisy-pattern/assets/patterns/date_of_birth.json @@ -3,9 +3,9 @@ "category": "pii", "entity_type": "date_of_birth", "pattern": { - "regex": 
"\\b(?:0[1-9]|1[0-2])[/\\-](?:0[1-9]|[12]\\d|3[01])[/\\-](?:19|20)\\d{2}\\b" + "regex": "\\b(?:0[1-9]|1[0-2])[/\\-](?:0[1-9]|[12]\\d|3[01])[/\\-](?:19|20)\\d{2}\\b", + "confidence": 0.6 }, - "confidence": 0.6, "context": { "keywords": ["birth", "born", "dob", "birthday", "date of birth"], "window": 2, diff --git a/crates/nvisy-pattern/assets/patterns/email.json b/crates/nvisy-pattern/assets/patterns/email.json index 7335061..8748538 100644 --- a/crates/nvisy-pattern/assets/patterns/email.json +++ b/crates/nvisy-pattern/assets/patterns/email.json @@ -3,7 +3,7 @@ "category": "pii", "entity_type": "email_address", "pattern": { - "regex": "\\b[a-zA-Z0-9._%+\\-]+@[a-zA-Z0-9.\\-]+\\.[a-zA-Z]{2,}\\b" - }, - "confidence": 0.95 + "regex": "\\b[a-zA-Z0-9._%+\\-]+@[a-zA-Z0-9.\\-]+\\.[a-zA-Z]{2,}\\b", + "confidence": 0.95 + } } diff --git a/crates/nvisy-pattern/assets/patterns/ethereum_address.json b/crates/nvisy-pattern/assets/patterns/ethereum_address.json index b757995..d9e93de 100644 --- a/crates/nvisy-pattern/assets/patterns/ethereum_address.json +++ b/crates/nvisy-pattern/assets/patterns/ethereum_address.json @@ -3,7 +3,7 @@ "category": "financial", "entity_type": "crypto_address", "pattern": { - "regex": "\\b0x[0-9a-fA-F]{40}\\b" - }, - "confidence": 0.85 + "regex": "\\b0x[0-9a-fA-F]{40}\\b", + "confidence": 0.85 + } } diff --git a/crates/nvisy-pattern/assets/patterns/generic_api_key.json b/crates/nvisy-pattern/assets/patterns/generic_api_key.json index 37dc872..cffec52 100644 --- a/crates/nvisy-pattern/assets/patterns/generic_api_key.json +++ b/crates/nvisy-pattern/assets/patterns/generic_api_key.json @@ -3,7 +3,7 @@ "category": "credentials", "entity_type": "api_key", "pattern": { - "regex": "(?i)(?:api[_\\-]?key|api[_\\-]?secret|access[_\\-]?token|secret[_\\-]?key|bearer)\\s*[:=]\\s*[\"']?([a-zA-Z0-9_\\-]{20,})[\"']?" 
- }, - "confidence": 0.7 + "regex": "(?i)(?:api[_\\-]?key|api[_\\-]?secret|access[_\\-]?token|secret[_\\-]?key|bearer)\\s*[:=]\\s*[\"']?([a-zA-Z0-9_\\-]{20,})[\"']?", + "confidence": 0.7 + } } diff --git a/crates/nvisy-pattern/assets/patterns/github_token.json b/crates/nvisy-pattern/assets/patterns/github_token.json index 77ac9e2..e6cf759 100644 --- a/crates/nvisy-pattern/assets/patterns/github_token.json +++ b/crates/nvisy-pattern/assets/patterns/github_token.json @@ -3,7 +3,7 @@ "category": "credentials", "entity_type": "auth_token", "pattern": { - "regex": "\\bgh[pousr]_[a-zA-Z0-9]{36}\\b" - }, - "confidence": 0.95 + "regex": "\\bgh[pousr]_[a-zA-Z0-9]{36}\\b", + "confidence": 0.95 + } } diff --git a/crates/nvisy-pattern/assets/patterns/iban.json b/crates/nvisy-pattern/assets/patterns/iban.json index c2c3e86..ac3f75f 100644 --- a/crates/nvisy-pattern/assets/patterns/iban.json +++ b/crates/nvisy-pattern/assets/patterns/iban.json @@ -4,9 +4,9 @@ "entity_type": "iban", "pattern": { "regex": "\\b[A-Z]{2}\\d{2}\\s?[A-Z0-9]{4}\\s?(?:\\d{4}\\s?){2,7}\\d{1,4}\\b", - "validator": "iban" + "validator": "iban", + "confidence": 0.85 }, - "confidence": 0.85, "context": { "keywords": ["iban", "bank", "account", "transfer", "swift"], "window": 3, diff --git a/crates/nvisy-pattern/assets/patterns/ipv4.json b/crates/nvisy-pattern/assets/patterns/ipv4.json index 37777bd..971ccd9 100644 --- a/crates/nvisy-pattern/assets/patterns/ipv4.json +++ b/crates/nvisy-pattern/assets/patterns/ipv4.json @@ -3,7 +3,7 @@ "category": "pii", "entity_type": "ip_address", "pattern": { - "regex": "\\b(?:(?:25[0-5]|2[0-4]\\d|[01]?\\d\\d?)\\.){3}(?:25[0-5]|2[0-4]\\d|[01]?\\d\\d?)\\b" - }, - "confidence": 0.75 + "regex": "\\b(?:(?:25[0-5]|2[0-4]\\d|[01]?\\d\\d?)\\.){3}(?:25[0-5]|2[0-4]\\d|[01]?\\d\\d?)\\b", + "confidence": 0.75 + } } diff --git a/crates/nvisy-pattern/assets/patterns/ipv6.json b/crates/nvisy-pattern/assets/patterns/ipv6.json index 5462ce2..ce096fd 100644 --- 
a/crates/nvisy-pattern/assets/patterns/ipv6.json +++ b/crates/nvisy-pattern/assets/patterns/ipv6.json @@ -3,7 +3,7 @@ "category": "pii", "entity_type": "ip_address", "pattern": { - "regex": "\\b(?:[0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}\\b|(?:[0-9a-fA-F]{1,4}:){1,7}:|::(?:[0-9a-fA-F]{1,4}:){0,5}[0-9a-fA-F]{1,4}\\b" - }, - "confidence": 0.75 + "regex": "\\b(?:[0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}\\b|(?:[0-9a-fA-F]{1,4}:){1,7}:|::(?:[0-9a-fA-F]{1,4}:){0,5}[0-9a-fA-F]{1,4}\\b", + "confidence": 0.75 + } } diff --git a/crates/nvisy-pattern/assets/patterns/languages.json b/crates/nvisy-pattern/assets/patterns/languages.json index 436dcfc..5d460af 100644 --- a/crates/nvisy-pattern/assets/patterns/languages.json +++ b/crates/nvisy-pattern/assets/patterns/languages.json @@ -3,7 +3,7 @@ "category": "pii", "entity_type": "demographic", "dictionary": { - "name": "languages" - }, - "confidence": 0.85 + "name": "languages", + "confidence": [0.85, 0.45] + } } diff --git a/crates/nvisy-pattern/assets/patterns/mac_address.json b/crates/nvisy-pattern/assets/patterns/mac_address.json index f67a7d5..fd8fe8e 100644 --- a/crates/nvisy-pattern/assets/patterns/mac_address.json +++ b/crates/nvisy-pattern/assets/patterns/mac_address.json @@ -3,7 +3,7 @@ "category": "pii", "entity_type": "mac_address", "pattern": { - "regex": "\\b(?:[0-9A-Fa-f]{2}[:\\-]){5}[0-9A-Fa-f]{2}\\b" - }, - "confidence": 0.85 + "regex": "\\b(?:[0-9A-Fa-f]{2}[:\\-]){5}[0-9A-Fa-f]{2}\\b", + "confidence": 0.85 + } } diff --git a/crates/nvisy-pattern/assets/patterns/nationalities.json b/crates/nvisy-pattern/assets/patterns/nationalities.json index dbaf0bb..a32593c 100644 --- a/crates/nvisy-pattern/assets/patterns/nationalities.json +++ b/crates/nvisy-pattern/assets/patterns/nationalities.json @@ -3,7 +3,7 @@ "category": "pii", "entity_type": "demographic", "dictionary": { - "name": "nationalities" - }, - "confidence": 0.85 + "name": "nationalities", + "confidence": 0.85 + } } diff --git 
a/crates/nvisy-pattern/assets/patterns/phone.json b/crates/nvisy-pattern/assets/patterns/phone.json index 928dde5..5380e94 100644 --- a/crates/nvisy-pattern/assets/patterns/phone.json +++ b/crates/nvisy-pattern/assets/patterns/phone.json @@ -3,9 +3,9 @@ "category": "pii", "entity_type": "phone_number", "pattern": { - "regex": "(?:\\+\\d{1,3}[\\s.\\-]?)?\\(?\\d{2,4}\\)?[\\s.\\-]?\\d{3,4}[\\s.\\-]?\\d{4}\\b" + "regex": "(?:\\+\\d{1,3}[\\s.\\-]?)?\\(?\\d{2,4}\\)?[\\s.\\-]?\\d{3,4}[\\s.\\-]?\\d{4}\\b", + "confidence": 0.8 }, - "confidence": 0.8, "context": { "keywords": ["phone", "call", "mobile", "tel", "fax", "contact"], "window": 2, diff --git a/crates/nvisy-pattern/assets/patterns/private_key.json b/crates/nvisy-pattern/assets/patterns/private_key.json index f52c9b7..138037f 100644 --- a/crates/nvisy-pattern/assets/patterns/private_key.json +++ b/crates/nvisy-pattern/assets/patterns/private_key.json @@ -3,7 +3,7 @@ "category": "credentials", "entity_type": "private_key", "pattern": { - "regex": "-----BEGIN (?:RSA |EC |DSA |OPENSSH )?PRIVATE KEY-----" - }, - "confidence": 0.98 + "regex": "-----BEGIN (?:RSA |EC |DSA |OPENSSH )?PRIVATE KEY-----", + "confidence": 0.98 + } } diff --git a/crates/nvisy-pattern/assets/patterns/religions.json b/crates/nvisy-pattern/assets/patterns/religions.json index 521bf5f..bb3d2f2 100644 --- a/crates/nvisy-pattern/assets/patterns/religions.json +++ b/crates/nvisy-pattern/assets/patterns/religions.json @@ -3,7 +3,7 @@ "category": "pii", "entity_type": "demographic", "dictionary": { - "name": "religions" - }, - "confidence": 0.85 + "name": "religions", + "confidence": 0.85 + } } diff --git a/crates/nvisy-pattern/assets/patterns/ssn.json b/crates/nvisy-pattern/assets/patterns/ssn.json index c6b3b52..12aeb75 100644 --- a/crates/nvisy-pattern/assets/patterns/ssn.json +++ b/crates/nvisy-pattern/assets/patterns/ssn.json @@ -4,9 +4,9 @@ "entity_type": "government_id", "pattern": { "regex": "\\b(\\d{3})-(\\d{2})-(\\d{4})\\b", - "validator": 
"ssn" + "validator": "ssn", + "confidence": 0.9 }, - "confidence": 0.9, "context": { "keywords": ["social security", "ssn", "tax id", "taxpayer identification"], "window": 3, diff --git a/crates/nvisy-pattern/assets/patterns/stripe_key.json b/crates/nvisy-pattern/assets/patterns/stripe_key.json index be89fb2..f2e5c1b 100644 --- a/crates/nvisy-pattern/assets/patterns/stripe_key.json +++ b/crates/nvisy-pattern/assets/patterns/stripe_key.json @@ -3,7 +3,7 @@ "category": "credentials", "entity_type": "api_key", "pattern": { - "regex": "\\bsk_(live|test)_[a-zA-Z0-9]{24,}\\b" - }, - "confidence": 0.95 + "regex": "\\bsk_(live|test)_[a-zA-Z0-9]{24,}\\b", + "confidence": 0.95 + } } diff --git a/crates/nvisy-pattern/assets/patterns/swift_code.json b/crates/nvisy-pattern/assets/patterns/swift_code.json index 3dd9cc0..a28a5e3 100644 --- a/crates/nvisy-pattern/assets/patterns/swift_code.json +++ b/crates/nvisy-pattern/assets/patterns/swift_code.json @@ -3,7 +3,7 @@ "category": "financial", "entity_type": "swift_code", "pattern": { - "regex": "\\b[A-Z]{4}[A-Z]{2}[A-Z0-9]{2}(?:[A-Z0-9]{3})?\\b" - }, - "confidence": 0.7 + "regex": "\\b[A-Z]{4}[A-Z]{2}[A-Z0-9]{2}(?:[A-Z0-9]{3})?\\b", + "confidence": 0.7 + } } diff --git a/crates/nvisy-pattern/assets/patterns/url.json b/crates/nvisy-pattern/assets/patterns/url.json index 02f43bd..d7bebc5 100644 --- a/crates/nvisy-pattern/assets/patterns/url.json +++ b/crates/nvisy-pattern/assets/patterns/url.json @@ -3,7 +3,7 @@ "category": "pii", "entity_type": "url", "pattern": { - "regex": "\\bhttps?://[^\\s/$.?#][^\\s]*\\b" - }, - "confidence": 0.9 + "regex": "\\bhttps?://[^\\s/$.?#][^\\s]*\\b", + "confidence": 0.9 + } } diff --git a/crates/nvisy-pattern/assets/patterns/us_bank_routing.json b/crates/nvisy-pattern/assets/patterns/us_bank_routing.json index feca4e3..cb5588c 100644 --- a/crates/nvisy-pattern/assets/patterns/us_bank_routing.json +++ b/crates/nvisy-pattern/assets/patterns/us_bank_routing.json @@ -3,7 +3,7 @@ "category": "financial", 
"entity_type": "bank_routing", "pattern": { - "regex": "\\b(?:0[1-9]|[12]\\d|3[0-2])\\d{7}\\b" - }, - "confidence": 0.5 + "regex": "\\b(?:0[1-9]|[12]\\d|3[0-2])\\d{7}\\b", + "confidence": 0.5 + } } diff --git a/crates/nvisy-pattern/assets/patterns/us_drivers_license.json b/crates/nvisy-pattern/assets/patterns/us_drivers_license.json index 7ee2664..1c1709a 100644 --- a/crates/nvisy-pattern/assets/patterns/us_drivers_license.json +++ b/crates/nvisy-pattern/assets/patterns/us_drivers_license.json @@ -3,7 +3,7 @@ "category": "pii", "entity_type": "drivers_license", "pattern": { - "regex": "\\b[A-Z]\\d{3}-\\d{4}-\\d{4}\\b" - }, - "confidence": 0.4 + "regex": "\\b[A-Z]\\d{3}-\\d{4}-\\d{4}\\b", + "confidence": 0.4 + } } diff --git a/crates/nvisy-pattern/assets/patterns/us_passport.json b/crates/nvisy-pattern/assets/patterns/us_passport.json index 058dfbf..bf055a8 100644 --- a/crates/nvisy-pattern/assets/patterns/us_passport.json +++ b/crates/nvisy-pattern/assets/patterns/us_passport.json @@ -3,7 +3,7 @@ "category": "pii", "entity_type": "passport_number", "pattern": { - "regex": "\\b[A-Z]\\d{8}\\b" - }, - "confidence": 0.5 + "regex": "\\b[A-Z]\\d{8}\\b", + "confidence": 0.5 + } } diff --git a/crates/nvisy-pattern/assets/patterns/us_postal_code.json b/crates/nvisy-pattern/assets/patterns/us_postal_code.json index d9cc156..b626956 100644 --- a/crates/nvisy-pattern/assets/patterns/us_postal_code.json +++ b/crates/nvisy-pattern/assets/patterns/us_postal_code.json @@ -3,7 +3,7 @@ "category": "pii", "entity_type": "postal_code", "pattern": { - "regex": "\\b\\d{5}(?:-\\d{4})?\\b" - }, - "confidence": 0.5 + "regex": "\\b\\d{5}(?:-\\d{4})?\\b", + "confidence": 0.5 + } } diff --git a/crates/nvisy-pattern/src/dictionaries/csv_dictionary.rs b/crates/nvisy-pattern/src/dictionaries/csv_dictionary.rs index 87c8c25..574eae6 100644 --- a/crates/nvisy-pattern/src/dictionaries/csv_dictionary.rs +++ b/crates/nvisy-pattern/src/dictionaries/csv_dictionary.rs @@ -10,6 +10,8 @@ use 
super::Dictionary; pub struct CsvDictionary { name: String, entries: Vec, + /// Source column index for each entry (parallel to `entries`). + columns: Vec, } impl CsvDictionary { @@ -17,10 +19,13 @@ impl CsvDictionary { /// /// `name` identifies this dictionary (e.g. `"currencies"`). /// `text` is the CSV content where each non-empty cell becomes a matchable term. + /// The column index of each cell is preserved so that per-column confidence + /// scores can be applied at detection time. pub fn new(name: impl Into, text: &str) -> Self { let name = name.into(); let mut entries = Vec::new(); + let mut columns = Vec::new(); let mut reader = csv::ReaderBuilder::new() .has_headers(false) .flexible(true) @@ -29,15 +34,16 @@ impl CsvDictionary { for result in reader.records() { let record = result.expect("failed to parse CSV record"); - for field in record.iter() { + for (col, field) in record.iter().enumerate() { let trimmed = field.trim(); if !trimmed.is_empty() { entries.push(trimmed.to_owned()); + columns.push(col); } } } - Self { name, entries } + Self { name, entries, columns } } } @@ -49,6 +55,10 @@ impl Dictionary for CsvDictionary { fn entries(&self) -> &[String] { &self.entries } + + fn columns(&self) -> Option<&[usize]> { + Some(&self.columns) + } } #[cfg(test)] diff --git a/crates/nvisy-pattern/src/dictionaries/dictionary.rs b/crates/nvisy-pattern/src/dictionaries/dictionary.rs index 578c8eb..8edc63f 100644 --- a/crates/nvisy-pattern/src/dictionaries/dictionary.rs +++ b/crates/nvisy-pattern/src/dictionaries/dictionary.rs @@ -16,6 +16,15 @@ pub trait Dictionary: Send + Sync { /// All matchable terms produced by this dictionary. fn entries(&self) -> &[String]; + + /// Column index for each entry, parallel to [`entries`](Self::entries). + /// + /// Returns `Some` for CSV dictionaries where each cell tracks its + /// source column. Returns `None` for plain-text dictionaries (all + /// entries are logically in column 0). 
+ fn columns(&self) -> Option<&[usize]> { + None + } } /// Type-erased boxed [`Dictionary`]. diff --git a/crates/nvisy-pattern/src/engine/builder.rs b/crates/nvisy-pattern/src/engine/builder.rs index 9bb6f2e..96ea1ed 100644 --- a/crates/nvisy-pattern/src/engine/builder.rs +++ b/crates/nvisy-pattern/src/engine/builder.rs @@ -99,7 +99,7 @@ impl PatternEngineBuilder { pattern_name: p.name().to_owned(), category: p.category().clone(), entity_kind: p.entity_kind(), - confidence: p.confidence(), + confidence: rp.confidence, validator_name: rp.validator.clone(), regex: compiled, context: p.context().cloned(), @@ -116,6 +116,7 @@ impl PatternEngineBuilder { if values.is_empty() { continue; } + let columns = dict.columns().map(|c| c.to_vec()); let automaton = aho_corasick::AhoCorasickBuilder::new() .ascii_case_insensitive(!dp.case_sensitive) .build(&values) @@ -127,9 +128,10 @@ impl PatternEngineBuilder { pattern_name: p.name().to_owned(), category: p.category().clone(), entity_kind: p.entity_kind(), - confidence: p.confidence(), + confidence: dp.confidence.clone(), automaton, values, + columns, context: p.context().cloned(), }); } diff --git a/crates/nvisy-pattern/src/engine/mod.rs b/crates/nvisy-pattern/src/engine/mod.rs index 2573fc3..b8a8c9e 100644 --- a/crates/nvisy-pattern/src/engine/mod.rs +++ b/crates/nvisy-pattern/src/engine/mod.rs @@ -34,7 +34,7 @@ use regex::{Regex, RegexSet}; use nvisy_ontology::entity::{EntityCategory, EntityKind}; -use crate::patterns::ContextRule; +use crate::patterns::{ContextRule, DictionaryConfidence}; use crate::validators::ValidatorResolver; /// Metadata stored alongside each compiled regex. @@ -53,13 +53,28 @@ struct DictEntry { pattern_name: String, category: EntityCategory, entity_kind: EntityKind, - confidence: f64, + confidence: DictionaryConfidence, automaton: AhoCorasick, /// The terms used to build the automaton, indexed by pattern id. values: Vec, + /// Per-entry column index from the source dictionary (parallel to `values`). 
+ /// `None` for plain-text dictionaries (all entries are column 0). + columns: Option>, context: Option, } +impl DictEntry { + /// Resolve the confidence for the entry at `pattern_index`. + fn resolve_confidence(&self, pattern_index: usize) -> f64 { + let col = self + .columns + .as_ref() + .and_then(|cols| cols.get(pattern_index).copied()) + .unwrap_or(0); + self.confidence.resolve(col) + } +} + /// Pre-compiled engine that scans text against all registered patterns. /// /// Scanning runs in three phases: @@ -176,12 +191,17 @@ impl PatternEngine { /// Phase 2: dictionary matches via Aho-Corasick automata. fn scan_dict(&self, text: &str, results: &mut Vec) { for entry in &self.dict_entries { - if entry.confidence < self.confidence_threshold { - continue; - } - for mat in entry.automaton.find_iter(text) { - let value = &entry.values[mat.pattern().as_usize()]; + let pat_idx = mat.pattern().as_usize(); + let value = &entry.values[pat_idx]; + + // Resolve per-entry confidence: use column override if available, + // otherwise fall back to the pattern's base confidence. + let confidence = entry.resolve_confidence(pat_idx); + + if confidence < self.confidence_threshold { + continue; + } if self.allow_set.contains(value.as_str()) { continue; @@ -194,7 +214,7 @@ impl PatternEngine { value: value.clone(), start: mat.start(), end: mat.end(), - confidence: entry.confidence, + confidence, source: DetectionSource::Dictionary, context: entry.context.clone(), }); @@ -389,6 +409,23 @@ mod tests { assert_eq!(entry.category, EntityCategory::Financial); } + #[test] + fn column_confidence_applies_to_csv_dictionaries() { + let engine = default_engine(); + // "US Dollar" is column 0 (full name), "USD" is column 1 (code). 
+ let matches = engine.scan_text("I paid in US Dollar and also in USD."); + let full_name = matches.iter().find(|m| m.value == "US Dollar"); + let code = matches.iter().find(|m| m.value == "USD"); + assert!(full_name.is_some(), "should match 'US Dollar'"); + assert!(code.is_some(), "should match 'USD'"); + let full_conf = full_name.unwrap().confidence; + let code_conf = code.unwrap().confidence; + assert!( + full_conf > code_conf, + "full name confidence ({full_conf}) should exceed code confidence ({code_conf})" + ); + } + #[test] fn context_rule_passthrough() { let engine = PatternEngine::builder() diff --git a/crates/nvisy-pattern/src/lib.rs b/crates/nvisy-pattern/src/lib.rs index 839d727..9f50527 100644 --- a/crates/nvisy-pattern/src/lib.rs +++ b/crates/nvisy-pattern/src/lib.rs @@ -2,15 +2,13 @@ #![cfg_attr(docsrs, feature(doc_cfg))] #![doc = include_str!("../README.md")] -pub(crate) mod patterns; pub(crate) mod dictionaries; +pub mod engine; +pub(crate) mod patterns; pub(crate) mod validators; -mod engine; -pub use engine::{ - AllowList, DenyEntry, DenyList, DetectionSource, PatternEngine, PatternEngineBuilder, - PatternEngineError, PatternMatch, default_engine, -}; +pub use engine::{DetectionSource, PatternEngine, PatternEngineBuilder, PatternMatch}; pub use patterns::ContextRule; +#[doc(hidden)] pub mod prelude; diff --git a/crates/nvisy-pattern/src/patterns/json_pattern.rs b/crates/nvisy-pattern/src/patterns/json_pattern.rs index 67a5f46..7651d8f 100644 --- a/crates/nvisy-pattern/src/patterns/json_pattern.rs +++ b/crates/nvisy-pattern/src/patterns/json_pattern.rs @@ -35,9 +35,6 @@ pub enum JsonPatternWarning { UnknownValidator { pattern: String, validator: String }, } -/// Default confidence score when `"confidence"` is omitted from JSON. -const DEFAULT_CONFIDENCE: f64 = 1.0; - /// A detection pattern deserialized from a JSON definition file. 
/// /// Implements the [`Pattern`] trait and is the only concrete implementation @@ -48,7 +45,6 @@ pub struct JsonPattern { category: EntityCategory, entity_kind: EntityKind, match_source: MatchSource, - confidence: f64, pub(crate) context: Option, } @@ -84,8 +80,6 @@ impl JsonPattern { #[serde(flatten)] source: RawSource, #[serde(default)] - confidence: Option, - #[serde(default)] context: Option, } @@ -118,7 +112,6 @@ impl JsonPattern { category: raw.category, entity_kind: raw.entity_kind, match_source, - confidence: raw.confidence.unwrap_or(DEFAULT_CONFIDENCE), context: raw.context, }; @@ -143,10 +136,6 @@ impl Pattern for JsonPattern { &self.match_source } - fn confidence(&self) -> f64 { - self.confidence - } - fn context(&self) -> Option<&ContextRule> { self.context.as_ref() } diff --git a/crates/nvisy-pattern/src/patterns/mod.rs b/crates/nvisy-pattern/src/patterns/mod.rs index 8433777..878151d 100644 --- a/crates/nvisy-pattern/src/patterns/mod.rs +++ b/crates/nvisy-pattern/src/patterns/mod.rs @@ -19,7 +19,7 @@ mod pattern; pub use context_rule::ContextRule; pub use json_pattern::{JsonPattern, JsonPatternWarning}; -pub use pattern::{BoxPattern, MatchSource, Pattern}; +pub use pattern::{BoxPattern, DictionaryConfidence, MatchSource, Pattern}; use std::collections::BTreeMap; use std::sync::LazyLock; @@ -187,11 +187,18 @@ mod tests { for p in registry().values() { assert!(!p.name().is_empty(), "pattern name is empty"); match p.match_source() { - MatchSource::Regex(rp) => assert!(!rp.regex.is_empty(), "regex is empty for {}", p.name()), - MatchSource::Dictionary(dp) => assert!(!dp.name.is_empty(), "dictionary is empty for {}", p.name()), + MatchSource::Regex(rp) => { + assert!(!rp.regex.is_empty(), "regex is empty for {}", p.name()); + assert!(rp.confidence > 0.0, "confidence is 0 for {}", p.name()); + assert!(rp.confidence <= 1.0, "confidence > 1 for {}", p.name()); + } + MatchSource::Dictionary(dp) => { + assert!(!dp.name.is_empty(), "dictionary is empty for 
{}", p.name()); + let c = dp.confidence.resolve(0); + assert!(c > 0.0, "confidence is 0 for {}", p.name()); + assert!(c <= 1.0, "confidence > 1 for {}", p.name()); + } } - assert!(p.confidence() > 0.0, "confidence is 0 for {}", p.name()); - assert!(p.confidence() <= 1.0, "confidence > 1 for {}", p.name()); } } @@ -229,8 +236,7 @@ mod tests { "name": "test", "category": "pii", "entity_type": "government_id", - "pattern": { "regex": "\\d+" }, - "confidence": 0.9 + "pattern": { "regex": "\\d+", "confidence": 0.9 } }"#; let (pattern, _warnings) = JsonPattern::from_bytes(json).unwrap(); diff --git a/crates/nvisy-pattern/src/patterns/pattern.rs b/crates/nvisy-pattern/src/patterns/pattern.rs index 5186e42..724e7f0 100644 --- a/crates/nvisy-pattern/src/patterns/pattern.rs +++ b/crates/nvisy-pattern/src/patterns/pattern.rs @@ -11,7 +11,7 @@ use nvisy_ontology::entity::{EntityCategory, EntityKind}; use super::context_rule::ContextRule; /// A regex-based match source with an optional post-match validator. -#[derive(Debug, Clone, PartialEq, Eq, Deserialize)] +#[derive(Debug, Clone, PartialEq, Deserialize)] pub struct RegexPattern { /// The regular expression string. pub regex: String, @@ -27,10 +27,67 @@ pub struct RegexPattern { /// inline `(?i)` or equivalent flag. #[serde(default)] pub case_sensitive: bool, + /// Confidence score (0.0–1.0) assigned to matches from this pattern. + /// + /// Defaults to `1.0` when not specified. + #[serde(default = "default_confidence")] + pub confidence: f64, +} + +/// Confidence for a dictionary pattern: either a single uniform score +/// or per-column scores for CSV dictionaries. +#[derive(Debug, Clone, PartialEq)] +pub enum DictionaryConfidence { + /// Single confidence score applied to all entries. + Uniform(f64), + /// Per-column confidence scores. Entries from column `i` use index `i`. + /// Columns beyond the length fall back to the last value. 
+ PerColumn(Vec), +} + +impl DictionaryConfidence { + /// Resolve confidence for a given column index. + pub fn resolve(&self, column: usize) -> f64 { + match self { + Self::Uniform(c) => *c, + Self::PerColumn(cols) => cols.get(column).copied().unwrap_or_else(|| { + cols.last().copied().unwrap_or(DEFAULT_CONFIDENCE) + }), + } + } + +} + +impl Default for DictionaryConfidence { + fn default() -> Self { + Self::Uniform(DEFAULT_CONFIDENCE) + } +} + +/// Serde helper — accepts either a single number or an array of numbers. +mod confidence_serde { + use super::DictionaryConfidence; + use serde::{Deserialize, Deserializer}; + + #[derive(Deserialize)] + #[serde(untagged)] + enum Raw { + Uniform(f64), + PerColumn(Vec), + } + + pub fn deserialize<'de, D: Deserializer<'de>>( + deserializer: D, + ) -> Result { + Ok(match Raw::deserialize(deserializer)? { + Raw::Uniform(c) => DictionaryConfidence::Uniform(c), + Raw::PerColumn(v) => DictionaryConfidence::PerColumn(v), + }) + } } /// A dictionary-based match source. -#[derive(Debug, Clone, PartialEq, Eq, Deserialize)] +#[derive(Debug, Clone, PartialEq, Deserialize)] pub struct DictionaryPattern { /// Named dictionary from the [`DictionaryRegistry`]. /// @@ -42,6 +99,16 @@ pub struct DictionaryPattern { /// `ascii_case_insensitive` setting. #[serde(default)] pub case_sensitive: bool, + /// Confidence score(s) for matches from this dictionary. + /// + /// A single number applies uniformly to all entries. + /// An array assigns per-column confidence for CSV dictionaries + /// (e.g. `[0.85, 0.55]` gives column 0 entries 0.85 and column 1 + /// entries 0.55). + /// + /// Defaults to `1.0` when not specified. + #[serde(default, deserialize_with = "confidence_serde::deserialize")] + pub confidence: DictionaryConfidence, } /// How a pattern finds matches in text. 
@@ -49,7 +116,7 @@ pub struct DictionaryPattern { /// Each pattern uses exactly one source: either a regular expression that /// is compiled and run against text spans, or a named dictionary whose /// entries are matched literally. -#[derive(Debug, Clone, PartialEq, Eq)] +#[derive(Debug, Clone, PartialEq)] pub enum MatchSource { /// Match via a compiled regular expression. Regex(RegexPattern), @@ -70,6 +137,13 @@ pub enum MatchSource { /// from the JSON files under `assets/patterns/`. /// /// [`JsonPattern`]: super::JsonPattern +/// Default confidence score when `"confidence"` is omitted from JSON. +pub const DEFAULT_CONFIDENCE: f64 = 1.0; + +fn default_confidence() -> f64 { + DEFAULT_CONFIDENCE +} + pub trait Pattern: Send + Sync { /// Unique name identifying this pattern (e.g. `"ssn"`, `"credit-card"`). fn name(&self) -> &str; @@ -82,15 +156,11 @@ pub trait Pattern: Send + Sync { /// How this pattern matches text: regex or dictionary lookup. /// - /// For regex patterns, the validator (if any) is embedded in the - /// [`MatchSource::Regex`] variant. + /// Confidence scores are embedded in the match source itself: + /// [`RegexPattern::confidence`] for regex, [`DictionaryPattern::confidence`] + /// for dictionaries. fn match_source(&self) -> &MatchSource; - /// Base confidence score (0.0–1.0) assigned to every raw match. - /// - /// Defaults to `1.0` when not specified in the pattern definition. - fn confidence(&self) -> f64; - /// Optional co-occurrence context rule for span-level confidence boosting. fn context(&self) -> Option<&ContextRule> { None diff --git a/crates/nvisy-pattern/src/prelude.rs b/crates/nvisy-pattern/src/prelude.rs index 60f8968..f045dfd 100644 --- a/crates/nvisy-pattern/src/prelude.rs +++ b/crates/nvisy-pattern/src/prelude.rs @@ -5,6 +5,5 @@ //! 
``` pub use crate::{ - AllowList, ContextRule, DenyEntry, DenyList, DetectionSource, PatternEngine, - PatternEngineBuilder, PatternEngineError, PatternMatch, default_engine, + ContextRule, DetectionSource, PatternEngine, PatternEngineBuilder, PatternMatch, }; diff --git a/crates/nvisy-rig/Cargo.toml b/crates/nvisy-rig/Cargo.toml index db3b506..7ad6990 100644 --- a/crates/nvisy-rig/Cargo.toml +++ b/crates/nvisy-rig/Cargo.toml @@ -21,15 +21,37 @@ rustdoc-args = ["--cfg", "docsrs"] [dependencies] # Internal crates -nvisy-codec = { workspace = true, features = [] } nvisy-core = { workspace = true, features = [] } +nvisy-ontology = { workspace = true, features = [] } # LLM framework -rig-core = { workspace = true, features = ["derive"] } +rig-core = { workspace = true, features = ["derive", "reqwest-middleware"] } # Async runtime async-trait = { workspace = true, features = [] } +tokio = { workspace = true, features = ["time"] } + +# HTTP middleware (retry, tracing) +reqwest-middleware = { workspace = true } +reqwest-retry = { workspace = true } +reqwest-tracing = { workspace = true } + +# Encoding +base64 = { workspace = true, features = [] } # (De)serialization serde = { workspace = true, features = ["derive"] } serde_json = { workspace = true, features = [] } +schemars = { workspace = true, features = [] } + +# Error handling +thiserror = { workspace = true, features = [] } + +# Identifiers +uuid = { workspace = true, features = ["v7"] } + +# Observability +tracing = { workspace = true, features = [] } + +[dev-dependencies] +tokio = { workspace = true, features = ["macros", "rt"] } diff --git a/crates/nvisy-rig/src/agent/base/agent.rs b/crates/nvisy-rig/src/agent/base/agent.rs new file mode 100644 index 0000000..18a951f --- /dev/null +++ b/crates/nvisy-rig/src/agent/base/agent.rs @@ -0,0 +1,211 @@ +//! Foundation agent that wraps provider-specific rig-core agents. 
+ +use reqwest_middleware::ClientWithMiddleware; +use rig::agent::Agent; +use rig::completion::{Completion, Prompt}; +use rig::providers::{anthropic, gemini, ollama, openai}; +use schemars::JsonSchema; +use serde::de::DeserializeOwned; +use serde::Serialize; +use uuid::Uuid; + +use crate::backend::{ContextWindow, Provider, UsageTracker}; +use crate::bridge::ResponseParser; +use crate::error::Error; + +use super::BaseAgentBuilder; + +/// Sampling, retry, context-window, and preamble settings shared by all agents. +#[derive(Debug, Clone)] +pub struct BaseAgentConfig { + /// Sampling temperature (default: 0.1). + pub temperature: f64, + /// Maximum output tokens (default: 4096). + pub max_tokens: u64, + /// Maximum retries for transient HTTP errors (default: 3). + pub max_retries: u32, + /// Context window for chunking large inputs. + pub context_window: Option, + /// System prompt (preamble) for the agent. + pub preamble: Option, +} + +impl Default for BaseAgentConfig { + fn default() -> Self { + Self { + temperature: 0.1, + max_tokens: 4096, + max_retries: 3, + context_window: None, + preamble: None, + } + } +} + +pub(crate) enum Agents { + OpenAi(Agent>), + Anthropic(Agent>), + Gemini(Agent>), + Ollama(Agent>), +} + +macro_rules! dispatch { + ($inner:expr, |$agent:ident| $body:expr) => { + match $inner { + Agents::OpenAi($agent) => $body, + Agents::Anthropic($agent) => $body, + Agents::Gemini($agent) => $body, + Agents::Ollama($agent) => $body, + } + }; +} + +/// Internal foundation agent wrapping a provider-specific rig-core agent +/// with usage tracking and structured-output fallback. +/// +/// Specialized agents ([`NerAgent`], [`CvAgent`], [`OcrAgent`]) compose this +/// type rather than inheriting from it. 
+/// +/// [`NerAgent`]: crate::NerAgent +/// [`CvAgent`]: crate::CvAgent +/// [`OcrAgent`]: crate::OcrAgent +#[allow(dead_code)] +pub(crate) struct BaseAgent { + pub(super) id: Uuid, + pub(super) inner: Agents, + pub(super) context_window: Option, + pub(super) tracker: UsageTracker, +} + +#[allow(dead_code)] +impl BaseAgent { + pub fn builder(provider: &Provider, config: BaseAgentConfig) -> BaseAgentBuilder { + BaseAgentBuilder::new(provider, config) + } + + pub fn id(&self) -> Uuid { + self.id + } + + pub fn tracker(&self) -> &UsageTracker { + &self.tracker + } + + /// Structured-output prompt with usage tracking and JSON fallback. + /// + /// Sends a completion request with an `output_schema` so the provider + /// constrains its response to valid JSON matching `T`. On deserialization + /// failure the raw text is re-parsed via [`ResponseParser`]. + #[tracing::instrument(skip_all, fields(agent_id = %self.id, mode = "structured"))] + pub async fn prompt_structured(&self, prompt: &str) -> Result + where + T: DeserializeOwned + Default + JsonSchema + Serialize + Send + Sync, + { + let schema = schemars::schema_for!(T); + + let (text, usage) = dispatch!(&self.inner, |agent| { + let builder = agent + .completion(prompt, vec![]) + .await + .map_err(Error::from)? + .output_schema(schema); + + let response = builder.send().await.map_err(Error::from)?; + let parsed = ResponseParser::extract_text(&response)?; + Ok::<_, Error>((parsed.into_string(), response.usage)) + })?; + + self.tracker.record(&usage, 0); + + let parser = ResponseParser::from_text(&text); + match serde_json::from_str::(&text) { + Ok(value) => { + tracing::debug!("structured output succeeded"); + Ok(value) + } + Err(structured_err) => { + tracing::warn!( + error = %structured_err, + "structured JSON parse failed, falling back to text-based parsing" + ); + parser.parse_json() + } + } + } + + /// Text completion with usage tracking. 
+ #[tracing::instrument(skip_all, fields(agent_id = %self.id, mode = "text"))] + pub async fn prompt_text(&self, prompt: &str) -> Result { + let (text, usage) = dispatch!(&self.inner, |agent| { + let builder = agent + .completion(prompt, vec![]) + .await + .map_err(Error::from)?; + + let response = builder.send().await.map_err(Error::from)?; + let parsed = ResponseParser::extract_text(&response)?; + Ok::<_, Error>((parsed.into_string(), response.usage)) + })?; + + self.tracker.record(&usage, 0); + Ok(text) + } + + /// Plain text completion (no usage tracking). + #[tracing::instrument(skip_all, fields(agent_id = %self.id, mode = "prompt"))] + pub async fn prompt(&self, prompt: &str) -> Result { + dispatch!(&self.inner, |agent| { + agent.prompt(prompt).await.map_err(Error::from) + }) + } + + /// Summarize text to fit within the context window's input budget. + /// + /// Returns the text unchanged when no context window is configured or + /// the text already fits. + #[tracing::instrument(skip_all, fields(agent_id = %self.id, mode = "compact"))] + pub async fn prompt_compact(&self, text: &str) -> Result { + let cw = match &self.context_window { + Some(cw) if !cw.fits(text) => cw, + _ => return Ok(text.to_owned()), + }; + + let budget = cw.input_budget(); + let prompt = format!( + "Summarize the following text to fit within {budget} tokens. \ + Preserve all key entities, names, numbers, dates, and facts. \ + Remove redundancy and filler. Return ONLY the condensed text, \ + no preamble.\n\n{text}" + ); + + self.prompt_text(&prompt).await + } + + /// Split text via [`ContextWindow`], run `prompt_structured` per chunk, + /// and flatten results. 
+ #[tracing::instrument(skip_all, fields(agent_id = %self.id, mode = "chunked"))] + pub async fn prompt_chunked( + &self, + text: &str, + build_prompt: F, + ) -> Result, Error> + where + T: DeserializeOwned + Default + JsonSchema + Serialize + Send + Sync, + F: Fn(&str) -> String, + Vec: Default, + { + let chunks = match &self.context_window { + Some(cw) => cw.split_to_fit(text), + None => vec![text], + }; + + let mut all_results = Vec::new(); + for chunk in chunks { + let prompt = build_prompt(chunk); + let chunk_results: Vec = self.prompt_structured(&prompt).await?; + all_results.extend(chunk_results); + } + + Ok(all_results) + } +} diff --git a/crates/nvisy-rig/src/agent/base/builder.rs b/crates/nvisy-rig/src/agent/base/builder.rs new file mode 100644 index 0000000..5cae5ba --- /dev/null +++ b/crates/nvisy-rig/src/agent/base/builder.rs @@ -0,0 +1,140 @@ +//! Builder for [`BaseAgent`](super::BaseAgent). + +use reqwest_middleware::ClientWithMiddleware; +use rig::agent::{Agent, AgentBuilder}; +use rig::client::CompletionClient; +use rig::completion::CompletionModel; +use rig::providers::{anthropic, gemini, ollama, openai}; +use rig::tool::{Tool, ToolDyn}; +use uuid::Uuid; + +use crate::backend::{Provider, UsageTracker, build_http_client}; +use super::{Agents, BaseAgent, BaseAgentConfig}; +use crate::error::Error; + +/// Builder for [`BaseAgent`]. +/// +/// Created via [`BaseAgent::builder`]. Collects a provider reference, config, +/// and optional tools, then constructs the concrete rig-core agent on +/// [`build`](Self::build). +pub(crate) struct BaseAgentBuilder { + provider: Provider, + config: BaseAgentConfig, + tools: Vec>, +} + +impl BaseAgentBuilder { + pub fn new(provider: &Provider, config: BaseAgentConfig) -> Self { + Self { + provider: provider.clone(), + config, + tools: Vec::new(), + } + } + + /// Register a tool the agent can call during prompts. 
+ pub fn tool(mut self, tool: impl Tool + 'static) -> Self { + self.tools.push(Box::new(tool)); + self + } + + /// Build the [`BaseAgent`], constructing the provider-specific rig client. + pub fn build(self) -> Result { + let Self { + provider, + config, + tools, + } = self; + + let http_client = build_http_client(config.max_retries); + let preamble = config.preamble.as_deref(); + + let inner = match &provider { + Provider::OpenAi(p) => { + let mut b = openai::Client::::builder() + .api_key(&p.api_key) + .http_client(http_client); + if let Some(url) = &p.base_url { + b = b.base_url(url); + } + let client = b.build().map_err(|e| Error::Client(e.to_string()))?; + let model = client.completions_api().completion_model(&p.model); + Agents::OpenAi(build_rig_agent(model, &config, preamble, tools)) + } + Provider::Anthropic(p) => { + let mut b = anthropic::Client::::builder() + .api_key(&p.api_key) + .http_client(http_client); + if let Some(url) = &p.base_url { + b = b.base_url(url); + } + let client = b.build().map_err(|e| Error::Client(e.to_string()))?; + let model = client.completion_model(&p.model); + Agents::Anthropic(build_rig_agent(model, &config, preamble, tools)) + } + Provider::Gemini(p) => { + let mut b = gemini::Client::::builder() + .api_key(&p.api_key) + .http_client(http_client); + if let Some(url) = &p.base_url { + b = b.base_url(url); + } + let client = b.build().map_err(|e| Error::Client(e.to_string()))?; + // rig-core 0.31: Gemini's Capabilities doesn't propagate H, + // so CompletionClient is unavailable for non-default H. 
+ let model = gemini::completion::CompletionModel::new(client, &p.model); + Agents::Gemini(build_rig_agent(model, &config, preamble, tools)) + } + Provider::Ollama(p) => { + let mut b = ollama::Client::::builder() + .api_key(rig::client::Nothing) + .http_client(http_client); + if let Some(url) = &p.base_url { + b = b.base_url(url); + } + let client = b.build().map_err(|e| Error::Client(e.to_string()))?; + let model = client.completion_model(&p.model); + Agents::Ollama(build_rig_agent(model, &config, preamble, tools)) + } + }; + + Ok(BaseAgent { + id: Uuid::now_v7(), + inner, + context_window: config.context_window, + tracker: UsageTracker::new(), + }) + } +} + +/// Build a concrete rig-core `Agent`. +/// +/// Generic over `M` but only called inside [`BaseAgentBuilder::build`] — +/// the generic never escapes the module boundary. +fn build_rig_agent( + model: M, + config: &BaseAgentConfig, + preamble: Option<&str>, + tools: Vec>, +) -> Agent { + // AgentBuilder uses typestate: `.tools()` changes the type parameter, + // so the with-tools and without-tools paths cannot share a binding. + if tools.is_empty() { + let mut b = AgentBuilder::new(model) + .temperature(config.temperature) + .max_tokens(config.max_tokens); + if let Some(p) = preamble { + b = b.preamble(p); + } + b.build() + } else { + let mut b = AgentBuilder::new(model) + .temperature(config.temperature) + .max_tokens(config.max_tokens) + .tools(tools); + if let Some(p) = preamble { + b = b.preamble(p); + } + b.build() + } +} diff --git a/crates/nvisy-rig/src/agent/base/mod.rs b/crates/nvisy-rig/src/agent/base/mod.rs new file mode 100644 index 0000000..914639e --- /dev/null +++ b/crates/nvisy-rig/src/agent/base/mod.rs @@ -0,0 +1,8 @@ +//! Foundation agent and builder shared by all specialized agents. 
+ +mod agent; +mod builder; + +pub use agent::BaseAgentConfig; +pub(crate) use agent::{Agents, BaseAgent}; +pub(crate) use builder::BaseAgentBuilder; diff --git a/crates/nvisy-rig/src/agent/cv/mod.rs b/crates/nvisy-rig/src/agent/cv/mod.rs new file mode 100644 index 0000000..8f42f79 --- /dev/null +++ b/crates/nvisy-rig/src/agent/cv/mod.rs @@ -0,0 +1,119 @@ +//! Computer vision agent for face, license plate, and signature detection. +//! +//! [`CvAgent`] wraps a [`BaseAgent`](crate::backend::BaseAgent) with a +//! [`CvProvider`]-backed tool. It encodes an image as base64, prompts the +//! VLM to call the CV tool, and returns classified entities with bounding +//! boxes. + +mod output; +mod prompt; +mod tool; + +pub use output::{CvEntities, CvEntity}; + +use async_trait::async_trait; +use base64::Engine; +use base64::engine::general_purpose::STANDARD; +use serde::Serialize; +use uuid::Uuid; + +use crate::backend::{DetectionConfig, Provider, UsageTracker}; +use super::{BaseAgent, BaseAgentConfig}; +use crate::error::Error; +use prompt::{CV_SYSTEM_PROMPT, CvPromptBuilder}; +use tool::CvRigTool; + +/// A single computer-vision detection result returned by a [`CvProvider`]. +/// +/// This is the raw output from the CV backend before the VLM classifies +/// detections into entity categories. It carries a human-readable label, +/// a confidence score, and a pixel-space bounding box. +#[derive(Debug, Clone, PartialEq, Serialize)] +pub struct CvDetection { + /// Label for the detected object (e.g. `"face"`, `"license_plate"`). + pub label: String, + /// Detection confidence in the range `0.0..=1.0`. + pub confidence: f64, + /// Bounding box as `[x, y, width, height]` in pixels. + pub bbox: [f64; 4], +} + +/// Trait for computer-vision capabilities (face/plate/signature detection). +/// +/// Consumers implement this trait to supply object detection from images. 
+/// The trait is intentionally free of rig-core types so it can be +/// implemented in any crate without pulling in the LLM framework. +#[async_trait] +pub trait CvProvider: Send + Sync { + /// Detect objects in raw image bytes (PNG, JPEG, etc.). + async fn detect_objects(&self, image_data: &[u8]) -> Result, Error>; +} + +/// VLM agent that detects privacy-sensitive objects in images. +/// +/// # Workflow +/// +/// 1. Caller passes raw image bytes to [`detect`](Self::detect). +/// 2. The agent base64-encodes the image and builds a user prompt via +/// [`CvPromptBuilder`]. +/// 3. The VLM is instructed to call the `cv_detect_objects` tool (backed +/// by the [`CvProvider`]) and then classify each detection into an +/// entity category and type. +/// 4. Structured output is parsed into a `Vec`. +pub struct CvAgent { + base: BaseAgent, +} + +impl CvAgent { + /// Create a new CV agent. + pub fn new( + provider: &Provider, + mut config: BaseAgentConfig, + cv: impl CvProvider + 'static, + ) -> Result { + config.preamble.get_or_insert_with(|| CV_SYSTEM_PROMPT.into()); + let base = BaseAgent::builder(provider, config) + .tool(CvRigTool::new(cv)) + .build()?; + Ok(Self { base }) + } + + /// Unique identifier for this agent instance (UUIDv7). + pub fn id(&self) -> Uuid { + self.base.id() + } + + /// Access the usage tracker for this agent's LLM calls. + pub fn tracker(&self) -> &UsageTracker { + self.base.tracker() + } + + /// Detect privacy-sensitive objects in an image. 
+ #[tracing::instrument( + skip_all, + fields(image_bytes = image_data.len(), agent = "cv"), + )] + pub async fn detect( + &self, + image_data: &[u8], + config: &DetectionConfig, + ) -> Result, Error> { + let image_b64 = STANDARD.encode(image_data); + tracing::debug!( + b64_len = image_b64.len(), + entity_kinds = config.entity_kinds.len(), + "encoded image, building prompt" + ); + + let prompt = CvPromptBuilder::new(config).build(&image_b64); + + let result: CvEntities = self.base.prompt_structured(&prompt).await?; + + tracing::info!( + entity_count = result.entities.len(), + "cv detection complete" + ); + + Ok(result.entities) + } +} diff --git a/crates/nvisy-rig/src/agent/cv/output.rs b/crates/nvisy-rig/src/agent/cv/output.rs new file mode 100644 index 0000000..32331b7 --- /dev/null +++ b/crates/nvisy-rig/src/agent/cv/output.rs @@ -0,0 +1,28 @@ +//! Structured output types for CV detection. + +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; + +use nvisy_ontology::entity::{EntityCategory, EntityKind}; + +/// A single entity detected by computer vision. +#[derive(Debug, Clone, PartialEq, Deserialize, Serialize, JsonSchema)] +pub struct CvEntity { + /// Broad classification. + pub category: EntityCategory, + /// Specific entity type. + pub entity_type: EntityKind, + /// Label from the CV model (e.g. "face", "license_plate"). + pub label: String, + /// Detection confidence (0.0..=1.0). + pub confidence: f64, + /// Bounding box `[x, y, width, height]` in pixels. + pub bbox: [f64; 4], +} + +/// Wrapper for structured output parsing. +#[derive(Debug, Clone, Default, PartialEq, Deserialize, Serialize, JsonSchema)] +pub struct CvEntities { + /// Detected entities. + pub entities: Vec, +} diff --git a/crates/nvisy-rig/src/agent/cv/prompt.rs b/crates/nvisy-rig/src/agent/cv/prompt.rs new file mode 100644 index 0000000..81c3048 --- /dev/null +++ b/crates/nvisy-rig/src/agent/cv/prompt.rs @@ -0,0 +1,66 @@ +//! CV-specific prompt construction. +//! +//! 
[`CvPromptBuilder`] constructs the user prompt that instructs the VLM +//! to call the CV tool and classify detections into entity categories. + +use crate::backend::DetectionConfig; + +/// Fallback when no specific entity types are requested. +const ALL_TYPES_HINT: &str = "all detectable object types"; + +/// Builds user prompts for CV-based object detection. +/// +/// Encodes entity-kind filters and confidence thresholds into the prompt +/// alongside the base64-encoded image data. +pub(crate) struct CvPromptBuilder<'a> { + config: &'a DetectionConfig, +} + +impl<'a> CvPromptBuilder<'a> { + /// Create a prompt builder from a [`DetectionConfig`]. + pub fn new(config: &'a DetectionConfig) -> Self { + Self { config } + } + + /// Build the user prompt for the given base64-encoded image. + pub fn build(&self, image_b64: &str) -> String { + let entity_hint = if self.config.entity_kinds.is_empty() { + ALL_TYPES_HINT.to_string() + } else { + self.config + .entity_kinds + .iter() + .map(|e| e.to_string()) + .collect::>() + .join(", ") + }; + + format!( + "Detect objects of types [{entity_hint}] with minimum confidence \ + {threshold:.2} in the following base64-encoded image using the \ + cv_detect_objects tool.\n\n\ + Image (base64): {image_b64}", + threshold = self.config.confidence_threshold, + ) + } +} + +/// Default system prompt for the CV agent. +pub(super) const CV_SYSTEM_PROMPT: &str = "\ +You are a vision-language model performing object detection for privacy-sensitive content in images. \ +You have access to a computer vision tool that detects faces, license plates, and signatures.\n\ +\n\ +Your workflow:\n\ +1. Use the cv_detect_objects tool to detect objects in the provided image.\n\ +2. Analyze the detections and classify each into an entity category (pii, phi, etc.) \ + and specific entity type.\n\ +3. 
Return a JSON array of detected entities, each with keys: \ + category, entity_type, label, confidence, bbox ([x, y, width, height] in pixels).\n\ +\n\ +Common entity mappings:\n\ +- face → category: pii, entity_type: biometric_data\n\ +- license_plate → category: pii, entity_type: vehicle_id\n\ +- signature → category: pii, entity_type: biometric_data\n\ +- handwriting → category: pii, entity_type: person_name (if it contains a name)\n\ +\n\ +If no objects are detected, return an empty array []."; diff --git a/crates/nvisy-rig/src/agent/cv/tool.rs b/crates/nvisy-rig/src/agent/cv/tool.rs new file mode 100644 index 0000000..bfc0ef1 --- /dev/null +++ b/crates/nvisy-rig/src/agent/cv/tool.rs @@ -0,0 +1,64 @@ +//! Internal rig `Tool` wrapper for [`CvProvider`]. + +use std::sync::Arc; + +use base64::Engine; +use base64::engine::general_purpose::STANDARD; +use rig::completion::ToolDefinition; +use rig::tool::Tool; +use schemars::JsonSchema; +use serde::Deserialize; + +use super::CvProvider; + +/// Arguments for the CV tool call. +#[derive(Debug, Clone, Deserialize, JsonSchema)] +pub(super) struct CvToolArgs { + /// Base64-encoded image data. + pub image_base64: String, +} + +/// Error returned by the CV tool. +#[derive(Debug, thiserror::Error)] +#[error("{0}")] +pub(super) struct CvToolError(String); + +/// Rig `Tool` wrapper around a [`CvProvider`] implementation. +pub(super) struct CvRigTool(Arc); + +impl CvRigTool { + pub fn new(provider: T) -> Self { + Self(Arc::new(provider)) + } +} + +impl Tool for CvRigTool { + const NAME: &'static str = "cv_detect_objects"; + + type Error = CvToolError; + type Args = CvToolArgs; + type Output = String; + + async fn definition(&self, _prompt: String) -> ToolDefinition { + ToolDefinition { + name: Self::NAME.to_string(), + description: "Detect objects (faces, license plates, signatures) in an image \ + using computer vision. Pass the image as a base64-encoded string." 
+ .to_string(), + parameters: serde_json::to_value(schemars::schema_for!(CvToolArgs)) + .unwrap_or_default(), + } + } + + async fn call(&self, args: Self::Args) -> Result { + let bytes = STANDARD + .decode(&args.image_base64) + .map_err(|e| CvToolError(format!("invalid base64: {e}")))?; + let detections = self + .0 + .detect_objects(&bytes) + .await + .map_err(|e| CvToolError(e.to_string()))?; + serde_json::to_string(&detections).map_err(|e| CvToolError(e.to_string())) + } +} diff --git a/crates/nvisy-rig/src/agent/mod.rs b/crates/nvisy-rig/src/agent/mod.rs new file mode 100644 index 0000000..2415c84 --- /dev/null +++ b/crates/nvisy-rig/src/agent/mod.rs @@ -0,0 +1,17 @@ +//! Specialized detection agents: NER (text), CV (vision), and OCR (image-to-text). +//! +//! Each agent composes a [`BaseAgent`](base::BaseAgent) with domain-specific +//! prompts and optional tools. Public types are re-exported from [`crate`] — +//! consumer code should not reach into submodules. + +mod base; +mod cv; +mod ner; +mod ocr; + +pub use base::BaseAgentConfig; +pub(crate) use base::BaseAgent; + +pub use cv::{CvAgent, CvDetection, CvEntities, CvEntity, CvProvider}; +pub use ner::{KnownNerEntity, NerAgent, NerContext, NerEntities, NerEntity, ResolvedOffsets}; +pub use ocr::{OcrAgent, OcrEntity, OcrOutput, OcrProvider, OcrTextRegion}; diff --git a/crates/nvisy-rig/src/agent/ner/context.rs b/crates/nvisy-rig/src/agent/ner/context.rs new file mode 100644 index 0000000..7a6490a --- /dev/null +++ b/crates/nvisy-rig/src/agent/ner/context.rs @@ -0,0 +1,168 @@ +//! Input context for NER detection calls. + +use super::{KnownNerEntity, NerEntity}; + +/// Input context for a single NER detection call. +/// +/// Bundles the text to analyse together with any previously identified +/// entities so the LLM can assign consistent `entity_id` values across +/// chunks or sequential calls. 
+/// +/// Use [`merge`](Self::merge) to accumulate entities from successive +/// detection calls, then update the text with [`set_text`](Self::set_text) +/// before the next call. +pub struct NerContext<'a> { + /// The text to analyse. + pub text: &'a str, + /// Accumulated known entities from prior detection calls. + pub known_entities: Vec, +} + +impl<'a> NerContext<'a> { + /// Create a context with no known entities. + pub fn new(text: &'a str) -> Self { + Self { + text, + known_entities: Vec::new(), + } + } + + /// Create a context with previously identified entities. + pub fn with_known(text: &'a str, known_entities: Vec) -> Self { + Self { + text, + known_entities, + } + } + + /// Set the text to analyse, keeping accumulated known entities. + pub fn set_text(&mut self, text: &'a str) { + self.text = text; + } + + /// Merge newly detected entities into the known set. + /// + /// For each entity: if a [`KnownNerEntity`] with the same `entity_id` + /// already exists, its `values` list is extended with any new surface + /// forms and new descriptions are appended. Otherwise a new + /// `KnownNerEntity` is created. + pub fn merge(&mut self, entities: Vec) { + for entity in entities { + if let Some(known) = self + .known_entities + .iter_mut() + .find(|k| k.entity_id == entity.entity_id) + { + // Add new surface form if not already present. + if !known.values.iter().any(|v| v == &entity.value) { + known.values.push(entity.value); + } + + // Append new description if not already present. + if let Some(desc) = entity.description + && !known.descriptions.iter().any(|d| d == &desc) + { + known.descriptions.push(desc); + } + + // Fill in entity_type if it was previously unknown. 
+ if known.entity_type.is_none() { + known.entity_type = entity.entity_type; + } + } else { + self.known_entities.push(KnownNerEntity { + entity_id: entity.entity_id, + entity_type: entity.entity_type, + values: vec![entity.value], + descriptions: entity.description.into_iter().collect(), + }); + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use nvisy_ontology::entity::EntityKind; + + fn ner_entity(id: &str, value: &str, desc: Option<&str>) -> NerEntity { + NerEntity { + entity_id: id.into(), + category: None, + entity_type: Some(EntityKind::PersonName), + value: value.into(), + confidence: None, + context: None, + description: desc.map(Into::into), + } + } + + #[test] + fn merge_creates_new_known_entity() { + let mut ctx = NerContext::new(""); + ctx.merge(vec![ner_entity("person_1", "John Smith", Some("the CEO"))]); + + assert_eq!(ctx.known_entities.len(), 1); + assert_eq!(ctx.known_entities[0].entity_id, "person_1"); + assert_eq!(ctx.known_entities[0].values, vec!["John Smith"]); + assert_eq!(ctx.known_entities[0].descriptions, vec!["the CEO"]); + } + + #[test] + fn merge_accumulates_surface_forms() { + let mut ctx = NerContext::new(""); + ctx.merge(vec![ner_entity("person_1", "John Smith", None)]); + ctx.merge(vec![ner_entity("person_1", "John", None)]); + ctx.merge(vec![ner_entity("person_1", "Mr. Smith", None)]); + // Duplicate value should not be added. + ctx.merge(vec![ner_entity("person_1", "John", None)]); + + assert_eq!(ctx.known_entities.len(), 1); + assert_eq!( + ctx.known_entities[0].values, + vec!["John Smith", "John", "Mr. 
Smith"], + ); + } + + #[test] + fn merge_accumulates_descriptions() { + let mut ctx = NerContext::new(""); + ctx.merge(vec![ner_entity("person_1", "Alice", Some("the CEO"))]); + ctx.merge(vec![ner_entity("person_1", "Alice", Some("signed the contract on Jan 5"))]); + + assert_eq!( + ctx.known_entities[0].descriptions, + vec!["the CEO", "signed the contract on Jan 5"], + ); + } + + #[test] + fn merge_deduplicates_descriptions() { + let mut ctx = NerContext::new(""); + ctx.merge(vec![ner_entity("person_1", "Alice", Some("the CEO"))]); + ctx.merge(vec![ner_entity("person_1", "Alice", Some("the CEO"))]); + + assert_eq!(ctx.known_entities[0].descriptions, vec!["the CEO"]); + } + + #[test] + fn merge_no_description() { + let mut ctx = NerContext::new(""); + ctx.merge(vec![ner_entity("person_1", "Alice", None)]); + + assert!(ctx.known_entities[0].descriptions.is_empty()); + } + + #[test] + fn merge_fills_missing_entity_type() { + let mut ctx = NerContext::new(""); + let mut e = ner_entity("org_1", "Acme", None); + e.entity_type = None; + ctx.merge(vec![e]); + assert!(ctx.known_entities[0].entity_type.is_none()); + + ctx.merge(vec![ner_entity("org_1", "Acme Corp", None)]); + assert_eq!(ctx.known_entities[0].entity_type, Some(EntityKind::PersonName)); + } +} diff --git a/crates/nvisy-rig/src/agent/ner/mod.rs b/crates/nvisy-rig/src/agent/ner/mod.rs new file mode 100644 index 0000000..bbccfe5 --- /dev/null +++ b/crates/nvisy-rig/src/agent/ner/mod.rs @@ -0,0 +1,84 @@ +//! Named Entity Recognition (NER) agent for textual PII/entity detection. +//! +//! [`NerAgent`] wraps a [`BaseAgent`](crate::backend::BaseAgent) with +//! NER-specific prompts. It is a pure LLM agent (no tools) that analyses +//! text and returns structured entity detections. 
+ +mod context; +mod output; +mod prompt; + +pub use context::NerContext; +pub use output::{KnownNerEntity, NerEntities, NerEntity, ResolvedOffsets}; + +use uuid::Uuid; + +use crate::backend::{DetectionConfig, Provider, UsageTracker}; +use super::{BaseAgent, BaseAgentConfig}; +use crate::error::Error; +use prompt::{NER_SYSTEM_PROMPT, NerPromptBuilder}; + +/// Agent for textual PII/entity detection using LLM-based NER. +/// +/// # Workflow +/// +/// 1. Caller passes a [`NerContext`] and a [`DetectionConfig`] to +/// [`detect`](Self::detect). +/// 2. The agent builds a user prompt via [`NerPromptBuilder`] that +/// specifies entity types, confidence thresholds, and known entities. +/// 3. Structured output is parsed into `Vec`. +pub struct NerAgent { + base: BaseAgent, +} + +impl NerAgent { + /// Create a new NER agent. + pub fn new(provider: &Provider, mut config: BaseAgentConfig) -> Result { + config.preamble.get_or_insert_with(|| NER_SYSTEM_PROMPT.into()); + let base = BaseAgent::builder(provider, config).build()?; + Ok(Self { base }) + } + + /// Unique identifier for this agent instance (UUIDv7). + pub fn id(&self) -> Uuid { + self.base.id() + } + + /// Access the usage tracker for this agent's LLM calls. + pub fn tracker(&self) -> &UsageTracker { + self.base.tracker() + } + + /// Detect entities in text using structured output with text-based fallback. + /// + /// When [`NerContext::known_entities`] is non-empty the LLM is + /// instructed to reuse their `entity_id` values for coreferent + /// mentions, enabling cross-chunk coreference resolution. 
+ #[tracing::instrument( + skip_all, + fields(text_len = ctx.text.len(), agent = "ner"), + )] + pub async fn detect( + &self, + ctx: &NerContext<'_>, + config: &DetectionConfig, + ) -> Result, Error> { + let prompt = NerPromptBuilder::new(config, &ctx.known_entities).build(ctx.text); + + tracing::debug!( + prompt_len = prompt.len(), + entity_kinds = config.entity_kinds.len(), + known = ctx.known_entities.len(), + "built ner prompt" + ); + + let result: NerEntities = self.base.prompt_structured(&prompt).await?; + + tracing::info!( + entity_count = result.entities.len(), + "ner detection complete" + ); + + Ok(result.entities) + } +} diff --git a/crates/nvisy-rig/src/agent/ner/output.rs b/crates/nvisy-rig/src/agent/ner/output.rs new file mode 100644 index 0000000..3a87845 --- /dev/null +++ b/crates/nvisy-rig/src/agent/ner/output.rs @@ -0,0 +1,172 @@ +//! Structured output types for NER entity detection. + +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; + +use nvisy_ontology::entity::{EntityCategory, EntityKind}; + +use super::NerContext; + +/// A list of NER entities returned by structured output. +#[derive(Debug, Clone, Default, PartialEq, Deserialize, Serialize, JsonSchema)] +pub struct NerEntities { + /// Detected entities. + pub entities: Vec, +} + +/// A single NER entity from structured LLM output. +#[derive(Debug, Clone, PartialEq, Deserialize, Serialize, JsonSchema)] +pub struct NerEntity { + /// Stable identifier for the real-world entity this mention refers to. + /// + /// All mentions of the same person, organisation, etc. share the same + /// `entity_id` (e.g. `"person_1"`). When known entities are provided + /// as context, the LLM reuses their IDs for coreferent mentions. + pub entity_id: String, + /// Broad classification (may be absent for coreferent mentions like pronouns). + pub category: Option, + /// Specific entity type (may be absent for coreferent mentions like pronouns). + pub entity_type: Option, + /// The matched text value. 
+ pub value: String, + /// Detection confidence (0.0..=1.0). + pub confidence: Option, + /// A short snippet of surrounding text that uniquely locates this mention + /// within the input. Used to compute byte offsets deterministically by + /// finding `context` in the span, then `value` within the `context`. + pub context: Option, + /// Brief description of the real-world entity (e.g. "CEO of Acme Corp, + /// mentioned as the signatory"). Carried forward via [`KnownNerEntity`] so + /// the LLM can disambiguate entities across chunks. + pub description: Option, +} + +/// A previously identified entity carried as context between detection calls. +/// +/// Lighter than [`NerEntity`] — holds only the information the LLM needs to +/// recognise and reuse an existing `entity_id`. Created via +/// [`NerContext::merge`]. +#[derive(Debug, Clone, PartialEq)] +pub struct KnownNerEntity { + /// Stable identifier (e.g. `"person_1"`). + pub entity_id: String, + /// Entity type, if known. + pub entity_type: Option, + /// All surface forms seen so far (e.g. `["John Smith", "John", "Mr. Smith"]`). + pub values: Vec, + /// Accumulated descriptions from successive detection calls. + pub descriptions: Vec, +} + +/// Resolved byte offsets for an entity mention within its source text. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct ResolvedOffsets { + /// Start byte offset in the source text. + pub start: usize, + /// End byte offset (exclusive) in the source text. + pub end: usize, +} + +impl NerEntity { + /// Resolve byte offsets of this entity's `value` within the text + /// from the [`NerContext`] that produced it. + /// + /// When `context` is present, first locates the context snippet in + /// the source text, then finds `value` within it. Falls back to + /// searching for `value` directly in the source text when `context` + /// is absent or not found. + /// + /// Returns `None` if the value cannot be located. 
+ pub fn resolve_offsets(&self, ctx: &NerContext<'_>) -> Option { + let text = ctx.text; + + if let Some(ref context) = self.context + && let Some(ctx_start) = text.find(context.as_str()) + && let Some(val_offset) = context.find(&self.value) + { + let start = ctx_start + val_offset; + return Some(ResolvedOffsets { + start, + end: start + self.value.len(), + }); + } + + // Fallback: search for value directly in the source text. + let start = text.find(&self.value)?; + Some(ResolvedOffsets { + start, + end: start + self.value.len(), + }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn entity(value: &str, context: Option<&str>) -> NerEntity { + NerEntity { + entity_id: "test_1".into(), + category: None, + entity_type: None, + value: value.into(), + confidence: None, + context: context.map(Into::into), + description: None, + } + } + + #[test] + fn resolve_with_context() { + let text = "Alice met Bob. Later Alice called him."; + let ctx = NerContext::new(text); + let e = entity("Alice", Some("Later Alice called")); + + let offsets = e.resolve_offsets(&ctx).unwrap(); + assert_eq!(offsets.start, 21); + assert_eq!(offsets.end, 26); + assert_eq!(&text[offsets.start..offsets.end], "Alice"); + } + + #[test] + fn resolve_without_context_finds_first() { + let text = "Alice met Bob. 
Later Alice called him."; + let ctx = NerContext::new(text); + let e = entity("Alice", None); + + let offsets = e.resolve_offsets(&ctx).unwrap(); + assert_eq!(offsets.start, 0); + assert_eq!(offsets.end, 5); + } + + #[test] + fn resolve_missing_value_returns_none() { + let text = "No match here."; + let ctx = NerContext::new(text); + let e = entity("Charlie", Some("with Charlie")); + + assert!(e.resolve_offsets(&ctx).is_none()); + } + + #[test] + fn resolve_context_not_found_falls_back() { + let text = "Alice is here."; + let ctx = NerContext::new(text); + let e = entity("Alice", Some("stale context from another chunk")); + + let offsets = e.resolve_offsets(&ctx).unwrap(); + assert_eq!(offsets.start, 0); + assert_eq!(offsets.end, 5); + } + + #[test] + fn resolve_disambiguates_duplicate_values() { + let text = "He went home. She said he was tired."; + let ctx = NerContext::new(text); + + let e1 = entity("he", Some("said he was")); + let offsets = e1.resolve_offsets(&ctx).unwrap(); + assert_eq!(&text[offsets.start..offsets.end], "he"); + assert_eq!(offsets.start, 23); + } +} diff --git a/crates/nvisy-rig/src/agent/ner/prompt.rs b/crates/nvisy-rig/src/agent/ner/prompt.rs new file mode 100644 index 0000000..27c54f6 --- /dev/null +++ b/crates/nvisy-rig/src/agent/ner/prompt.rs @@ -0,0 +1,69 @@ +//! NER-specific prompt construction. + +use crate::backend::DetectionConfig; +use crate::bridge::PromptBuilder; + +use super::KnownNerEntity; + +/// Builds user prompts for NER entity detection. +pub(crate) struct NerPromptBuilder<'a> { + inner: PromptBuilder<'a>, + known_entities: &'a [KnownNerEntity], +} + +impl<'a> NerPromptBuilder<'a> { + /// Create a prompt builder from a [`DetectionConfig`]. + pub fn new(config: &'a DetectionConfig, known_entities: &'a [KnownNerEntity]) -> Self { + Self { + inner: PromptBuilder::new(config), + known_entities, + } + } + + /// Build the user prompt for the given text. 
+ pub fn build(&self, text: &str) -> String { + let mut prompt = self.inner.build(text); + + if !self.known_entities.is_empty() { + prompt.push_str("\n\nPreviously identified entities (reuse their entity_id for coreferent mentions):\n"); + for e in self.known_entities { + let type_str = match &e.entity_type { + Some(t) => t.to_string(), + None => "unknown".to_string(), + }; + let values = e.values.iter().map(|v| format!("\"{v}\"")).collect::>().join(", "); + prompt.push_str(&format!( + "- entity_id={}, type={}, values=[{}]", + e.entity_id, type_str, values, + )); + if !e.descriptions.is_empty() { + let descs = e.descriptions.join("; "); + prompt.push_str(&format!(", description=\"{descs}\"")); + } + prompt.push('\n'); + } + } + + prompt + } +} + +/// Default system prompt for NER detection. +pub(super) const NER_SYSTEM_PROMPT: &str = "\ +You are a precise named-entity recognition system. \ +Identify personally identifiable information (PII), protected health information (PHI), \ +financial data, and credentials in the provided text. \ +Return results as a JSON object with an \"entities\" key containing an array of objects with keys: \ +entity_id, category (optional), entity_type (optional), value, confidence (optional), \ +context (optional), description (optional). \ +Assign a stable entity_id (e.g. \"person_1\", \"org_1\") to each unique real-world entity. \ +All mentions of the same entity must share the same entity_id. \ +When previously identified entities are provided, reuse their entity_id for any coreferent mentions. \ +The \"context\" field should be a short surrounding snippet of text that uniquely locates this \ +mention within the input. Include enough words before and after the value so that the context \ +string appears exactly once in the input text. This is especially important when the same value \ +(e.g. \"he\") appears multiple times. \ +The \"description\" field should be a brief description of the real-world entity \ +(e.g. 
\"CEO of Acme Corp\", \"patient's home address\"). Provide it for the first mention \ +of each entity or when additional context becomes available. \ +If no entities are found, return {\"entities\": []}."; diff --git a/crates/nvisy-rig/src/agent/ocr/mod.rs b/crates/nvisy-rig/src/agent/ocr/mod.rs new file mode 100644 index 0000000..539a034 --- /dev/null +++ b/crates/nvisy-rig/src/agent/ocr/mod.rs @@ -0,0 +1,123 @@ +//! OCR agent for vision-based text extraction and entity detection. +//! +//! [`OcrAgent`] wraps a [`BaseAgent`](crate::backend::BaseAgent) with an +//! [`OcrProvider`]-backed tool. It encodes an image as base64, prompts the +//! VLM to call the OCR tool, and returns extracted text together with any +//! entities found in it. + +mod output; +mod prompt; +mod tool; + +pub use output::{OcrOutput, OcrEntity}; + +use async_trait::async_trait; +use base64::Engine; +use base64::engine::general_purpose::STANDARD; +use serde::Serialize; +use uuid::Uuid; + +use crate::backend::{DetectionConfig, Provider, UsageTracker}; +use super::{BaseAgent, BaseAgentConfig}; +use crate::error::Error; +use prompt::{OCR_SYSTEM_PROMPT, OcrPromptBuilder}; +use tool::OcrRigTool; + +/// A single text region extracted by an OCR provider. +/// +/// Each region represents a contiguous block of text found in the image, +/// together with an optional bounding box and confidence score. +#[derive(Debug, Clone, PartialEq, Serialize)] +pub struct OcrTextRegion { + /// The extracted text content. + pub text: String, + /// Confidence of the OCR extraction (0.0..=1.0). + pub confidence: f64, + /// Optional bounding box `[x, y, width, height]` in pixels. + pub bbox: Option<[f64; 4]>, +} + +/// Trait for OCR capabilities that can be provided to VLM agents. +/// +/// Consumers implement this trait to supply text extraction from images. +/// The trait is intentionally free of rig-core types so it can be +/// implemented in any crate without pulling in the LLM framework. 
+/// +/// Implementations return a list of [`OcrTextRegion`]s, each carrying the +/// extracted text, a confidence score, and an optional pixel-space bounding +/// box. Returning multiple regions allows the downstream VLM to reason +/// about spatial layout (e.g. headers vs body text, table cells). +#[async_trait] +pub trait OcrProvider: Send + Sync { + /// Extract text regions from raw image bytes (PNG, JPEG, etc.). + async fn extract_text(&self, image_data: &[u8]) -> Result<Vec<OcrTextRegion>, Error>; +} + +/// VLM agent that extracts text from images and detects entities in it. +/// +/// # Workflow +/// +/// 1. Caller passes raw image bytes to [`extract_and_detect`](Self::extract_and_detect). +/// 2. The agent base64-encodes the image and builds a user prompt via +/// [`OcrPromptBuilder`]. +/// 3. The VLM is instructed to call the `ocr_extract_text` tool (backed by +/// the [`OcrProvider`]) and then analyse the result for PII/PHI entities. +/// 4. Structured output is parsed into [`OcrOutput`]. +pub struct OcrAgent { + base: BaseAgent, +} + +impl OcrAgent { + /// Create a new OCR agent. + pub fn new( + provider: &Provider, + mut config: BaseAgentConfig, + ocr: impl OcrProvider + 'static, + ) -> Result<Self, Error> { + config.preamble.get_or_insert_with(|| OCR_SYSTEM_PROMPT.into()); + let base = BaseAgent::builder(provider, config) + .tool(OcrRigTool::new(ocr)) + .build()?; + Ok(Self { base }) + } + + /// Unique identifier for this agent instance (UUIDv7). + pub fn id(&self) -> Uuid { + self.base.id() + } + + /// Access the usage tracker for this agent's LLM calls. + pub fn tracker(&self) -> &UsageTracker { + self.base.tracker() + } + + /// Extract text from an image and detect entities in the extracted text.
+ #[tracing::instrument( + skip_all, + fields(image_bytes = image_data.len(), agent = "ocr"), + )] + pub async fn extract_and_detect( + &self, + image_data: &[u8], + config: &DetectionConfig, + ) -> Result { + let image_b64 = STANDARD.encode(image_data); + tracing::debug!( + b64_len = image_b64.len(), + entity_kinds = config.entity_kinds.len(), + "encoded image, building prompt" + ); + + let prompt = OcrPromptBuilder::new(config).build(&image_b64); + + let output: OcrOutput = self.base.prompt_structured(&prompt).await?; + + tracing::info!( + text_len = output.extracted_text.len(), + entity_count = output.entities.len(), + "ocr extraction complete" + ); + + Ok(output) + } +} diff --git a/crates/nvisy-rig/src/agent/ocr/output.rs b/crates/nvisy-rig/src/agent/ocr/output.rs new file mode 100644 index 0000000..66baf8b --- /dev/null +++ b/crates/nvisy-rig/src/agent/ocr/output.rs @@ -0,0 +1,30 @@ +//! Structured output types for OCR entity detection. + +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; + +use nvisy_ontology::entity::{EntityCategory, EntityKind}; + +/// Top-level output from the OCR agent. +#[derive(Debug, Clone, Default, PartialEq, Deserialize, Serialize, JsonSchema)] +pub struct OcrOutput { + /// Full text extracted from the image. + pub extracted_text: String, + /// Entities detected in the extracted text. + pub entities: Vec, +} + +/// A single entity detected in OCR-extracted text. +#[derive(Debug, Clone, PartialEq, Deserialize, Serialize, JsonSchema)] +pub struct OcrEntity { + /// Broad classification. + pub category: EntityCategory, + /// Specific entity type. + pub entity_type: EntityKind, + /// The matched text value. + pub value: String, + /// Detection confidence (0.0..=1.0). + pub confidence: f64, + /// Optional bounding box `[x, y, width, height]` in pixels. 
+ pub bbox: Option<[f64; 4]>, +} diff --git a/crates/nvisy-rig/src/agent/ocr/prompt.rs b/crates/nvisy-rig/src/agent/ocr/prompt.rs new file mode 100644 index 0000000..bfb7384 --- /dev/null +++ b/crates/nvisy-rig/src/agent/ocr/prompt.rs @@ -0,0 +1,60 @@ +//! OCR-specific prompt construction. +//! +//! [`OcrPromptBuilder`] constructs the user prompt that instructs the VLM +//! to call the OCR tool and then detect entities in the extracted text. + +use crate::backend::{DetectionConfig, ALL_TYPES_HINT}; + +/// Builds user prompts for OCR-based entity extraction. +/// +/// Encodes entity-kind filters and confidence thresholds into the prompt +/// alongside the base64-encoded image data. +pub(crate) struct OcrPromptBuilder<'a> { + config: &'a DetectionConfig, +} + +impl<'a> OcrPromptBuilder<'a> { + /// Create a prompt builder from a [`DetectionConfig`]. + pub fn new(config: &'a DetectionConfig) -> Self { + Self { config } + } + + /// Build the user prompt for the given base64-encoded image. + pub fn build(&self, image_b64: &str) -> String { + let entity_hint = if self.config.entity_kinds.is_empty() { + ALL_TYPES_HINT.to_string() + } else { + self.config + .entity_kinds + .iter() + .map(|e| e.to_string()) + .collect::>() + .join(", ") + }; + + format!( + "Extract text from the following base64-encoded image using the \ + ocr_extract_text tool, then detect entities of types [{entity_hint}] \ + with minimum confidence {threshold:.2}.\n\n\ + Image (base64): {image_b64}", + threshold = self.config.confidence_threshold, + ) + } +} + +/// Default system prompt for the OCR agent. +pub(super) const OCR_SYSTEM_PROMPT: &str = "\ +You are a vision-language model performing OCR and entity detection on images. \ +You have access to an OCR tool that extracts text from images. \ +\n\ +Your workflow:\n\ +1. Use the ocr_extract_text tool to extract all text from the provided image.\n\ +2. 
Analyze the extracted text for personally identifiable information (PII), \ + protected health information (PHI), financial data, and credentials.\n\ +3. Return a JSON object with two fields:\n\ + - \"extracted_text\": the full text extracted from the image\n\ + - \"entities\": a JSON array of detected entities, each with keys: \ + category, entity_type, value, confidence, bbox (optional [x, y, w, h] array)\n\ +\n\ +If no entities are found, return an empty array for \"entities\". \ +If OCR produces no text, return an empty string for \"extracted_text\" and an empty array for \"entities\"."; diff --git a/crates/nvisy-rig/src/agent/ocr/tool.rs b/crates/nvisy-rig/src/agent/ocr/tool.rs new file mode 100644 index 0000000..66fd3b2 --- /dev/null +++ b/crates/nvisy-rig/src/agent/ocr/tool.rs @@ -0,0 +1,66 @@ +//! Internal rig `Tool` wrapper for [`OcrProvider`]. + +use std::sync::Arc; + +use base64::Engine; +use base64::engine::general_purpose::STANDARD; +use rig::completion::ToolDefinition; +use rig::tool::Tool; +use schemars::JsonSchema; +use serde::Deserialize; + +use super::OcrProvider; + +/// Arguments for the OCR tool call. +#[derive(Debug, Clone, Deserialize, JsonSchema)] +pub(super) struct OcrToolArgs { + /// Base64-encoded image data. + pub image_base64: String, +} + +/// Error returned by the OCR tool. +#[derive(Debug, thiserror::Error)] +#[error("{0}")] +pub(super) struct OcrToolError(String); + +/// Rig `Tool` wrapper around an [`OcrProvider`] implementation. +pub(super) struct OcrRigTool(Arc); + +impl OcrRigTool { + pub fn new(provider: T) -> Self { + Self(Arc::new(provider)) + } +} + +impl Tool for OcrRigTool { + const NAME: &'static str = "ocr_extract_text"; + + type Error = OcrToolError; + type Args = OcrToolArgs; + type Output = String; + + async fn definition(&self, _prompt: String) -> ToolDefinition { + ToolDefinition { + name: Self::NAME.to_string(), + description: "Extract text regions from an image using OCR. 
\ + Returns a JSON array of regions, each with text, \ + confidence, and optional bounding box. \ + Pass the image as a base64-encoded string." + .to_string(), + parameters: serde_json::to_value(schemars::schema_for!(OcrToolArgs)) + .unwrap_or_default(), + } + } + + async fn call(&self, args: Self::Args) -> Result { + let bytes = STANDARD + .decode(&args.image_base64) + .map_err(|e| OcrToolError(format!("invalid base64: {e}")))?; + let regions = self + .0 + .extract_text(&bytes) + .await + .map_err(|e| OcrToolError(e.to_string()))?; + serde_json::to_string(®ions).map_err(|e| OcrToolError(e.to_string())) + } +} diff --git a/crates/nvisy-rig/src/backend/context.rs b/crates/nvisy-rig/src/backend/context.rs new file mode 100644 index 0000000..d2f15d1 --- /dev/null +++ b/crates/nvisy-rig/src/backend/context.rs @@ -0,0 +1,191 @@ +//! Token budget estimation, text splitting, and truncation. +//! +//! [`ContextWindow`] provides a simple heuristic (~4 chars/token) to decide +//! whether text fits within a model's input budget and, when it doesn't, +//! to split or truncate it at sentence boundaries while staying UTF-8 safe. + +/// Token budget manager for a single model context window. +/// +/// All arithmetic is based on a rough **4 characters ≈ 1 token** heuristic. +/// This is intentionally conservative: over-splitting is harmless while +/// exceeding the real limit causes provider errors. +#[derive(Debug, Clone)] +pub struct ContextWindow { + /// Maximum tokens the model supports. + max_tokens: usize, + /// Tokens reserved for the output/completion. + reserve_output: usize, +} + +impl ContextWindow { + pub fn new(max_tokens: usize, reserve_output: usize) -> Self { + Self { + max_tokens, + reserve_output, + } + } + + /// Rough token count (~4 chars per token for English text). + pub fn estimate_tokens(text: &str) -> usize { + text.len().div_ceil(4) + } + + /// Input token budget (`max_tokens − reserve_output`). 
+ pub(crate) fn input_budget(&self) -> usize { + self.max_tokens.saturating_sub(self.reserve_output) + } + + /// Whether `text` fits within the input budget. + pub fn fits(&self, text: &str) -> bool { + Self::estimate_tokens(text) <= self.input_budget() + } + + /// Split text into chunks that each fit within the input budget. + /// + /// Prefers sentence boundaries (`. ` and `\n`) and is safe for + /// multi-byte UTF-8. + pub fn split_to_fit<'a>(&self, text: &'a str) -> Vec<&'a str> { + if self.fits(text) { + return vec![text]; + } + + let budget = self.input_budget(); + let char_budget = budget * 4; + + let mut chunks = Vec::new(); + let mut remaining = text; + + while !remaining.is_empty() { + if Self::estimate_tokens(remaining) <= budget { + chunks.push(remaining); + break; + } + + let take = snap_to_boundary(remaining, remaining.len().min(char_budget)); + let candidate = &remaining[..take]; + let split_pos = find_last_boundary(candidate).unwrap_or(take); + + let (chunk, rest) = remaining.split_at(split_pos); + if chunk.is_empty() { + let forced = snap_to_boundary(remaining, remaining.len().min(char_budget)); + let (chunk, rest) = remaining.split_at(forced); + chunks.push(chunk); + remaining = rest; + } else { + chunks.push(chunk); + remaining = rest.trim_start_matches(['\n', ' ']); + } + } + + chunks + } + + /// Truncate text to fit, keeping the **tail** (most recent context). + /// + /// Safe for multi-byte UTF-8. + pub fn truncate_to_fit<'a>(&self, text: &'a str) -> &'a str { + if self.fits(text) { + return text; + } + + let budget = self.input_budget(); + let char_budget = budget * 4; + + if text.len() <= char_budget { + return text; + } + + let start = snap_to_boundary(text, text.len() - char_budget); + let adjusted = text[start..] + .find(['\n', '.']) + .map(|pos| start + pos + 1) + .unwrap_or(start); + + let adjusted = snap_to_boundary(text, adjusted.min(text.len())); + &text[adjusted..] 
+ } +} + +/// Snap a byte position to the nearest valid UTF-8 char boundary (walks backward). +fn snap_to_boundary(text: &str, pos: usize) -> usize { + let mut p = pos.min(text.len()); + while p > 0 && !text.is_char_boundary(p) { + p -= 1; + } + p +} + +/// Last sentence boundary (`. ` or `\n`) in `text`. +fn find_last_boundary(text: &str) -> Option<usize> { + let last_newline = text.rfind('\n'); + let last_period = text.rfind(". ").map(|p| p + 2); + + match (last_newline, last_period) { + (Some(a), Some(b)) => Some(a.max(b)), + (Some(a), None) => Some(a), + (None, Some(b)) => Some(b), + (None, None) => None, + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn fits_within_budget() { + let cw = ContextWindow::new(100, 20); + assert!(cw.fits(&"a".repeat(300))); // ~75 tokens, budget 80 + assert!(!cw.fits(&"a".repeat(400))); // ~100 tokens, budget 80 + } + + #[test] + fn truncate_keeps_end() { + let cw = ContextWindow::new(10, 2); // budget = 8 tokens ≈ 32 chars + let text = "First sentence. Second sentence. Third sentence. 
Fourth sentence."; + let truncated = cw.truncate_to_fit(text); + assert!(truncated.len() <= 42); // 32 + slack for boundary + assert!(text.ends_with(truncated) || truncated.contains("sentence")); + } + + #[test] + fn snap_to_boundary_multibyte() { + let text = "a🔥b"; // byte 0: 'a', bytes 1–4: '🔥', byte 5: 'b' + assert_eq!(snap_to_boundary(text, 1), 1); + assert_eq!(snap_to_boundary(text, 2), 1); // mid-emoji → snap back + assert_eq!(snap_to_boundary(text, 3), 1); + assert_eq!(snap_to_boundary(text, 4), 1); + assert_eq!(snap_to_boundary(text, 5), 5); + } + + #[test] + fn split_to_fit_emoji() { + let cw = ContextWindow::new(4, 2); // budget = 2 tokens ≈ 8 bytes + let text = "🔥🔥🔥🔥"; // 16 bytes + let chunks = cw.split_to_fit(text); + assert!(chunks.len() >= 2); + for chunk in &chunks { + assert!(!chunk.is_empty()); + } + } + + #[test] + fn split_to_fit_cjk() { + let cw = ContextWindow::new(4, 2); // budget ≈ 8 bytes + let text = "你好世界测试文字"; // 24 bytes (3 bytes × 8 chars) + let chunks = cw.split_to_fit(text); + assert!(chunks.len() >= 2); + for chunk in &chunks { + assert!(!chunk.is_empty()); + } + } + + #[test] + fn truncate_to_fit_emoji() { + let cw = ContextWindow::new(4, 2); // budget ≈ 8 bytes + let text = "🔥🔥🔥🔥"; // 16 bytes + let truncated = cw.truncate_to_fit(text); + assert!(!truncated.is_empty()); + assert!(text.ends_with(truncated)); + } +} diff --git a/crates/nvisy-rig/src/backend/metrics.rs b/crates/nvisy-rig/src/backend/metrics.rs new file mode 100644 index 0000000..49e71f7 --- /dev/null +++ b/crates/nvisy-rig/src/backend/metrics.rs @@ -0,0 +1,54 @@ +//! Cumulative token-usage tracking across LLM requests. + +use std::sync::Mutex; + +use rig::completion::Usage; + +/// Thread-safe accumulator for LLM token usage. +/// +/// Each agent owns one tracker; callers snapshot it to inspect costs. +pub struct UsageTracker { + inner: Mutex, +} + +/// Point-in-time snapshot of accumulated usage counters. 
+#[derive(Debug, Default, Clone)] +pub struct UsageStats { + pub total_input_tokens: u64, + pub total_output_tokens: u64, + pub total_requests: u64, + pub total_retries: u64, +} + +impl UsageTracker { + pub fn new() -> Self { + Self { + inner: Mutex::new(UsageStats::default()), + } + } + + /// Record a single LLM request's token usage and retry count. + pub fn record(&self, usage: &Usage, retries: u32) { + let mut stats = self.inner.lock().expect("usage tracker lock poisoned"); + stats.total_input_tokens += usage.input_tokens; + stats.total_output_tokens += usage.output_tokens; + stats.total_requests += 1; + stats.total_retries += u64::from(retries); + } + + /// Snapshot the current counters without resetting them. + pub fn snapshot(&self) -> UsageStats { + self.inner.lock().expect("usage tracker lock poisoned").clone() + } + + /// Reset all counters to zero. + pub fn reset(&self) { + *self.inner.lock().expect("usage tracker lock poisoned") = UsageStats::default(); + } +} + +impl Default for UsageTracker { + fn default() -> Self { + Self::new() + } +} diff --git a/crates/nvisy-rig/src/backend/mod.rs b/crates/nvisy-rig/src/backend/mod.rs new file mode 100644 index 0000000..07660c0 --- /dev/null +++ b/crates/nvisy-rig/src/backend/mod.rs @@ -0,0 +1,43 @@ +//! LLM backend: provider connections, context windowing, and usage tracking. + +mod context; +mod metrics; +mod provider; + +pub use context::ContextWindow; +pub use metrics::{UsageStats, UsageTracker}; +pub use provider::{AuthenticatedProvider, Provider, UnauthenticatedProvider}; +pub(crate) use provider::build_http_client; + +use serde_json::Value; + +use nvisy_ontology::entity::EntityKind; + +/// Fallback hint used in prompts when no specific entity types are requested. +pub(crate) const ALL_TYPES_HINT: &str = "all entity types"; + +/// Configuration for entity detection: which types to look for and at what +/// confidence threshold. 
+#[derive(Debug, Clone)] +pub struct DetectionConfig { + /// Entity kinds to detect (empty = all). + pub entity_kinds: Vec<EntityKind>, + /// Minimum confidence score to include a detection (0.0..=1.0). + pub confidence_threshold: f64, + /// System prompt override (if set, replaces the agent's default). + pub system_prompt: Option<String>, +} + +/// Request payload for the detection service. +#[derive(Debug, Clone)] +pub struct DetectionRequest { + pub text: String, + pub config: DetectionConfig, +} + +/// Response from the detection service. +#[derive(Debug, Clone)] +pub struct DetectionResponse { + pub entities: Vec<Value>, + pub usage: Option<UsageStats>, +} diff --git a/crates/nvisy-rig/src/backend/provider.rs b/crates/nvisy-rig/src/backend/provider.rs new file mode 100644 index 0000000..be98030 --- /dev/null +++ b/crates/nvisy-rig/src/backend/provider.rs @@ -0,0 +1,118 @@ +//! LLM provider connection parameters. +//! +//! [`Provider`] is a plain enum carrying API keys, model names, and optional +//! base URLs. The actual rig-core client is constructed lazily when a +//! [`BaseAgent`](super::BaseAgent) is built. + +use std::time::Duration; + +use reqwest_middleware::ClientBuilder; +use reqwest_middleware::ClientWithMiddleware; +use reqwest_retry::{RetryTransientMiddleware, policies::ExponentialBackoff}; +use reqwest_tracing::TracingMiddleware; + +/// Provider that requires an API key (OpenAI, Anthropic, Gemini). +#[derive(Clone)] +pub struct AuthenticatedProvider { + pub api_key: String, + pub model: String, + pub base_url: Option<String>, +} + +/// Provider that does not require an API key (Ollama). +#[derive(Clone)] +pub struct UnauthenticatedProvider { + pub model: String, + pub base_url: Option<String>, +} + +/// Supported LLM providers. +/// +/// Each variant holds connection parameters and the model name. The actual +/// rig client is constructed lazily when an agent is built.
+/// +/// # Example +/// ```rust,ignore +/// let provider = Provider::openai("sk-...", "gpt-4o"); +/// let agent = NerAgent::new(&provider, config); +/// ``` +#[derive(Clone)] +pub enum Provider { + /// OpenAI (GPT-4o, GPT-4, etc.) + OpenAi(AuthenticatedProvider), + /// Anthropic (Claude) + Anthropic(AuthenticatedProvider), + /// Google Gemini + Gemini(AuthenticatedProvider), + /// Ollama (local models) + Ollama(UnauthenticatedProvider), +} + +impl Provider { + /// Create an OpenAI provider. + pub fn openai(api_key: &str, model: &str) -> Self { + Self::OpenAi(AuthenticatedProvider { + api_key: api_key.to_owned(), + model: model.to_owned(), + base_url: None, + }) + } + + /// Create an Anthropic provider. + pub fn anthropic(api_key: &str, model: &str) -> Self { + Self::Anthropic(AuthenticatedProvider { + api_key: api_key.to_owned(), + model: model.to_owned(), + base_url: None, + }) + } + + /// Create a Google Gemini provider. + pub fn gemini(api_key: &str, model: &str) -> Self { + Self::Gemini(AuthenticatedProvider { + api_key: api_key.to_owned(), + model: model.to_owned(), + base_url: None, + }) + } + + /// Create an Ollama provider using the default local URL. + pub fn ollama(model: &str) -> Self { + Self::Ollama(UnauthenticatedProvider { + model: model.to_owned(), + base_url: None, + }) + } + + /// Create an Ollama provider with a custom base URL. + pub fn ollama_with_url(model: &str, url: &str) -> Self { + Self::Ollama(UnauthenticatedProvider { + model: model.to_owned(), + base_url: Some(url.to_owned()), + }) + } + + /// The model name for this provider. + pub fn model(&self) -> &str { + match self { + Self::OpenAi(p) | Self::Anthropic(p) | Self::Gemini(p) => &p.model, + Self::Ollama(p) => &p.model, + } + } +} + +/// Build a `ClientWithMiddleware` with timeout, retry, and tracing middleware. 
+pub(crate) fn build_http_client(max_retries: u32) -> ClientWithMiddleware { + let retry_policy = ExponentialBackoff::builder() + .build_with_max_retries(max_retries); + + let client = reqwest_middleware::reqwest::Client::builder() + .timeout(Duration::from_secs(120)) + .build() + .expect("failed to build reqwest client"); + + ClientBuilder::new(client) + .with(TracingMiddleware::default()) + .with(RetryTransientMiddleware::new_with_policy(retry_policy)) + .build() +} diff --git a/crates/nvisy-rig/src/bridge/mod.rs b/crates/nvisy-rig/src/bridge/mod.rs new file mode 100644 index 0000000..4f2b725 --- /dev/null +++ b/crates/nvisy-rig/src/bridge/mod.rs @@ -0,0 +1,11 @@ +//! Prompt construction and LLM response parsing. +//! +//! [`PromptBuilder`] assembles user prompts with entity-kind filters and +//! confidence thresholds. [`ResponseParser`] extracts and deserializes +//! text from rig-core completion responses. + +mod prompt; +mod response; + +pub use prompt::PromptBuilder; +pub use response::ResponseParser; diff --git a/crates/nvisy-rig/src/bridge/prompt.rs b/crates/nvisy-rig/src/bridge/prompt.rs new file mode 100644 index 0000000..efe095d --- /dev/null +++ b/crates/nvisy-rig/src/bridge/prompt.rs @@ -0,0 +1,101 @@ +//! User-prompt construction for LLM entity detection. +//! +//! [`PromptBuilder`] formats the entity-kind list, confidence threshold, +//! and input text into a single prompt string that agent-specific prompt +//! builders can delegate to. + +use std::fmt::Display; + +use nvisy_ontology::entity::EntityKind; + +use crate::backend::{DetectionConfig, ALL_TYPES_HINT}; + +/// Instruction prefix for the user prompt. +const DETECT_PREFIX: &str = "Detect entities of types"; + +/// Suffix describing the expected response format. +const RESPONSE_FORMAT: &str = "\ +Return a JSON array of objects with keys: \ +entity_id, category, entity_type, value, confidence, context."; + +/// Builds user prompts for entity detection requests. 
+pub struct PromptBuilder<'a> { + entity_kinds: &'a [EntityKind], + confidence_threshold: f64, +} + +impl<'a> PromptBuilder<'a> { + /// Create a prompt builder from a [`DetectionConfig`]. + pub fn new(config: &'a DetectionConfig) -> Self { + Self { + entity_kinds: &config.entity_kinds, + confidence_threshold: config.confidence_threshold, + } + } + + /// Build the user prompt for the given text. + pub fn build(&self, text: &str) -> String { + self.build_for(self.entity_kinds, text) + } + + /// Build a prompt using an arbitrary slice of displayable entity labels. + /// + /// This allows callers to pass any `Vec` where `E: Display` — for + /// example custom string labels or [`EntityKind`] variants. + pub fn build_for(&self, entity_types: &[E], text: &str) -> String { + let types_hint = if entity_types.is_empty() { + ALL_TYPES_HINT.to_string() + } else { + entity_types.iter().map(|e| e.to_string()).collect::>().join(", ") + }; + + format!( + "{DETECT_PREFIX} [{types_hint}] with minimum confidence \ + {threshold:.2} in the following text. 
{RESPONSE_FORMAT}\n\n---\n{text}\n---", + threshold = self.confidence_threshold, + ) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn builds_prompt_with_entity_kinds() { + let config = DetectionConfig { + entity_kinds: vec![EntityKind::PersonName, EntityKind::GovernmentId], + confidence_threshold: 0.7, + system_prompt: None, + }; + let prompt = PromptBuilder::new(&config).build("Hello world"); + assert!(prompt.contains("person_name, government_id")); + assert!(prompt.contains("0.70")); + assert!(prompt.contains("Hello world")); + } + + #[test] + fn builds_prompt_without_entity_kinds() { + let config = DetectionConfig { + entity_kinds: vec![], + confidence_threshold: 0.5, + system_prompt: None, + }; + let prompt = PromptBuilder::new(&config).build("test"); + assert!(prompt.contains("all entity types")); + } + + #[test] + fn build_for_with_string_labels() { + let config = DetectionConfig { + entity_kinds: vec![], + confidence_threshold: 0.8, + system_prompt: None, + }; + let builder = PromptBuilder::new(&config); + let labels = vec!["PERSON", "SSN"]; + let prompt = builder.build_for(&labels, "some text"); + assert!(prompt.contains("PERSON, SSN")); + assert!(prompt.contains("0.80")); + } +} diff --git a/crates/nvisy-rig/src/bridge/response.rs b/crates/nvisy-rig/src/bridge/response.rs new file mode 100644 index 0000000..7a28e20 --- /dev/null +++ b/crates/nvisy-rig/src/bridge/response.rs @@ -0,0 +1,141 @@ +//! LLM completion response parsing. +//! +//! [`ResponseParser`] extracts text from rig-core completion responses +//! and deserializes JSON (handling markdown fences and empty responses). + +use std::borrow::Cow; + +use serde::de::DeserializeOwned; + +use rig::completion::{AssistantContent, CompletionResponse}; + +use crate::error::Error; + +/// Thin wrapper around text extracted from an LLM completion response. 
+pub struct ResponseParser<'a> { + text: Cow<'a, str>, +} + +impl<'a> ResponseParser<'a> { + /// Extract the text content blocks from a completion response. + pub fn extract_text<R>(response: &CompletionResponse<R>) -> Result<Self, Error> { + let texts: Vec<&str> = response + .choice + .iter() + .filter_map(|c| match c { + AssistantContent::Text(t) => Some(t.text.as_str()), + _ => None, + }) + .collect(); + + if texts.is_empty() { + return Err(Error::Response( + "LLM response contained no text content".to_string(), + )); + } + + Ok(Self { + text: Cow::Owned(texts.join("\n")), + }) + } + + /// Wrap an already-extracted string. + pub fn from_text(text: impl Into<Cow<'a, str>>) -> Self { + Self { text: text.into() } + } + + pub fn as_str(&self) -> &str { + &self.text + } + + pub fn into_string(self) -> String { + self.text.into_owned() + } + + /// Deserialize the text as JSON into `T`. + /// + /// Strips markdown fences when present. Returns `T::default()` for + /// empty / `"none"` / `"no entities"` responses. + pub fn parse_json<T: DeserializeOwned + Default>(&self) -> Result<T, Error> { + let trimmed = self.text.trim(); + + if trimmed.is_empty() + || trimmed.eq_ignore_ascii_case("none") + || trimmed.eq_ignore_ascii_case("no entities") + { + return Ok(T::default()); + } + + let json_str = extract_fenced_json(trimmed).unwrap_or(trimmed); + + serde_json::from_str::<T>(json_str).map_err(|e| { + Error::Response(format!( + "Failed to parse LLM response as JSON: {e}: {}", + truncate(trimmed, 200), + )) + }) + } +} + +/// Extract JSON content from markdown fences (```` ```json ... ``` ````).
+fn extract_fenced_json(text: &str) -> Option<&str> { + let start_marker = if let Some(pos) = text.find("```json") { + pos + "```json".len() + } else if let Some(pos) = text.find("```") { + pos + "```".len() + } else { + return None; + }; + + let rest = &text[start_marker..]; + let rest = rest.strip_prefix('\n').unwrap_or(rest); + let end = rest.find("```")?; + let content = rest[..end].trim(); + + if content.is_empty() { + None + } else { + Some(content) + } +} + +fn truncate(s: &str, max_len: usize) -> &str { + if s.len() <= max_len { + s + } else { + let mut end = max_len; + while end > 0 && !s.is_char_boundary(end) { + end -= 1; + } + &s[..end] + } +} + +#[cfg(test)] +mod tests { + use super::*; + use serde_json::Value; + + #[test] + fn parse_json_raw_array() { + let text = r#"[{"category":"pii","entity_type":"email_address","value":"a@b.com","confidence":0.9,"start_offset":0,"end_offset":7}]"#; + let result = ResponseParser::from_text(text).parse_json::>().unwrap(); + assert_eq!(result.len(), 1); + } + + #[test] + fn parse_json_fenced() { + let text = "```json\n[{\"category\":\"pii\",\"entity_type\":\"email_address\",\"value\":\"a@b.com\",\"confidence\":0.9}]\n```"; + let result = ResponseParser::from_text(text).parse_json::>().unwrap(); + assert_eq!(result.len(), 1); + } + + #[test] + fn parse_json_empty_and_sentinel() { + let empty: Vec = vec![]; + assert_eq!(ResponseParser::from_text("").parse_json::>().unwrap(), empty); + assert_eq!(ResponseParser::from_text("none").parse_json::>().unwrap(), empty); + assert_eq!(ResponseParser::from_text("No entities").parse_json::>().unwrap(), empty); + } + +} diff --git a/crates/nvisy-rig/src/error.rs b/crates/nvisy-rig/src/error.rs new file mode 100644 index 0000000..60a689f --- /dev/null +++ b/crates/nvisy-rig/src/error.rs @@ -0,0 +1,160 @@ +//! Unified error type covering LLM provider, serialization, and tool failures. 
+ +use rig::completion::{CompletionError, PromptError, StructuredOutputError}; + +/// Error type for all LLM interactions. +/// +/// Variants map 1:1 to rig-core error categories plus crate-specific +/// additions (`Validation`, `Client`, `Core`). Use [`is_retryable`](Self::is_retryable) +/// to decide whether a failed request should be retried. +#[derive(Debug, thiserror::Error)] +pub enum Error { + /// An HTTP / network error from the LLM provider. + #[error("HTTP error: {0}")] + Http(String), + + /// A JSON (de)serialization error. + #[error("JSON error: {0}")] + Json(#[from] serde_json::Error), + + /// The LLM provider returned an error response. + #[error("Provider error: {0}")] + Provider(String), + + /// The LLM response was malformed or unexpected. + #[error("Response error: {0}")] + Response(String), + + /// The request could not be constructed. + #[error("Request error: {0}")] + Request(String), + + /// A tool call failed during an agent prompt. + #[error("Tool error: {0}")] + Tool(String), + + /// The agent exceeded its maximum turn limit. + #[error("Agent exceeded max turn limit ({0})")] + MaxTurns(usize), + + /// The prompt was cancelled. + #[error("Prompt cancelled: {0}")] + Cancelled(String), + + /// A validation or parse failure. + #[error("{0}")] + Validation(String), + + /// Wraps `nvisy_core::Error` from provider implementations. + #[error(transparent)] + Core(#[from] nvisy_core::Error), + + /// Structured output failed (prompt error or deserialization). + #[error("Structured output error: {0}")] + StructuredOutput(String), + + /// Failed to construct a provider client. + #[error("Client error: {0}")] + Client(String), +} + +impl Error { + /// Whether this error is likely transient and safe to retry. 
+ pub fn is_retryable(&self) -> bool { + match self { + Self::Http(_) => true, + Self::Provider(msg) => is_retryable_provider_error(msg), + _ => false, + } + } +} + +impl From for Error { + fn from(err: CompletionError) -> Self { + match err { + CompletionError::HttpError(e) => Self::Http(e.to_string()), + CompletionError::JsonError(e) => Self::Json(e), + CompletionError::ProviderError(msg) => Self::Provider(msg), + CompletionError::ResponseError(msg) => Self::Response(msg), + CompletionError::RequestError(e) => Self::Request(e.to_string()), + CompletionError::UrlError(e) => Self::Request(format!("URL: {e}")), + } + } +} + +impl From for Error { + fn from(err: PromptError) -> Self { + match err { + PromptError::CompletionError(e) => Self::from(e), + PromptError::ToolError(e) => Self::Tool(e.to_string()), + PromptError::ToolServerError(e) => Self::Tool(format!("server: {e}")), + PromptError::MaxTurnsError { max_turns, .. } => Self::MaxTurns(max_turns), + PromptError::PromptCancelled { reason, .. } => Self::Cancelled(reason), + } + } +} + +impl From for Error { + fn from(err: StructuredOutputError) -> Self { + match err { + StructuredOutputError::PromptError(e) => Self::from(e), + StructuredOutputError::DeserializationError(e) => { + Self::StructuredOutput(e.to_string()) + } + StructuredOutputError::EmptyResponse => { + Self::StructuredOutput("model returned no content".to_string()) + } + } + } +} + +impl From for nvisy_core::Error { + fn from(err: Error) -> Self { + // Handle the owned `Core` variant first to avoid borrowing issues. 
+ if matches!(&err, Error::Core(_)) { + return match err { + Error::Core(inner) => inner, + _ => unreachable!(), + }; + } + + match &err { + Error::Http(_) => { + nvisy_core::Error::connection(err.to_string(), "rig", true) + } + Error::Json(_) => { + nvisy_core::Error::new(nvisy_core::ErrorKind::Serialization, err.to_string()) + .with_component("rig") + } + Error::Provider(msg) => { + let retryable = is_retryable_provider_error(msg); + nvisy_core::Error::connection(err.to_string(), "rig", retryable) + } + Error::Response(_) | Error::StructuredOutput(_) => { + nvisy_core::Error::runtime(err.to_string(), "rig", false) + } + Error::Request(_) | Error::Validation(_) => { + nvisy_core::Error::validation(err.to_string(), "rig") + } + Error::Tool(_) | Error::MaxTurns(_) | Error::Cancelled(_) => { + nvisy_core::Error::runtime(err.to_string(), "rig", false) + } + Error::Client(_) => { + nvisy_core::Error::connection(err.to_string(), "rig", false) + } + Error::Core(_) => unreachable!(), + } + } +} + +/// Check if a provider error message indicates a retryable condition. 
+fn is_retryable_provider_error(msg: &str) -> bool { + let lower = msg.to_lowercase(); + lower.contains("rate_limit") + || lower.contains("rate limit") + || lower.contains("overloaded") + || lower.contains("timeout") + || lower.contains("429") + || lower.contains("503") + || lower.contains("529") +} diff --git a/crates/nvisy-rig/src/lib.rs b/crates/nvisy-rig/src/lib.rs index 4a9799c..edb522d 100644 --- a/crates/nvisy-rig/src/lib.rs +++ b/crates/nvisy-rig/src/lib.rs @@ -1,3 +1,25 @@ #![forbid(unsafe_code)] #![cfg_attr(docsrs, feature(doc_cfg))] #![doc = include_str!("../README.md")] + +pub mod backend; +pub mod bridge; +pub mod error; +mod agent; + +#[doc(hidden)] +pub mod prelude; + +pub use agent::BaseAgentConfig; +pub use backend::{ + AuthenticatedProvider, ContextWindow, + DetectionConfig, DetectionRequest, DetectionResponse, + Provider, UnauthenticatedProvider, UsageStats, UsageTracker, +}; +pub use error::Error; + +pub use agent::{ + CvAgent, CvDetection, CvEntities, CvEntity, CvProvider, + KnownNerEntity, NerAgent, NerContext, NerEntities, NerEntity, ResolvedOffsets, + OcrAgent, OcrEntity, OcrOutput, OcrProvider, OcrTextRegion, +}; diff --git a/crates/nvisy-rig/src/prelude.rs b/crates/nvisy-rig/src/prelude.rs new file mode 100644 index 0000000..c626bbb --- /dev/null +++ b/crates/nvisy-rig/src/prelude.rs @@ -0,0 +1,14 @@ +//! Convenience re-exports. 
+ +pub use crate::agent::BaseAgentConfig; +pub use crate::backend::{ + AuthenticatedProvider, ContextWindow, + DetectionConfig, DetectionRequest, DetectionResponse, + Provider, UnauthenticatedProvider, UsageStats, UsageTracker, +}; +pub use crate::error::Error; +pub use crate::agent::{ + CvAgent, CvDetection, CvEntities, CvEntity, CvProvider, + KnownNerEntity, NerAgent, NerContext, NerEntities, NerEntity, ResolvedOffsets, + OcrAgent, OcrEntity, OcrOutput, OcrProvider, OcrTextRegion, +}; diff --git a/crates/nvisy-server/src/lib.rs b/crates/nvisy-server/src/lib.rs index c3e7bc3..322d894 100644 --- a/crates/nvisy-server/src/lib.rs +++ b/crates/nvisy-server/src/lib.rs @@ -5,6 +5,3 @@ pub mod handler; pub mod middleware; pub mod service; - -pub use handler::routes; -pub use service::ServiceState; diff --git a/docker/Dockerfile b/docker/Dockerfile index 12e14c6..68f8bfa 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -6,21 +6,29 @@ WORKDIR /app # Copy manifests first to cache dependency builds COPY Cargo.toml Cargo.lock ./ +COPY crates/nvisy-asr/Cargo.toml crates/nvisy-asr/Cargo.toml +COPY crates/nvisy-augment/Cargo.toml crates/nvisy-augment/Cargo.toml +COPY crates/nvisy-cli/Cargo.toml crates/nvisy-cli/Cargo.toml +COPY crates/nvisy-codec/Cargo.toml crates/nvisy-codec/Cargo.toml COPY crates/nvisy-core/Cargo.toml crates/nvisy-core/Cargo.toml -COPY crates/nvisy-detect/Cargo.toml crates/nvisy-detect/Cargo.toml COPY crates/nvisy-engine/Cargo.toml crates/nvisy-engine/Cargo.toml -COPY crates/nvisy-object/Cargo.toml crates/nvisy-object/Cargo.toml +COPY crates/nvisy-identify/Cargo.toml crates/nvisy-identify/Cargo.toml +COPY crates/nvisy-ontology/Cargo.toml crates/nvisy-ontology/Cargo.toml +COPY crates/nvisy-paddle/Cargo.toml crates/nvisy-paddle/Cargo.toml +COPY crates/nvisy-pattern/Cargo.toml crates/nvisy-pattern/Cargo.toml COPY crates/nvisy-python/Cargo.toml crates/nvisy-python/Cargo.toml +COPY crates/nvisy-rig/Cargo.toml crates/nvisy-rig/Cargo.toml COPY 
crates/nvisy-server/Cargo.toml crates/nvisy-server/Cargo.toml # Create empty src files to satisfy cargo's manifest checks -RUN for crate in nvisy-core nvisy-detect nvisy-engine nvisy-object nvisy-python; do \ +RUN for crate in nvisy-asr nvisy-augment nvisy-codec nvisy-core nvisy-engine nvisy-identify nvisy-ontology nvisy-paddle nvisy-pattern nvisy-python nvisy-rig; do \ mkdir -p crates/$crate/src && echo "" > crates/$crate/src/lib.rs; \ done && \ + mkdir -p crates/nvisy-cli/src && echo "fn main() {}" > crates/nvisy-cli/src/main.rs && \ mkdir -p crates/nvisy-server/src && echo "fn main() {}" > crates/nvisy-server/src/main.rs # Create stub READMEs for crates that use doc = include_str!("../README.md") -RUN for crate in nvisy-core nvisy-detect nvisy-engine nvisy-object nvisy-python nvisy-server; do \ +RUN for crate in nvisy-asr nvisy-augment nvisy-cli nvisy-codec nvisy-core nvisy-engine nvisy-identify nvisy-ontology nvisy-paddle nvisy-pattern nvisy-python nvisy-rig nvisy-server; do \ touch crates/$crate/README.md; \ done