URI-ABD · wmann95 · Nov 12, 2024 · Nov 12, 2024
@@ -7,15 +7,17 @@ use abd_clam::FlatVec;
 mod gen_random;
 mod neighborhood_aware;
 mod vec_metric;
-mod wasserstein;
+pub mod wasserstein;
 
 pub use gen_random::gen_random;
-pub use neighborhood_aware::NeighborhoodAware;
+#[allow(unused_imports)]
+pub use neighborhood_aware::{NeighborhoodAware, NeighborhoodAwareScore};
 pub use vec_metric::VecMetric;
 
 /// Read data from the given path or generate random data.
-pub fn read_or_generate<P: AsRef<Path>>(
-    path: Option<P>,
+#[allow(dead_code)]
+pub fn read_or_generate(
+    path: Option<String>,
     metric: &VecMetric,
     num_inliers: Option<usize>,
     dimensionality: Option<usize>,
@@ -26,7 +28,7 @@ pub fn read_or_generate<P: AsRef<Path>>(
     let metric = metric.metric::<f32, f32>();
 
     let data = if let Some(path) = path {
-        let path = path.as_ref();
+        let path = Path::new(&path);
         if !path.exists() {
             return Err(format!("{path:?} does not exist"));
         }

@@ -1,39 +1,49 @@
 //! A `Dataset` in which every point stores the distances to its `k` nearest neighbors.
 
 use abd_clam::{
-    cluster::ParCluster,
-    dataset::{metric_space::ParMetricSpace, ParDataset},
-    Cluster, Dataset, FlatVec, Metric, MetricSpace, Permutable,
+    cluster::ParCluster, dataset::{metric_space::ParMetricSpace, ParDataset}, utils::{mean, standard_deviation}, Cluster, Dataset, FlatVec, Metric, MetricSpace, Permutable
 };
+use ftlog::debug;
 use rayon::prelude::*;
 
-use super::wasserstein;
+use super::wasserstein::wasserstein;
 
 type Fv = FlatVec<Vec<f32>, f32, usize>;
 
+pub struct NeighborhoodAwareScore{
+    pub score: f32,
+    pub standard_deviation: f32
+}
+
 /// A `Dataset` in which every point stores the distances to its `k` nearest neighbors.
 #[allow(clippy::type_complexity)]
 pub struct NeighborhoodAware {
     data: FlatVec<Vec<f32>, f32, (usize, Vec<(usize, f32)>)>,
     k: usize,
 }
 
+#[allow(dead_code)]
 impl NeighborhoodAware {
     /// Create a new `NeighborhoodAware` `Dataset`.
     ///
     /// This will run knn-search on every point in the dataset and store the
     /// results in the dataset.
     pub fn new<C: Cluster<Vec<f32>, f32, Fv>>(data: &Fv, root: &C, k: usize) -> Self {
-        let alg = abd_clam::cakes::Algorithm::KnnLinear(k);
+        let alg = abd_clam::cakes::Algorithm::KnnLinear(k + 1);
 
-        let results = data
+        let results: Vec<(usize, Vec<(usize, f32)>)> = data
             .instances()
             .iter()
-            .map(|query| alg.search(data, root, query))
+            .enumerate()
+            .map(|(_, query)| {
+                let mut neighbors = alg.search(data, root, query);
+                neighbors.sort_by(|&(_, a),(_, b)|a.partial_cmp(b).unwrap());
+                neighbors
+            })
             .zip(data.metadata().iter())
             .map(|(h, &i)| (i, h))
             .collect();
-
+        
         let data = data
             .clone()
             .with_metadata(results)
@@ -43,53 +53,131 @@ impl NeighborhoodAware {
 
     /// Parallel version of `new`.
     pub fn par_new<C: ParCluster<Vec<f32>, f32, Fv>>(data: &Fv, root: &C, k: usize) -> Self {
-        let alg = abd_clam::cakes::Algorithm::KnnLinear(k);
+        let alg = abd_clam::cakes::Algorithm::KnnLinear(k + 1);
 
-        let results = data
+        let results: Vec<(usize, Vec<(usize, f32)>)> = data
             .instances()
             .par_iter()
-            .map(|query| alg.par_search(data, root, query))
+            .map(|query| {
+                let mut neighbors = alg.par_search(data, root, query);
+                neighbors.par_sort_by(|&(_, a),(_, b)|a.partial_cmp(b).unwrap());
+                neighbors
+            })
             .zip(data.metadata().par_iter())
             .map(|(h, &i)| (i, h))
             .collect();
-
+        
         let data = data
             .clone()
             .with_metadata(results)
             .unwrap_or_else(|e| unreachable!("We created the correct size for neighborhood aware data: {e}"));
         Self { data, k }
     }
-
-    /// Check if a point is an outlier.
-    pub fn is_outlier<C: Cluster<Vec<f32>, f32, Self>>(&self, root: &C, query: &Vec<f32>, threshold: f32) -> bool {
+
+    pub fn outlier_score<C: Cluster<Vec<f32>, f32, Self>>(&self, root: &C, query: &Vec<f32>) -> NeighborhoodAwareScore {
+        debug!("Entering is_outlier.");
+
         let alg = abd_clam::cakes::Algorithm::KnnLinear(self.k);
-
+        
         let hits = alg.search(self, root, query);
+
+        debug!("");
+        debug!("Hits: {:?}", hits);
+
         let neighbors_distances = hits
             .iter()
-            .map(|&(i, _)| self.neighbor_distances(i))
+            .map(|&(i, _)| {
+                self.neighbor_distances(i)
+            })
             .collect::<Vec<_>>();
-
-        // TODO: Compute all-pairs matrix of wasserstein distances among the neighbors' distance distributions.
-
-        // TODO: Compute the wasserstein distances for the query
-
-        // TODO: Optionally use the threshold to determine if the query is an outlier.
-
-        let distances = hits.iter().map(|&(_, d)| d).collect::<Vec<_>>();
-        // TODO: The rest of this is wrong.
-        let wasserstein_distances = neighbors_distances
-            .iter()
-            .map(|d| wasserstein::wasserstein(d, &distances))
+
+        let neighbor_wass_dist_mat = neighbors_distances.iter().map(|v| {
+            neighbors_distances.iter().map(|q| wasserstein(v, q)).collect::<Vec<f32>>()
+        }).collect::<Vec<Vec<f32>>>();
+
+        debug!("Wasserstein distance matrix of neighbors:");
+        for a in &neighbor_wass_dist_mat{
+            debug!("\t{:?}", *a);
+        }
+
+        let query_distances = hits.iter().map(|&(_, d)| d).collect::<Vec<_>>();
+
+        let query_wass_distances = neighbors_distances.iter().map(|v|{
+            wasserstein(&query_distances, v)
+        }).collect::<Vec<f32>>();
+
+        debug!("");
+        debug!("Query wasserstein distances");
+        debug!("{:?}", query_wass_distances);
+
+        /*
+        Now that we have the wasserstein distances, we need to collapse
+        the vectors into some individual score. The way I am choosing to do
+        this is:
+         - find the means of each of the neighbor groups (Collapsing the Vec<Vec<f32>> into a Vec<f32>)
+         - find the standard deviation of the neighbor distances
+         - using this standard deviation, find the average Z-score of the query against the neighbors
+        */
+
+        let neighbor_means = neighbor_wass_dist_mat.iter()
+            .map(|v: &Vec<f32>|{
+                mean::<f32, f32>(v)
+            })
             .collect::<Vec<_>>();
-        let mean_wasserstein: f32 = abd_clam::utils::mean(&wasserstein_distances);
-
-        mean_wasserstein > threshold
+
+        debug!("");
+        debug!("Neighbor means:");
+        debug!("{:?}", neighbor_means);
+
+        let mean_of_neighbor_means: f32 = mean(&neighbor_means);
+
+        let neighbor_standard_deviation: f32 = standard_deviation(&neighbor_means);
+
+        // let mean_squeared_errors = neighbor_means.iter()
+        //     .zip(query_wass_distances.iter())
+        //     .map(|(&u, &x)| (x - u).powi(2))
+        //     .collect::<Vec<_>>();
+
+        let mean_squeared_errors = query_wass_distances.iter()
+            .map(|&x| (x - mean_of_neighbor_means).powi(2))
+            .collect::<Vec<_>>();
+
+        debug!("");
+        debug!("Mean squared errors:");
+        debug!("{:?}", mean_squeared_errors);
+
+        let average_mean_squared_error: f32 = mean(&mean_squeared_errors);
+
+        NeighborhoodAwareScore{
+            score: average_mean_squared_error,
+            standard_deviation: neighbor_standard_deviation,
+        }
+    }
+
+    /// Check if a point is an outlier.
+    pub fn is_outlier<C: Cluster<Vec<f32>, f32, Self>>(&self, root: &C, query: &Vec<f32>) -> bool {
+        let NeighborhoodAwareScore {
+            score,
+            standard_deviation 
+        } = self.outlier_score(root, query);
+
+        let sigma_range = 2f32 * standard_deviation;
+
+        debug!("Score: {score}, standard deviation: {standard_deviation}");
+
+        score.abs() > sigma_range
     }
 
     /// Get the distances to the `k` nearest neighbors of a point.
+    // fn neighbor_distances(&self, i: usize) -> Vec<f32> {
+    //     self.data.metadata()[i].1.iter().map(|&(_, d)| d).collect()
+    // }
+
+    /// Get the nearest neighbors not including the queried index.
     fn neighbor_distances(&self, i: usize) -> Vec<f32> {
-        self.data.metadata()[i].1.iter().map(|&(_, d)| d).collect()
+        self.data.metadata()[i].1.iter()
+            .filter(|(ind, _)| *ind != i)
+            .map(|&(_, d)| d).collect()
     }
 }
 

@@ -3,6 +3,8 @@
 use abd_clam::Metric;
 use distances::{number::Float, Number};
 
+use super::wasserstein::wasserstein;
+
 #[derive(clap::ValueEnum, Debug, Clone)]
 pub enum VecMetric {
     /// The Euclidean (L2) distance.
@@ -16,6 +18,10 @@ pub enum VecMetric {
     /// The Cosine (angular) distance.
     #[clap(name = "cosine")]
     Cosine,
+
+    /// The Wasserstein distance.
+    #[clap(name = "wasserstein")]
+    Wasserstein,
 }
 
 impl VecMetric {
@@ -26,6 +32,7 @@ impl VecMetric {
             Self::Euclidean => |x: &Vec<T>, y: &Vec<T>| distances::vectors::euclidean::<T, U>(x, y),
             Self::Manhattan => |x: &Vec<T>, y: &Vec<T>| U::from(distances::vectors::manhattan::<T>(x, y)),
             Self::Cosine => |x: &Vec<T>, y: &Vec<T>| distances::vectors::cosine::<T, U>(x, y),
+            Self::Wasserstein => |x: &Vec<T>, y: &Vec<T>| U::from(wasserstein::<T, U>(x, y)),
         };
         Metric::new(distance_fn, false)
     }

@@ -1,11 +1,106 @@
 //! 1D Wasserstein distance
+use distances::{number::Float, Number};
 
 /// Compute the Wasserstein distance between two 1D distributions.
 ///
 /// Uses Euclidean distance as the ground metric.
 ///
 /// See the [SciPy documentation](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.wasserstein_distance.html) for more information.
-#[allow(unused_variables, clippy::ptr_arg)]
-pub fn wasserstein(x: &Vec<f32>, y: &Vec<f32>) -> f32 {
-    todo!()
+pub fn wasserstein<T: Number, U: Float>(u_values: &Vec<T>, v_values: &Vec<T>) -> U {
+    // Both vecs are assumed to be sorted. This simplifies the calculations to simply be
+    // the summation of the differences of each index as the vecs can be considered to be
+    // cumulative distribution functions. Therefore, we can use the Wasserstein definition:
+    // W(u, v) = integral_-∞^∞(|U - V|)
+
+    let u_values = u_values.iter().map(|f| U::from(*f)).collect::<Vec<U>>();
+    let v_values = v_values.iter().map(|f| U::from(*f)).collect::<Vec<U>>();
+
+    u_values.iter()
+        .zip(v_values.iter())
+        // find the absolute difference
+        .map(|(&l, &r)| l.abs_diff(r))
+        // find the sum
+        .sum::<U>()
 }
+
+#[cfg(test)]
+mod wasserstein_tests{
+    use rand::{thread_rng, Rng};
+
+    use crate::data::wasserstein::wasserstein;
+
+    const K: usize = 100000;
+
+    #[test]
+    fn wasserstein_test(){
+        let mut dirt: Vec<f32> = vec![0.; K];
+        let mut holes: Vec<f32> = vec![0.; K];
+
+        dirt = dirt.iter().map(|_| thread_rng().r#gen::<f32>()).collect();
+        holes = holes.iter().map(|_| thread_rng().r#gen::<f32>()).collect();
+
+        let t = std::time::Instant::now();
+
+        dirt.sort_by(|a, b| a.partial_cmp(b).unwrap());
+        holes.sort_by(|a, b| a.partial_cmp(b).unwrap());
+
+        let res: f32 = wasserstein(&dirt, &holes);
+
+        let time = t.elapsed().as_secs_f64();
+
+        println!("Time: {}", time);
+
+        println!("{}", res);
+        }
+
+        #[test]
+        fn deep_tests(){
+        for _ in 0..100{
+            identity_test();
+            symmetry_test();
+            triangle_inequality_test();
+        }
+    }
+
+    #[test]
+    fn identity_test(){
+        let mut dirt = vec![0.; K];
+        dirt = dirt.iter().map(|_| thread_rng().r#gen::<f32>()).collect();
+
+        let res: f32 = wasserstein(&dirt, &dirt);
+
+        assert_eq!(res, 0.);
+    }
+
+    #[test]
+    fn symmetry_test(){
+        let mut dirt = vec![0.; K];
+        let mut holes = vec![0.; K];
+
+        dirt = dirt.iter().map(|_| thread_rng().r#gen::<f32>()).collect();
+        holes = holes.iter().map(|_| thread_rng().r#gen::<f32>()).collect();
+
+        let res1: f32 = wasserstein(&dirt, &holes);
+        let res2: f32 = wasserstein(&holes, &dirt);
+
+        assert_eq!(res1, res2);
+    }
+
+    #[test]
+    fn triangle_inequality_test(){
+
+        let mut v1 = vec![0.; K];
+        let mut v2 = vec![0.; K];
+        let mut v3 = vec![0.; K];
+
+        v1 = v1.iter().map(|_| thread_rng().r#gen::<f32>()).collect();
+        v2 = v2.iter().map(|_| thread_rng().r#gen::<f32>()).collect();
+        v3 = v3.iter().map(|_| thread_rng().r#gen::<f32>()).collect();
+
+        let d_v1_v3: f32 = wasserstein(&v1, &v3);
+        let d_v1_v2: f32 = wasserstein(&v1, &v2);
+        let d_v2_v3: f32 = wasserstein(&v2, &v3);
+
+        assert!(d_v1_v3 <= d_v1_v2 + d_v2_v3);
+    }
+}