Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 7 additions & 5 deletions crates/results/rite-solutions/src/data/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,17 @@ use abd_clam::FlatVec;
mod gen_random;
mod neighborhood_aware;
mod vec_metric;
mod wasserstein;
pub mod wasserstein;

pub use gen_random::gen_random;
pub use neighborhood_aware::NeighborhoodAware;
#[allow(unused_imports)]
pub use neighborhood_aware::{NeighborhoodAware, NeighborhoodAwareScore};
pub use vec_metric::VecMetric;

/// Read data from the given path or generate random data.
pub fn read_or_generate<P: AsRef<Path>>(
path: Option<P>,
#[allow(dead_code)]
pub fn read_or_generate(
path: Option<String>,
metric: &VecMetric,
num_inliers: Option<usize>,
dimensionality: Option<usize>,
Expand All @@ -26,7 +28,7 @@ pub fn read_or_generate<P: AsRef<Path>>(
let metric = metric.metric::<f32, f32>();

let data = if let Some(path) = path {
let path = path.as_ref();
let path = Path::new(&path);
if !path.exists() {
return Err(format!("{path:?} does not exist"));
}
Expand Down
154 changes: 121 additions & 33 deletions crates/results/rite-solutions/src/data/neighborhood_aware.rs
Original file line number Diff line number Diff line change
@@ -1,39 +1,49 @@
//! A `Dataset` in which every point stores the distances to its `k` nearest neighbors.

use abd_clam::{
cluster::ParCluster,
dataset::{metric_space::ParMetricSpace, ParDataset},
Cluster, Dataset, FlatVec, Metric, MetricSpace, Permutable,
cluster::ParCluster, dataset::{metric_space::ParMetricSpace, ParDataset}, utils::{mean, standard_deviation}, Cluster, Dataset, FlatVec, Metric, MetricSpace, Permutable
};
use ftlog::debug;
use rayon::prelude::*;

use super::wasserstein;
use super::wasserstein::wasserstein;

type Fv = FlatVec<Vec<f32>, f32, usize>;

pub struct NeighborhoodAwareScore{
pub score: f32,
pub standard_deviation: f32
}

/// A `Dataset` in which every point stores the distances to its `k` nearest neighbors.
#[allow(clippy::type_complexity)]
pub struct NeighborhoodAware {
data: FlatVec<Vec<f32>, f32, (usize, Vec<(usize, f32)>)>,
k: usize,
}

#[allow(dead_code)]
impl NeighborhoodAware {
/// Create a new `NeighborhoodAware` `Dataset`.
///
/// This will run knn-search on every point in the dataset and store the
/// results in the dataset.
pub fn new<C: Cluster<Vec<f32>, f32, Fv>>(data: &Fv, root: &C, k: usize) -> Self {
let alg = abd_clam::cakes::Algorithm::KnnLinear(k);
let alg = abd_clam::cakes::Algorithm::KnnLinear(k + 1);

let results = data
let results: Vec<(usize, Vec<(usize, f32)>)> = data
.instances()
.iter()
.map(|query| alg.search(data, root, query))
.enumerate()
.map(|(_, query)| {
let mut neighbors = alg.search(data, root, query);
neighbors.sort_by(|&(_, a),(_, b)|a.partial_cmp(b).unwrap());
neighbors
})
.zip(data.metadata().iter())
.map(|(h, &i)| (i, h))
.collect();

let data = data
.clone()
.with_metadata(results)
Expand All @@ -43,53 +53,131 @@ impl NeighborhoodAware {

/// Parallel version of `new`.
pub fn par_new<C: ParCluster<Vec<f32>, f32, Fv>>(data: &Fv, root: &C, k: usize) -> Self {
let alg = abd_clam::cakes::Algorithm::KnnLinear(k);
let alg = abd_clam::cakes::Algorithm::KnnLinear(k + 1);

let results = data
let results: Vec<(usize, Vec<(usize, f32)>)> = data
.instances()
.par_iter()
.map(|query| alg.par_search(data, root, query))
.map(|query| {
let mut neighbors = alg.par_search(data, root, query);
neighbors.par_sort_by(|&(_, a),(_, b)|a.partial_cmp(b).unwrap());
neighbors
})
.zip(data.metadata().par_iter())
.map(|(h, &i)| (i, h))
.collect();

let data = data
.clone()
.with_metadata(results)
.unwrap_or_else(|e| unreachable!("We created the correct size for neighborhood aware data: {e}"));
Self { data, k }
}

/// Check if a point is an outlier.
pub fn is_outlier<C: Cluster<Vec<f32>, f32, Self>>(&self, root: &C, query: &Vec<f32>, threshold: f32) -> bool {

pub fn outlier_score<C: Cluster<Vec<f32>, f32, Self>>(&self, root: &C, query: &Vec<f32>) -> NeighborhoodAwareScore {
debug!("Entering is_outlier.");

let alg = abd_clam::cakes::Algorithm::KnnLinear(self.k);

let hits = alg.search(self, root, query);

debug!("");
debug!("Hits: {:?}", hits);

let neighbors_distances = hits
.iter()
.map(|&(i, _)| self.neighbor_distances(i))
.map(|&(i, _)| {
self.neighbor_distances(i)
})
.collect::<Vec<_>>();

// TODO: Compute all-pairs matrix of wasserstein distances among the neighbors' distance distributions.

// TODO: Compute the wasserstein distances for the query

// TODO: Optionally use the threshold to determine if the query is an outlier.

let distances = hits.iter().map(|&(_, d)| d).collect::<Vec<_>>();
// TODO: The rest of this is wrong.
let wasserstein_distances = neighbors_distances
.iter()
.map(|d| wasserstein::wasserstein(d, &distances))

let neighbor_wass_dist_mat = neighbors_distances.iter().map(|v| {
neighbors_distances.iter().map(|q| wasserstein(v, q)).collect::<Vec<f32>>()
}).collect::<Vec<Vec<f32>>>();

debug!("Wasserstein distance matrix of neighbors:");
for a in &neighbor_wass_dist_mat{
debug!("\t{:?}", *a);
}

let query_distances = hits.iter().map(|&(_, d)| d).collect::<Vec<_>>();

let query_wass_distances = neighbors_distances.iter().map(|v|{
wasserstein(&query_distances, v)
}).collect::<Vec<f32>>();

debug!("");
debug!("Query wasserstein distances");
debug!("{:?}", query_wass_distances);

/*
Now that we have the wasserstein distances, we need to collapse
the vectors into some individual score. The way I am choosing to do
this is:
- find the means of each of the neighbor groups (Collapsing the Vec<Vec<f32>> into a Vec<f32>)
- find the standard deviation of the neighbor distances
- using this standard deviation, find the average Z-score of the query against the neighbors
*/

let neighbor_means = neighbor_wass_dist_mat.iter()
.map(|v: &Vec<f32>|{
mean::<f32, f32>(v)
})
.collect::<Vec<_>>();
let mean_wasserstein: f32 = abd_clam::utils::mean(&wasserstein_distances);

mean_wasserstein > threshold

debug!("");
debug!("Neighbor means:");
debug!("{:?}", neighbor_means);

let mean_of_neighbor_means: f32 = mean(&neighbor_means);

let neighbor_standard_deviation: f32 = standard_deviation(&neighbor_means);

// let mean_squeared_errors = neighbor_means.iter()
// .zip(query_wass_distances.iter())
// .map(|(&u, &x)| (x - u).powi(2))
// .collect::<Vec<_>>();

let mean_squeared_errors = query_wass_distances.iter()
.map(|&x| (x - mean_of_neighbor_means).powi(2))
.collect::<Vec<_>>();

debug!("");
debug!("Mean squared errors:");
debug!("{:?}", mean_squeared_errors);

let average_mean_squared_error: f32 = mean(&mean_squeared_errors);

NeighborhoodAwareScore{
score: average_mean_squared_error,
standard_deviation: neighbor_standard_deviation,
}
}

/// Check if a point is an outlier.
pub fn is_outlier<C: Cluster<Vec<f32>, f32, Self>>(&self, root: &C, query: &Vec<f32>) -> bool {
let NeighborhoodAwareScore {
score,
standard_deviation
} = self.outlier_score(root, query);

let sigma_range = 2f32 * standard_deviation;

debug!("Score: {score}, standard deviation: {standard_deviation}");

score.abs() > sigma_range
}

/// Get the distances to the `k` nearest neighbors of a point.
// fn neighbor_distances(&self, i: usize) -> Vec<f32> {
// self.data.metadata()[i].1.iter().map(|&(_, d)| d).collect()
// }

/// Get the nearest neighbors not including the queried index.
fn neighbor_distances(&self, i: usize) -> Vec<f32> {
self.data.metadata()[i].1.iter().map(|&(_, d)| d).collect()
self.data.metadata()[i].1.iter()
.filter(|(ind, _)| *ind != i)
.map(|&(_, d)| d).collect()
}
}

Expand Down
7 changes: 7 additions & 0 deletions crates/results/rite-solutions/src/data/vec_metric.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
use abd_clam::Metric;
use distances::{number::Float, Number};

use super::wasserstein::wasserstein;

#[derive(clap::ValueEnum, Debug, Clone)]
pub enum VecMetric {
/// The Euclidean (L2) distance.
Expand All @@ -16,6 +18,10 @@ pub enum VecMetric {
/// The Cosine (angular) distance.
#[clap(name = "cosine")]
Cosine,

/// The Wasserstein distance.
#[clap(name = "wasserstein")]
Wasserstein,
}

impl VecMetric {
Expand All @@ -26,6 +32,7 @@ impl VecMetric {
Self::Euclidean => |x: &Vec<T>, y: &Vec<T>| distances::vectors::euclidean::<T, U>(x, y),
Self::Manhattan => |x: &Vec<T>, y: &Vec<T>| U::from(distances::vectors::manhattan::<T>(x, y)),
Self::Cosine => |x: &Vec<T>, y: &Vec<T>| distances::vectors::cosine::<T, U>(x, y),
Self::Wasserstein => |x: &Vec<T>, y: &Vec<T>| U::from(wasserstein::<T, U>(x, y)),
};
Metric::new(distance_fn, false)
}
Expand Down
101 changes: 98 additions & 3 deletions crates/results/rite-solutions/src/data/wasserstein.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,106 @@
//! 1D Wasserstein distance
use distances::{number::Float, Number};

/// Compute the Wasserstein distance between two 1D distributions.
///
/// Uses Euclidean distance as the ground metric.
///
/// See the [SciPy documentation](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.wasserstein_distance.html) for more information.
#[allow(unused_variables, clippy::ptr_arg)]
pub fn wasserstein(x: &Vec<f32>, y: &Vec<f32>) -> f32 {
todo!()
pub fn wasserstein<T: Number, U: Float>(u_values: &Vec<T>, v_values: &Vec<T>) -> U {
// Both vecs are assumed to be sorted. This simplifies the calculations to simply be
// the summation of the differences of each index as the vecs can be considered to be
// cumulative distribution functions. Therefore, we can use the Wasserstein definition:
// W(u, v) = integral_-∞^∞(|U - V|)

let u_values = u_values.iter().map(|f| U::from(*f)).collect::<Vec<U>>();
let v_values = v_values.iter().map(|f| U::from(*f)).collect::<Vec<U>>();

u_values.iter()
.zip(v_values.iter())
// find the absolute difference
.map(|(&l, &r)| l.abs_diff(r))
// find the sum
.sum::<U>()
}

#[cfg(test)]
mod wasserstein_tests{
use rand::{thread_rng, Rng};

use crate::data::wasserstein::wasserstein;

const K: usize = 100000;

#[test]
fn wasserstein_test(){
let mut dirt: Vec<f32> = vec![0.; K];
let mut holes: Vec<f32> = vec![0.; K];

dirt = dirt.iter().map(|_| thread_rng().r#gen::<f32>()).collect();
holes = holes.iter().map(|_| thread_rng().r#gen::<f32>()).collect();

let t = std::time::Instant::now();

dirt.sort_by(|a, b| a.partial_cmp(b).unwrap());
holes.sort_by(|a, b| a.partial_cmp(b).unwrap());

let res: f32 = wasserstein(&dirt, &holes);

let time = t.elapsed().as_secs_f64();

println!("Time: {}", time);

println!("{}", res);
}

#[test]
fn deep_tests(){
for _ in 0..100{
identity_test();
symmetry_test();
triangle_inequality_test();
}
}

#[test]
fn identity_test(){
let mut dirt = vec![0.; K];
dirt = dirt.iter().map(|_| thread_rng().r#gen::<f32>()).collect();

let res: f32 = wasserstein(&dirt, &dirt);

assert_eq!(res, 0.);
}

#[test]
fn symmetry_test(){
let mut dirt = vec![0.; K];
let mut holes = vec![0.; K];

dirt = dirt.iter().map(|_| thread_rng().r#gen::<f32>()).collect();
holes = holes.iter().map(|_| thread_rng().r#gen::<f32>()).collect();

let res1: f32 = wasserstein(&dirt, &holes);
let res2: f32 = wasserstein(&holes, &dirt);

assert_eq!(res1, res2);
}

#[test]
fn triangle_inequality_test(){

let mut v1 = vec![0.; K];
let mut v2 = vec![0.; K];
let mut v3 = vec![0.; K];

v1 = v1.iter().map(|_| thread_rng().r#gen::<f32>()).collect();
v2 = v2.iter().map(|_| thread_rng().r#gen::<f32>()).collect();
v3 = v3.iter().map(|_| thread_rng().r#gen::<f32>()).collect();

let d_v1_v3: f32 = wasserstein(&v1, &v3);
let d_v1_v2: f32 = wasserstein(&v1, &v2);
let d_v2_v3: f32 = wasserstein(&v2, &v3);

assert!(d_v1_v3 <= d_v1_v2 + d_v2_v3);
}
}
Loading