numcodecs_random_projection/
lib.rs

1//! [![CI Status]][workflow] [![MSRV]][repo] [![Latest Version]][crates.io] [![Rust Doc Crate]][docs.rs] [![Rust Doc Main]][docs]
2//!
3//! [CI Status]: https://img.shields.io/github/actions/workflow/status/juntyr/numcodecs-rs/ci.yml?branch=main
4//! [workflow]: https://github.com/juntyr/numcodecs-rs/actions/workflows/ci.yml?query=branch%3Amain
5//!
6//! [MSRV]: https://img.shields.io/badge/MSRV-1.82.0-blue
7//! [repo]: https://github.com/juntyr/numcodecs-rs
8//!
9//! [Latest Version]: https://img.shields.io/crates/v/numcodecs-random-projection
10//! [crates.io]: https://crates.io/crates/numcodecs-random-projection
11//!
12//! [Rust Doc Crate]: https://img.shields.io/docsrs/numcodecs-random-projection
13//! [docs.rs]: https://docs.rs/numcodecs-random-projection/
14//!
15//! [Rust Doc Main]: https://img.shields.io/badge/docs-main-blue
16//! [docs]: https://juntyr.github.io/numcodecs-rs/numcodecs_random_projection
17//!
18//! Random Projection codec implementation for the [`numcodecs`] API.
19
20use std::{borrow::Cow, num::NonZeroUsize, ops::AddAssign};
21
22use ndarray::{s, Array, ArrayBase, ArrayViewMut, Data, Dimension, Ix2, ShapeError, Zip};
23use num_traits::{ConstOne, ConstZero, Float, FloatConst};
24use numcodecs::{
25    AnyArray, AnyArrayAssignError, AnyArrayDType, AnyArrayView, AnyArrayViewMut, AnyCowArray,
26    Codec, StaticCodec, StaticCodecConfig,
27};
28use schemars::{json_schema, JsonSchema, Schema, SchemaGenerator};
29use serde::{Deserialize, Deserializer, Serialize, Serializer};
30use thiserror::Error;
31
32#[cfg(test)]
33use ::serde_json as _;
34
/// Codec that uses random projections to reduce the dimensionality of high-
/// dimensional data to compress it.
///
/// A two-dimensional array of shape `$N \times D$` is encoded as an array of
/// shape `$N \times K$`, where `$K$` is either set explicitly or chosen using
/// the Johnson-Lindenstrauss lemma. For `$K$` to be smaller than `$D$`,
/// `$D$` must be quite large. Therefore, this codec should only be applied on
/// large datasets as it otherwise significantly inflates the data size instead
/// of reducing it.
///
/// Choosing a lower distortion rate `epsilon` will improve the quality of the
/// lossy compression, i.e. reduce the compression error, at the cost of
/// increasing `$K$`.
///
/// This codec only supports finite floating point data.
#[derive(Clone, Serialize, Deserialize, JsonSchema)]
// FIXME: #[serde(deny_unknown_fields)]
pub struct RandomProjectionCodec {
    /// Seed for generating the random projection matrix
    pub seed: u64,
    /// Method with which the reduced dimensionality `$K$` is selected
    #[serde(flatten)]
    pub reduction: RandomProjectionReduction,
    /// Projection kind that is used to generate the random projection matrix
    #[serde(flatten)]
    pub projection: RandomProjectionKind,
}
62
/// Method with which the reduced dimensionality `$K$` is selected
#[derive(Clone, Serialize, Deserialize, JsonSchema)]
// FIXME: #[serde(deny_unknown_fields)]
#[serde(tag = "reduction", rename_all = "kebab-case")]
pub enum RandomProjectionReduction {
    /// The reduced dimensionality `$K$` is derived from `epsilon`, as defined
    /// by the Johnson-Lindenstrauss lemma.
    JohnsonLindenstrauss {
        /// Maximum distortion rate, in `$(0.0, 1.0]$`
        epsilon: OpenClosedUnit<f64>,
    },
    /// The reduced dimensionality `$K$`, to which the data is projected, is
    /// given explicitly.
    Explicit {
        /// Reduced dimensionality, must be non-zero
        k: NonZeroUsize,
    },
}
81
/// Projection kind that is used to generate the random projection matrix
#[derive(Clone, Serialize, Deserialize, JsonSchema)]
// FIXME: #[serde(deny_unknown_fields)]
#[serde(tag = "projection", rename_all = "kebab-case")]
pub enum RandomProjectionKind {
    /// The random projection matrix is dense and its components are sampled
    /// from `$\text{N}\left( 0, \frac{1}{k} \right)$`
    Gaussian,
    /// The random projection matrix is sparse where only a `density` fraction
    /// of the entries are non-zero.
    ///
    /// The matrix's components are sampled from
    ///
    /// - `$-\sqrt{\frac{1}{k \cdot density}}$` with probability
    ///   `$0.5 \cdot density$`
    /// - `$0$` with probability `$1 - density$`
    /// - `$+\sqrt{\frac{1}{k \cdot density}}$` with probability
    ///   `$0.5 \cdot density$`
    Sparse {
        /// The `density` of the sparse projection matrix.
        ///
        /// Setting `density` to `$\frac{1}{3}$` reproduces the settings by
        /// Achlioptas [^1]. If `density` is `None`, it is set to
        /// `$\frac{1}{\sqrt{d}}$`,
        /// the minimum density as recommended by Li et al [^2].
        ///
        ///
        /// [^1]: Achlioptas, D. (2003). Database-friendly random projections:
        ///       Johnson-Lindenstrauss with binary coins. *Journal of Computer
        ///       and System Sciences*, 66(4), 671-687. Available from:
        ///       [doi:10.1016/S0022-0000(03)00025-4](https://doi.org/10.1016/S0022-0000(03)00025-4).
        ///
        /// [^2]: Li, P., Hastie, T. J., and Church, K. W. (2006). Very sparse
        ///       random projections. In *Proceedings of the 12th ACM SIGKDD
        ///       international conference on Knowledge discovery and data
        ///       mining (KDD '06)*. Association for Computing Machinery, New
        ///       York, NY, USA, 287–296. Available from:
        ///       [doi:10.1145/1150402.1150436](https://doi.org/10.1145/1150402.1150436).
        #[serde(default, skip_serializing_if = "Option::is_none")]
        density: Option<OpenClosedUnit<f64>>,
    },
}
124
impl Codec for RandomProjectionCodec {
    type Error = RandomProjectionCodecError;

    fn encode(&self, data: AnyCowArray) -> Result<AnyArray, Self::Error> {
        // project the two-dimensional input down to the reduced
        // dimensionality; only f32 and f64 data is supported
        match data {
            AnyCowArray::F32(data) => Ok(AnyArray::F32(
                project_with_projection(data, self.seed, &self.reduction, &self.projection)?
                    .into_dyn(),
            )),
            AnyCowArray::F64(data) => Ok(AnyArray::F64(
                project_with_projection(data, self.seed, &self.reduction, &self.projection)?
                    .into_dyn(),
            )),
            encoded => Err(RandomProjectionCodecError::UnsupportedDtype(
                encoded.dtype(),
            )),
        }
    }

    fn decode(&self, encoded: AnyCowArray) -> Result<AnyArray, Self::Error> {
        // approximately invert the projection; the original dimensionality is
        // recovered from the metadata stored inside the encoded array
        match encoded {
            AnyCowArray::F32(encoded) => Ok(AnyArray::F32(
                reconstruct_with_projection(encoded, self.seed, &self.projection)?.into_dyn(),
            )),
            AnyCowArray::F64(encoded) => Ok(AnyArray::F64(
                reconstruct_with_projection(encoded, self.seed, &self.projection)?.into_dyn(),
            )),
            encoded => Err(RandomProjectionCodecError::UnsupportedDtype(
                encoded.dtype(),
            )),
        }
    }

    fn decode_into(
        &self,
        encoded: AnyArrayView,
        decoded: AnyArrayViewMut,
    ) -> Result<(), Self::Error> {
        match (encoded, decoded) {
            (AnyArrayView::F32(encoded), AnyArrayViewMut::F32(decoded)) => {
                reconstruct_into_with_projection(encoded, decoded, self.seed, &self.projection)
            }
            (AnyArrayView::F64(encoded), AnyArrayViewMut::F64(decoded)) => {
                reconstruct_into_with_projection(encoded, decoded, self.seed, &self.projection)
            }
            // the encoded dtype is supported but differs from the output dtype
            (encoded @ (AnyArrayView::F32(_) | AnyArrayView::F64(_)), decoded) => {
                Err(RandomProjectionCodecError::MismatchedDecodeIntoArray {
                    source: AnyArrayAssignError::DTypeMismatch {
                        src: encoded.dtype(),
                        dst: decoded.dtype(),
                    },
                })
            }
            // the encoded dtype itself is unsupported
            (encoded, _decoded) => Err(RandomProjectionCodecError::UnsupportedDtype(
                encoded.dtype(),
            )),
        }
    }
}
184
impl StaticCodec for RandomProjectionCodec {
    const CODEC_ID: &'static str = "random-projection";

    // the codec struct is its own (de)serializable configuration
    type Config<'de> = Self;

    fn from_config(config: Self::Config<'_>) -> Self {
        config
    }

    fn get_config(&self) -> StaticCodecConfig<Self> {
        StaticCodecConfig::from(self)
    }
}
198
#[derive(Debug, Error)]
/// Errors that may occur when applying the [`RandomProjectionCodec`].
pub enum RandomProjectionCodecError {
    /// [`RandomProjectionCodec`] does not support the dtype
    #[error("RandomProjection does not support the dtype {0}")]
    UnsupportedDtype(AnyArrayDType),
    /// [`RandomProjectionCodec`] only supports matrix / 2d-shaped arrays
    #[error("RandomProjection only supports matrix / 2d-shaped arrays")]
    NonMatrixData {
        /// The source of the error
        #[from]
        source: ShapeError,
    },
    /// [`RandomProjectionCodec`] does not support non-finite (infinite or NaN)
    /// floating point data
    #[error("RandomProjection does not support non-finite (infinite or NaN) floating point data")]
    NonFiniteData,
    /// [`RandomProjectionCodec`] cannot encode or decode from an array with
    /// `$N$` samples to an array with a different number of samples
    #[error("RandomProjection cannot encode or decode from an array with {input} samples to an array with {output} samples")]
    NumberOfSamplesMismatch {
        /// Number of samples `N` in the input array
        input: usize,
        /// Number of samples `N` in the output array
        output: usize,
    },
    /// [`RandomProjectionCodec`] cannot decode from an array with zero
    /// dimensionality `$K$`
    #[error("RandomProjection cannot decode from an array with zero dimensionality `K`")]
    ProjectedArrayZeroComponents,
    /// [`RandomProjectionCodec`] cannot decode from an array with corrupted
    /// dimensionality metadata
    #[error("RandomProjection cannot decode from an array with corrupted dimensionality metadata")]
    CorruptedNumberOfComponents,
    /// [`RandomProjectionCodec`] cannot decode into an array with `$D$`
    /// features that differs from the `$D$` stored in the encoded metadata
    #[error("RandomProjection cannot decode into an array with {output} features that differs from the {metadata} features stored in the encoded metadata")]
    NumberOfFeaturesMismatch {
        /// Number of features `$D$` in the encoded array metadata
        metadata: usize,
        /// Number of features `$D$` in the decoded output array
        output: usize,
    },
    /// [`RandomProjectionCodec`] cannot decode into the provided array
    #[error("RandomProjection cannot decode into the provided array")]
    MismatchedDecodeIntoArray {
        /// The source of the error
        #[from]
        source: AnyArrayAssignError,
    },
}
250
/// Applies random projection to the input `data` with the given `seed`,
/// `reduction` method, and `projection` kind and returns the resulting
/// projected array.
///
/// The returned array has shape `$(N, K+1)$`: the first `$K$` columns hold
/// the projected components, and the final column stores the original
/// dimensionality `$D$` so that decoding can recover it.
///
/// # Errors
///
/// Errors with
/// - [`RandomProjectionCodecError::NonMatrixData`] if the input `data` is not
///   a two-dimensional matrix
/// - [`RandomProjectionCodecError::NonFiniteData`] if the input `data` or
///   projected output contains non-finite data
pub fn project_with_projection<T: FloatExt, S: Data<Elem = T>, D: Dimension>(
    data: ArrayBase<S, D>,
    seed: u64,
    reduction: &RandomProjectionReduction,
    projection: &RandomProjectionKind,
) -> Result<Array<T, Ix2>, RandomProjectionCodecError> {
    let data = data
        .into_dimensionality()
        .map_err(|err| RandomProjectionCodecError::NonMatrixData { source: err })?;

    let (n, d) = data.dim();

    // select the reduced dimensionality `$K$`
    let k = match reduction {
        RandomProjectionReduction::JohnsonLindenstrauss { epsilon } => {
            johnson_lindenstrauss_min_k(n, *epsilon)
        }
        RandomProjectionReduction::Explicit { k } => k.get(),
    };

    // one extra column beyond the `$K$` components for the metadata
    let mut projected = Array::<T, Ix2>::from_elem((n, k + 1), T::ZERO);

    // stash the number of features `$d$` in an extra column
    // this is quite inefficient but works for now
    for p in projected.slice_mut(s!(.., k)) {
        *p = T::from_usize(d);
    }

    match projection {
        RandomProjectionKind::Gaussian => project_into(
            data,
            projected.slice_mut(s!(.., ..k)),
            |x, y| gaussian_project(x, y, seed),
            gaussian_normaliser(k),
        ),
        RandomProjectionKind::Sparse { density } => {
            let density = density_or_ping_li_minimum(*density, d);
            project_into(
                data,
                projected.slice_mut(s!(.., ..k)),
                |x, y| sparse_project(x, y, density, seed),
                sparse_normaliser(k, density),
            )
        }
    }?;

    Ok(projected)
}
309
310#[expect(clippy::needless_pass_by_value)]
311/// Applies random projection to the input `data` and outputs into the
312/// `projected` array.
313///
314/// The random projection matrix is defined by the `projection` function
315/// `$(i, j) \rightarrow P[i, j]$` and a globally applied `normalizer` factor.
316///
317/// # Errors
318///
319/// Errors with
320/// - [`RandomProjectionCodecError::NumberOfSamplesMismatch`] if the input
321///   `data`'s number of samples doesn't match the `projected` array's number
322///   of samples
323/// - [`RandomProjectionCodecError::NonFiniteData`] if the input `data` or
324///   projected output contains non-finite data
325pub fn project_into<T: FloatExt, S: Data<Elem = T>>(
326    data: ArrayBase<S, Ix2>,
327    mut projected: ArrayViewMut<T, Ix2>,
328    projection: impl Fn(usize, usize) -> T,
329    normalizer: T,
330) -> Result<(), RandomProjectionCodecError> {
331    let (n, d) = data.dim();
332    let (n2, _k) = projected.dim();
333
334    if n2 != n {
335        return Err(RandomProjectionCodecError::NumberOfSamplesMismatch {
336            input: n,
337            output: n2,
338        });
339    }
340
341    let mut skip_projection_column = Vec::with_capacity(d);
342
343    for (j, projected_j) in projected.columns_mut().into_iter().enumerate() {
344        // materialize one column of the projection matrix
345        //   i.e. instead of A x B = C, compute A x [bs] = [cs].T
346        // the column is stored in a sparse representation [(l, p)]
347        //   where l is the index of the non-zero entry p
348        skip_projection_column.clear();
349        for l in 0..d {
350            let p = projection(l, j);
351            if !p.is_zero() {
352                skip_projection_column.push((l, p));
353            }
354        }
355
356        for (data_i, projected_ij) in data.rows().into_iter().zip(projected_j) {
357            let mut acc = T::ZERO;
358
359            #[expect(clippy::indexing_slicing)]
360            // data_i is of shape (d) and all l's are in 0..d
361            for &(l, p) in &skip_projection_column {
362                acc += data_i[l] * p;
363            }
364
365            *projected_ij = acc * normalizer;
366        }
367    }
368
369    if !Zip::from(projected).all(|x| x.is_finite()) {
370        return Err(RandomProjectionCodecError::NonFiniteData);
371    }
372
373    Ok(())
374}
375
/// Applies the (approximate) inverse of random projection to the `projected`
/// array.
///
/// The input data is reconstructed with the given `seed` and `projection`
/// kind and returns the resulting reconstructed array.
///
/// # Errors
///
/// Errors with
/// - [`RandomProjectionCodecError::NonMatrixData`] if the `projected` array is
///   not a two-dimensional matrix
/// - [`RandomProjectionCodecError::ProjectedArrayZeroComponents`] if the
///   `projected` array is of shape `$(N, 0)$`
/// - [`RandomProjectionCodecError::CorruptedNumberOfComponents`] if the
///   `projected` array's dimensionality metadata is corrupted
/// - [`RandomProjectionCodecError::NonFiniteData`] if the `projected` array or
///   the reconstructed output contains non-finite data
pub fn reconstruct_with_projection<T: FloatExt, S: Data<Elem = T>, D: Dimension>(
    projected: ArrayBase<S, D>,
    seed: u64,
    projection: &RandomProjectionKind,
) -> Result<Array<T, Ix2>, RandomProjectionCodecError> {
    let projected = projected
        .into_dimensionality()
        .map_err(|err| RandomProjectionCodecError::NonMatrixData { source: err })?;

    // an empty array carries no metadata and reconstructs to itself
    if projected.is_empty() {
        return Ok(projected.to_owned());
    }

    // the last column is metadata, so at least one column must exist
    let (_n, k): (usize, usize) = projected.dim();
    let Some(k) = k.checked_sub(1) else {
        return Err(RandomProjectionCodecError::ProjectedArrayZeroComponents);
    };

    // extract the number of features `$d$` from the extra column and check that
    //  it has been preserved consistently across the column
    let ds = projected.slice(s!(.., k));
    let Ok(Some(d)) = ds.fold(Ok(None), |acc, d| match acc {
        Ok(None) => Ok(Some(d.into_usize())),
        Ok(Some(d2)) if d2 == d.into_usize() => Ok(Some(d2)),
        _ => Err(()),
    }) else {
        return Err(RandomProjectionCodecError::CorruptedNumberOfComponents);
    };

    // extract the projected data, excluding the metadata column
    let projected = projected.slice_move(s!(.., ..k));

    match projection {
        RandomProjectionKind::Gaussian => reconstruct(
            projected,
            d,
            |x, y| gaussian_project(x, y, seed),
            gaussian_normaliser(k),
        ),
        RandomProjectionKind::Sparse { density } => {
            let density = density_or_ping_li_minimum(*density, d);
            reconstruct(
                projected,
                d,
                |x, y| sparse_project(x, y, density, seed),
                sparse_normaliser(k, density),
            )
        }
    }
}
443
444#[expect(clippy::needless_pass_by_value)]
445/// Applies the (approximate) inverse of random projection to the `projected`
446/// array to reconstruct the input data with dimensionality `d` and returns the
447/// resulting reconstructed array.
448///
449/// The random projection matrix is defined by the `projection` function
450/// `$(i, j) \rightarrow P[i, j]$` and a globally applied `normalizer` factor.
451///
452/// # Errors
453///
454/// Errors with
455/// - [`RandomProjectionCodecError::NonFiniteData`] if the `projected` array or
456///   the reconstructed output contains non-finite data
457pub fn reconstruct<T: FloatExt, S: Data<Elem = T>>(
458    projected: ArrayBase<S, Ix2>,
459    d: usize,
460    projection: impl Fn(usize, usize) -> T,
461    normalizer: T,
462) -> Result<Array<T, Ix2>, RandomProjectionCodecError> {
463    if projected.is_empty() {
464        return Ok(projected.to_owned());
465    }
466
467    let (n, k) = projected.dim();
468
469    let mut reconstructed = Array::<T, Ix2>::from_elem((n, d), T::ZERO);
470
471    let mut skip_projection_row = Vec::with_capacity(k);
472
473    for (l, reconstructed_l) in reconstructed.columns_mut().into_iter().enumerate() {
474        // materialize one row of the projection matrix transpose
475        // i.e. instead of A x B = C, compute A x [bs] = [cs].T
476        // the row is stored in a sparse representation [(j, p)]
477        //   where j is the index of the non-zero entry p
478        skip_projection_row.clear();
479        for j in 0..k {
480            let p = projection(l, j);
481            if !p.is_zero() {
482                skip_projection_row.push((j, p));
483            }
484        }
485
486        for (projected_i, reconstructed_il) in projected.rows().into_iter().zip(reconstructed_l) {
487            let mut acc = T::ZERO;
488
489            #[expect(clippy::indexing_slicing)]
490            // projected_i is of shape (k) and all j's are in 0..k
491            for &(j, p) in &skip_projection_row {
492                acc += projected_i[j] * p;
493            }
494
495            *reconstructed_il = acc * normalizer;
496        }
497    }
498
499    if !Zip::from(&reconstructed).all(|x| x.is_finite()) {
500        return Err(RandomProjectionCodecError::NonFiniteData);
501    }
502
503    Ok(reconstructed)
504}
505
/// Applies the (approximate) inverse of random projection to the `projected`
/// array to reconstruct the input data with the given `seed` and `projection`
/// kind and outputs into the `reconstructed` array.
///
/// # Errors
///
/// Errors with
/// - [`RandomProjectionCodecError::NonMatrixData`] if the `projected` and
///   `reconstructed` arrays are not two-dimensional matrices
/// - [`RandomProjectionCodecError::NumberOfSamplesMismatch`] if the number of
///   samples `$N$` of the `projected` array don't match the number of samples
///   of the `reconstructed` array
/// - [`RandomProjectionCodecError::ProjectedArrayZeroComponents`] if the
///   `projected` array is of shape `$(N, 0)$`
/// - [`RandomProjectionCodecError::CorruptedNumberOfComponents`] if the
///   `projected` array's dimensionality metadata is corrupted
/// - [`RandomProjectionCodecError::NumberOfFeaturesMismatch`] if the
///   `reconstructed` array's dimensionality `$D$` does not match the
///   `projected` array's dimensionality metadata
/// - [`RandomProjectionCodecError::NonFiniteData`] if the `projected` array or
///   the reconstructed output contains non-finite data
pub fn reconstruct_into_with_projection<T: FloatExt, S: Data<Elem = T>, D: Dimension>(
    projected: ArrayBase<S, D>,
    reconstructed: ArrayViewMut<T, D>,
    seed: u64,
    projection: &RandomProjectionKind,
) -> Result<(), RandomProjectionCodecError> {
    let projected: ArrayBase<S, Ix2> = projected
        .into_dimensionality()
        .map_err(|err| RandomProjectionCodecError::NonMatrixData { source: err })?;
    let reconstructed: ArrayViewMut<T, Ix2> = reconstructed
        .into_dimensionality()
        .map_err(|err| RandomProjectionCodecError::NonMatrixData { source: err })?;

    let (n, k) = projected.dim();
    let (n2, d2) = reconstructed.dim();

    if n2 != n {
        return Err(RandomProjectionCodecError::NumberOfSamplesMismatch {
            input: n,
            output: n2,
        });
    }

    // the last column is metadata, so at least one column must exist
    let Some(k) = k.checked_sub(1) else {
        return Err(RandomProjectionCodecError::ProjectedArrayZeroComponents);
    };

    // extract the number of features `$d$` from the extra column and check that
    //  it has been preserved consistently across the column
    let ds = projected.slice(s!(.., k));
    let Ok(Some(d)) = ds.fold(Ok(None), |acc, d| match acc {
        Ok(None) => Ok(Some(d.into_usize())),
        Ok(Some(d2)) if d2 == d.into_usize() => Ok(Some(d2)),
        _ => Err(()),
    }) else {
        return Err(RandomProjectionCodecError::CorruptedNumberOfComponents);
    };

    // the output array must provide exactly the `$d$` features recorded in
    // the metadata
    if d2 != d {
        return Err(RandomProjectionCodecError::NumberOfFeaturesMismatch {
            metadata: d,
            output: d2,
        });
    }

    // extract the projected data, excluding the metadata column
    let projected = projected.slice_move(s!(.., ..k));

    match projection {
        RandomProjectionKind::Gaussian => reconstruct_into(
            projected,
            reconstructed,
            |x, y| gaussian_project(x, y, seed),
            gaussian_normaliser(k),
        ),
        RandomProjectionKind::Sparse { density } => {
            let density = density_or_ping_li_minimum(*density, d);
            reconstruct_into(
                projected,
                reconstructed,
                |x, y| sparse_project(x, y, density, seed),
                sparse_normaliser(k, density),
            )
        }
    }
}
593
594#[expect(clippy::needless_pass_by_value)]
595/// Applies the (approximate) inverse of random projection to the `projected`
596/// array to reconstruct the input data outputs into the `reconstructed` array.
597///
598/// The random projection matrix is defined by the `projection` function
599/// `$(i, j) \rightarrow P[i, j]$` and a globally applied `normalizer` factor.
600///
601/// # Errors
602///
603/// Errors with
604/// - [`RandomProjectionCodecError::NumberOfSamplesMismatch`] if the number of
605///   samples `$N$` of the `projected` array don't match the number of samples
606///   of the `reconstructed` array
607/// - [`RandomProjectionCodecError::NonFiniteData`] if the `projected` array or
608///   the reconstructed output contains non-finite data
609pub fn reconstruct_into<T: FloatExt, S: Data<Elem = T>>(
610    projected: ArrayBase<S, Ix2>,
611    mut reconstructed: ArrayViewMut<T, Ix2>,
612    projection: impl Fn(usize, usize) -> T,
613    normalizer: T,
614) -> Result<(), RandomProjectionCodecError> {
615    let (n, k) = projected.dim();
616    let (n2, _d) = reconstructed.dim();
617
618    if n2 != n {
619        return Err(RandomProjectionCodecError::NumberOfSamplesMismatch {
620            input: n,
621            output: n2,
622        });
623    }
624
625    let mut skip_projection_row = Vec::with_capacity(k);
626
627    for (l, reconstructed_l) in reconstructed.columns_mut().into_iter().enumerate() {
628        // materialize one row of the projection matrix transpose
629        // i.e. instead of A x B = C, compute A x [bs] = [cs].T
630        // the row is stored in a sparse representation [(j, p)]
631        //   where j is the index of the non-zero entry p
632        skip_projection_row.clear();
633        for j in 0..k {
634            let p = projection(l, j);
635            if !p.is_zero() {
636                skip_projection_row.push((j, p));
637            }
638        }
639
640        for (projected_i, reconstructed_il) in projected.rows().into_iter().zip(reconstructed_l) {
641            let mut acc = T::ZERO;
642
643            #[expect(clippy::indexing_slicing)]
644            // projected_i is of shape (k) and all j's are in 0..k
645            for &(j, p) in &skip_projection_row {
646                acc += projected_i[j] * p;
647            }
648
649            *reconstructed_il = acc * normalizer;
650        }
651    }
652
653    if !Zip::from(reconstructed).all(|x| x.is_finite()) {
654        return Err(RandomProjectionCodecError::NonFiniteData);
655    }
656
657    Ok(())
658}
659
660/// Find a 'safe' number of components `$K$` to randomly project to.
661///
662/// The minimum number of components to guarantee the `$\epsilon$`-embedding is
663/// given by
664/// `$K \ge \frac{4 \cdot \ln(N)}{\frac{{\epsilon}^{2}}{2} - \frac{{\epsilon}^{3}}{3}}$`.
665///
666/// The implementation is adapted from [`sklearn`]'s.
667///
668/// [`sklearn`]: https://github.com/scikit-learn/scikit-learn/blob/3b39d7cb957ab781744b346c1848be9db3f4e221/sklearn/random_projection.py#L56-L142
669#[must_use]
670pub fn johnson_lindenstrauss_min_k(
671    n_samples: usize,
672    OpenClosedUnit(epsilon): OpenClosedUnit<f64>,
673) -> usize {
674    #[expect(clippy::suboptimal_flops)]
675    let denominator = (epsilon * epsilon * 0.5) - (epsilon * epsilon * epsilon / 3.0);
676    #[expect(clippy::cast_precision_loss)]
677    let min_k = (n_samples as f64).ln() * 4.0 / denominator;
678    #[expect(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
679    let min_k = min_k as usize;
680    min_k
681}
682
683/// Extract the provided `density` if it is `Some(_)`, or compute the minimum
684/// required density `$\frac{1}{\sqrt{d}}$` as recommended by Li et al [^3].
685///
686/// [^3]: Li, P., Hastie, T. J., and Church, K. W. (2006). Very sparse
687///       random projections. In *Proceedings of the 12th ACM SIGKDD
688///       international conference on Knowledge discovery and data
689///       mining (KDD '06)*. Association for Computing Machinery, New
690///       York, NY, USA, 287–296. Available from:
691///       [doi:10.1145/1150402.1150436](https://doi.org/10.1145/1150402.1150436).
692#[must_use]
693pub fn density_or_ping_li_minimum<T: FloatExt>(
694    density: Option<OpenClosedUnit<f64>>,
695    d: usize,
696) -> T {
697    match density {
698        Some(OpenClosedUnit(density)) => T::from_f64(density),
699        None => T::from_usize(d).sqrt().recip(),
700    }
701}
702
/// Sample from `$\text{N}(0, 1)$` at the coordinate `$(x, y)$` with the random
/// `seed`
///
/// Uses the Box-Muller transform: the hashed matrix index yields two uniform
/// samples, one mapped to an angle and one to a radius.
fn gaussian_project<T: FloatExt>(x: usize, y: usize, seed: u64) -> T {
    // u0 in [0, 1) gives the angle; u1 in (0, 1] keeps ln(u1) finite
    let (ClosedOpenUnit(u0), OpenClosedUnit(u1)) = T::u01x2(hash_matrix_index(x, y, seed));

    let theta = -T::TAU() * u0;
    let r = (-T::TWO * u1.ln()).sqrt();

    r * theta.sin()
}
713
/// Normalising factor `$\frac{1}{\sqrt{k}}$` for a Gaussian projection matrix
/// with `k` components
fn gaussian_normaliser<T: FloatExt>(k: usize) -> T {
    T::from_usize(k).sqrt().recip()
}
717
/// Sample from `${ -1, 0, +1 }$` with probabilities
/// `${ 0.5 \cdot density, 1 - density, 0.5 \cdot density }$` at the coordinate
/// `$(x, y)$` with the random `seed`
fn sparse_project<T: FloatExt>(x: usize, y: usize, density: T, seed: u64) -> T {
    // a single uniform sample suffices to choose among the three outcomes
    let (ClosedOpenUnit(u0), _u1) = T::u01x2(hash_matrix_index(x, y, seed));

    // u0 in [0, density/2) -> -1; [density/2, density) -> +1; [density, 1) -> 0
    if u0 < (density * T::HALF) {
        -T::ONE
    } else if u0 < density {
        T::ONE
    } else {
        T::ZERO
    }
}
732
/// Normalising factor `$\sqrt{\frac{1}{k \cdot density}}$` for a sparse
/// projection matrix with `k` components and the given `density`
fn sparse_normaliser<T: FloatExt>(k: usize, density: T) -> T {
    (T::from_usize(k) * density).recip().sqrt()
}
736
/// Hashes the matrix coordinate `$(x, y)$` together with the `seed` into a
/// pseudo-random `u64`, from which projection matrix entries are sampled
/// lazily without materializing the matrix
const fn hash_matrix_index(x: usize, y: usize, seed: u64) -> u64 {
    // x is diffused before mixing so that (x, y) and (y, x) hash differently
    seahash_diffuse(seahash_diffuse(x as u64) ^ seed ^ (y as u64))
}
740
/// SeaHash's diffusion function, which mixes the bits of `x` into a
/// pseudo-random output
///
/// See <https://docs.rs/seahash/4.1.0/src/seahash/helper.rs.html#75-92>
const fn seahash_diffuse(mut x: u64) -> u64 {
    // These are derived from the PCG RNG's round. Thanks to @Veedrac for
    // proposing this. The basic idea is that we use dynamic shifts, which are
    // determined by the input itself. The shift is chosen by the higher bits,
    // which means that changing those flips the lower bits, which scatters
    // upwards because of the multiplication.
    const DIFFUSE_MULTIPLIER: u64 = 0x6eed_0e9d_a4d9_4a4f;

    x = x.wrapping_mul(DIFFUSE_MULTIPLIER);

    // dynamic shift: the top nibble selects how far the high half is shifted
    // before being XORed back in (RHS is evaluated before the assignment)
    x ^= (x >> 32) >> (x >> 60);

    x.wrapping_mul(DIFFUSE_MULTIPLIER)
}
762
#[expect(clippy::derive_partial_eq_without_eq)] // floats are not Eq
#[derive(Copy, Clone, PartialEq, PartialOrd, Hash)]
/// Floating point number in `$[0.0, 1.0)$`
///
/// Produced e.g. as the first output of [`FloatExt::u01x2`].
pub struct ClosedOpenUnit<T: FloatExt>(T);
767
#[expect(clippy::derive_partial_eq_without_eq)] // floats are not Eq
#[derive(Copy, Clone, PartialEq, PartialOrd, Hash)]
/// Floating point number in `$(0.0, 1.0]$`
///
/// The range invariant is validated when deserializing.
pub struct OpenClosedUnit<T: FloatExt>(T);
772
impl Serialize for OpenClosedUnit<f64> {
    // serializes as a plain f64; the range invariant needs no extra encoding
    fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
        serializer.serialize_f64(self.0)
    }
}
778
impl<'de> Deserialize<'de> for OpenClosedUnit<f64> {
    fn deserialize<D: Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
        let x = f64::deserialize(deserializer)?;

        // enforce the type's invariant: x must lie in (0.0, 1.0];
        // NaN fails both comparisons and is thus rejected as well
        if x > 0.0 && x <= 1.0 {
            Ok(Self(x))
        } else {
            Err(serde::de::Error::invalid_value(
                serde::de::Unexpected::Float(x),
                &"a value in (0.0, 1.0]",
            ))
        }
    }
}
793
impl JsonSchema for OpenClosedUnit<f64> {
    /// Human-readable schema name used in generated schema definitions.
    fn schema_name() -> Cow<'static, str> {
        Cow::Borrowed("OpenClosedUnitF64")
    }

    /// Globally unique schema identifier: the fully-qualified type path.
    fn schema_id() -> Cow<'static, str> {
        Cow::Borrowed(concat!(module_path!(), "::", "OpenClosedUnit<f64>"))
    }

    /// JSON schema for a number in `$(0.0, 1.0]$`, mirroring the bounds
    /// checked by this type's `Deserialize` implementation.
    fn json_schema(_gen: &mut SchemaGenerator) -> Schema {
        json_schema!({
            "type": "number",
            "exclusiveMinimum": 0.0,
            "maximum": 1.0,
        })
    }
}
811
/// Floating point types.
///
/// Extends [`Float`] with the constants and conversions used by the
/// projection kernels; implemented for [`f32`] and [`f64`] below.
pub trait FloatExt: Float + ConstZero + ConstOne + FloatConst + AddAssign {
    /// `$0.5$`
    const HALF: Self;
    /// `$2.0$`
    const TWO: Self;

    /// Converts from a [`f64`].
    ///
    /// For narrower types this is a lossy `as` cast.
    #[must_use]
    fn from_f64(x: f64) -> Self;

    /// Converts from a [`usize`].
    ///
    /// May lose precision for values exceeding the mantissa width.
    #[must_use]
    fn from_usize(n: usize) -> Self;

    /// Converts into a [`usize`].
    ///
    /// Implemented via a saturating, truncating `as` cast.
    #[must_use]
    fn into_usize(self) -> usize;

    /// Generates two uniform random numbers from a random `hash` value.
    ///
    /// The first is sampled from `$[0.0, 1.0)$`, the second from
    /// `$(0.0, 1.0]$`.
    #[must_use]
    fn u01x2(hash: u64) -> (ClosedOpenUnit<Self>, OpenClosedUnit<Self>);
}
838
839impl FloatExt for f32 {
840    const HALF: Self = 0.5;
841    const TWO: Self = 2.0;
842
843    #[expect(clippy::cast_possible_truncation)]
844    fn from_f64(x: f64) -> Self {
845        x as Self
846    }
847
848    #[expect(clippy::cast_precision_loss)]
849    fn from_usize(n: usize) -> Self {
850        n as Self
851    }
852
853    #[expect(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
854    fn into_usize(self) -> usize {
855        self as usize
856    }
857
858    fn u01x2(hash: u64) -> (ClosedOpenUnit<Self>, OpenClosedUnit<Self>) {
859        #[expect(clippy::cast_possible_truncation)] // split u64 into (u32, u32)
860        let (low, high) = (
861            (hash & u64::from(u32::MAX)) as u32,
862            ((hash >> 32) & u64::from(u32::MAX)) as u32,
863        );
864
865        // https://prng.di.unimi.it -> Generating uniform doubles in the unit interval [0.0, 1.0)
866        // the hash is shifted to only cover the mantissa
867        #[expect(clippy::cast_precision_loss)]
868        let u0 = ClosedOpenUnit(((high >> 8) as Self) * Self::from_bits(0x3380_0000_u32)); // 0x1.0p-24
869
870        // https://prng.di.unimi.it -> Generating uniform doubles in the unit interval (0.0, 1.0]
871        // the hash is shifted to only cover the mantissa
872        #[expect(clippy::cast_precision_loss)]
873        let u1 = OpenClosedUnit((((low >> 8) + 1) as Self) * Self::from_bits(0x3380_0000_u32)); // 0x1.0p-24
874
875        (u0, u1)
876    }
877}
878
879impl FloatExt for f64 {
880    const HALF: Self = 0.5;
881    const TWO: Self = 2.0;
882
883    fn from_f64(x: f64) -> Self {
884        x
885    }
886
887    #[expect(clippy::cast_precision_loss)]
888    fn from_usize(n: usize) -> Self {
889        n as Self
890    }
891
892    #[expect(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
893    fn into_usize(self) -> usize {
894        self as usize
895    }
896
897    fn u01x2(hash: u64) -> (ClosedOpenUnit<Self>, OpenClosedUnit<Self>) {
898        // https://prng.di.unimi.it -> Generating uniform doubles in the unit interval [0.0, 1.0)
899        // the hash is shifted to only cover the mantissa
900        #[expect(clippy::cast_precision_loss)]
901        let u0 =
902            ClosedOpenUnit(((hash >> 11) as Self) * Self::from_bits(0x3CA0_0000_0000_0000_u64)); // 0x1.0p-53
903
904        let hash = seahash_diffuse(hash + 1);
905
906        // https://prng.di.unimi.it -> Generating uniform doubles in the unit interval (0.0, 1.0]
907        // the hash is shifted to only cover the mantissa
908        #[expect(clippy::cast_precision_loss)]
909        let u1 = OpenClosedUnit(
910            (((hash >> 11) + 1) as Self) * Self::from_bits(0x3CA0_0000_0000_0000_u64),
911        ); // 0x1.0p-53
912
913        (u0, u1)
914    }
915}
916
917#[cfg(test)]
918#[expect(clippy::unwrap_used, clippy::expect_used)]
919mod tests {
920    use ndarray_rand::rand_distr::{Distribution, Normal};
921    use ndarray_rand::RandomExt;
922
923    use super::*;
924
    /// Gaussian projection roundtrip on `f32` data: RMSE must not increase
    /// as the Johnson-Lindenstrauss epsilon tightens.
    #[test]
    fn gaussian_f32() {
        test_error_decline::<f32>(
            (100, 100),
            Normal::new(42.0, 24.0).unwrap(),
            42,
            RandomProjectionKind::Gaussian,
        );
    }
934
    /// Gaussian projection roundtrip on `f64` data: RMSE must not increase
    /// as the Johnson-Lindenstrauss epsilon tightens.
    #[test]
    fn gaussian_f64() {
        test_error_decline::<f64>(
            (100, 100),
            Normal::new(42.0, 24.0).unwrap(),
            42,
            RandomProjectionKind::Gaussian,
        );
    }
944
    /// Sparse projection with the fixed Achlioptas density (`1/3`) on `f32`
    /// data: RMSE must not increase as epsilon tightens.
    #[test]
    fn achlioptas_sparse_f32() {
        test_error_decline::<f32>(
            (100, 100),
            Normal::new(42.0, 24.0).unwrap(),
            42,
            RandomProjectionKind::Sparse {
                density: Some(OpenClosedUnit(1.0 / 3.0)),
            },
        );
    }
956
    /// Sparse projection with the fixed Achlioptas density (`1/3`) on `f64`
    /// data: RMSE must not increase as epsilon tightens.
    #[test]
    fn achlioptas_sparse_f64() {
        test_error_decline::<f64>(
            (100, 100),
            Normal::new(42.0, 24.0).unwrap(),
            42,
            RandomProjectionKind::Sparse {
                density: Some(OpenClosedUnit(1.0 / 3.0)),
            },
        );
    }
968
    /// Sparse projection with the default (Ping Li) density (`density: None`)
    /// on `f32` data: RMSE must not increase as epsilon tightens.
    #[test]
    fn ping_li_sparse_f32() {
        test_error_decline::<f32>(
            (100, 100),
            Normal::new(42.0, 24.0).unwrap(),
            42,
            RandomProjectionKind::Sparse { density: None },
        );
    }
978
    /// Sparse projection with the default (Ping Li) density (`density: None`)
    /// on `f64` data: RMSE must not increase as epsilon tightens.
    #[test]
    fn ping_li_sparse_f64() {
        test_error_decline::<f64>(
            (100, 100),
            Normal::new(42.0, 24.0).unwrap(),
            42,
            RandomProjectionKind::Sparse { density: None },
        );
    }
988
989    #[expect(clippy::needless_pass_by_value)]
990    fn test_error_decline<T: FloatExt + std::fmt::Display>(
991        shape: (usize, usize),
992        distribution: impl Distribution<T>,
993        seed: u64,
994        projection: RandomProjectionKind,
995    ) {
996        let data = Array::<T, Ix2>::random(shape, distribution);
997
998        let mut max_rmse = rmse(
999            &data,
1000            &roundtrip(&data, 42, OpenClosedUnit(1.0), &projection),
1001        );
1002
1003        for epsilon in [
1004            OpenClosedUnit(0.5),
1005            OpenClosedUnit(0.1),
1006            OpenClosedUnit(0.01),
1007        ] {
1008            let new_rmse = rmse(&data, &roundtrip(&data, seed, epsilon, &projection));
1009            assert!(
1010                new_rmse <= max_rmse,
1011                "{new_rmse} > {max_rmse} for {epsilon}",
1012                epsilon = epsilon.0
1013            );
1014            max_rmse = new_rmse;
1015        }
1016    }
1017
1018    fn roundtrip<T: FloatExt>(
1019        data: &Array<T, Ix2>,
1020        seed: u64,
1021        epsilon: OpenClosedUnit<f64>,
1022        projection: &RandomProjectionKind,
1023    ) -> Array<T, Ix2> {
1024        let projected = project_with_projection(
1025            data.view(),
1026            seed,
1027            &RandomProjectionReduction::JohnsonLindenstrauss { epsilon },
1028            projection,
1029        )
1030        .expect("projecting must not fail");
1031        let reconstructed = reconstruct_with_projection(projected, seed, projection)
1032            .expect("reconstruction must not fail");
1033        #[expect(clippy::let_and_return)]
1034        reconstructed
1035    }
1036
1037    fn rmse<T: FloatExt>(data: &Array<T, Ix2>, reconstructed: &Array<T, Ix2>) -> T {
1038        let mut err = T::ZERO;
1039
1040        for (&d, &r) in data.iter().zip(reconstructed) {
1041            err += (d - r) * (d - r);
1042        }
1043
1044        let mse = err * T::from_usize(data.len()).recip();
1045        mse.sqrt()
1046    }
1047}