numcodecs_random_projection/
lib.rs

1//! [![CI Status]][workflow] [![MSRV]][repo] [![Latest Version]][crates.io] [![Rust Doc Crate]][docs.rs] [![Rust Doc Main]][docs]
2//!
3//! [CI Status]: https://img.shields.io/github/actions/workflow/status/juntyr/numcodecs-rs/ci.yml?branch=main
4//! [workflow]: https://github.com/juntyr/numcodecs-rs/actions/workflows/ci.yml?query=branch%3Amain
5//!
6//! [MSRV]: https://img.shields.io/badge/MSRV-1.82.0-blue
7//! [repo]: https://github.com/juntyr/numcodecs-rs
8//!
9//! [Latest Version]: https://img.shields.io/crates/v/numcodecs-random-projection
10//! [crates.io]: https://crates.io/crates/numcodecs-random-projection
11//!
12//! [Rust Doc Crate]: https://img.shields.io/docsrs/numcodecs-random-projection
13//! [docs.rs]: https://docs.rs/numcodecs-random-projection/
14//!
15//! [Rust Doc Main]: https://img.shields.io/badge/docs-main-blue
16//! [docs]: https://juntyr.github.io/numcodecs-rs/numcodecs_random_projection
17//!
18//! Random Projection codec implementation for the [`numcodecs`] API.
19
20use std::{borrow::Cow, num::NonZeroUsize, ops::AddAssign};
21
22use ndarray::{s, Array, ArrayBase, ArrayViewMut, Data, Dimension, Ix2, ShapeError, Zip};
23use num_traits::{ConstOne, ConstZero, Float, FloatConst};
24use numcodecs::{
25    AnyArray, AnyArrayAssignError, AnyArrayDType, AnyArrayView, AnyArrayViewMut, AnyCowArray,
26    Codec, StaticCodec, StaticCodecConfig, StaticCodecVersion,
27};
28use schemars::{json_schema, JsonSchema, Schema, SchemaGenerator};
29use serde::{Deserialize, Deserializer, Serialize, Serializer};
30use thiserror::Error;
31
32#[cfg(test)]
33use ::serde_json as _;
34
35/// Codec that uses random projections to reduce the dimensionality of high-
36/// dimensional data to compress it.
37///
38/// A two-dimensional array of shape `$N \times D$` is encoded as n array of
39/// shape `$N \times K$`, where `$K$` is either set explicitly or chosen using
40/// the the Johnson-Lindenstrauss lemma. For `$K$` to be smaller than `$D$`,
41/// `$D$` must be quite large. Therefore, this codec should only applied on
42/// large datasets as it otherwise significantly inflates the data size instead
43/// of reducing it.
44///
45/// Choosing a lower distortion rate `epsilon` will improve the quality of the
46/// lossy compression, i.e. reduce the compression error, at the cost of
47/// increasing `$K$`.
48///
49/// This codec only supports finite floating point data.
50#[derive(Clone, Serialize, Deserialize, JsonSchema)]
51// FIXME: #[serde(deny_unknown_fields)]
52pub struct RandomProjectionCodec {
53    /// Seed for generating the random projection matrix
54    pub seed: u64,
55    /// Method with which the reduced dimensionality `$K$` is selected
56    #[serde(flatten)]
57    pub reduction: RandomProjectionReduction,
58    /// Projection kind that is used to generate the random projection matrix
59    #[serde(flatten)]
60    pub projection: RandomProjectionKind,
61    /// The codec's encoding format version. Do not provide this parameter explicitly.
62    #[serde(default, rename = "_version")]
63    pub version: StaticCodecVersion<0, 1, 0>,
64}
65
66/// Method with which the reduced dimensionality `$K$` is selected
67#[derive(Clone, Serialize, Deserialize, JsonSchema)]
68// FIXME: #[serde(deny_unknown_fields)]
69#[serde(tag = "reduction", rename_all = "kebab-case")]
70pub enum RandomProjectionReduction {
71    /// The reduced dimensionality `$K$` is derived from `epsilon`, as defined
72    /// by the Johnson-Lindenstrauss lemma.
73    JohnsonLindenstrauss {
74        /// Maximum distortion rate
75        epsilon: OpenClosedUnit<f64>,
76    },
77    /// The reduced dimensionality `$K$`, to which the data is projected, is
78    /// given explicitly.
79    Explicit {
80        /// Reduced dimensionality
81        k: NonZeroUsize,
82    },
83}
84
85/// Projection kind that is used to generate the random projection matrix
86#[derive(Clone, Serialize, Deserialize, JsonSchema)]
87// FIXME: #[serde(deny_unknown_fields)]
88#[serde(tag = "projection", rename_all = "kebab-case")]
89pub enum RandomProjectionKind {
90    /// The random projection matrix is dense and its components are sampled
91    /// from `$\text{N}\left( 0, \frac{1}{k} \right)$`
92    Gaussian,
93    /// The random projection matrix is sparse where only `density`% of entries
94    /// are non-zero.
95    ///
96    /// The matrix's components are sampled from
97    ///
98    /// - `$-\sqrt{\frac{1}{k \cdot density}}$` with probability
99    ///   `$0.5 \cdot density$`
100    /// - `$0$` with probability `$1 - density$`
101    /// - `$+\sqrt{\frac{1}{k \cdot density}}$` with probability
102    ///   `$0.5 \cdot density$`
103    Sparse {
104        /// The `density` of the sparse projection matrix.
105        ///
106        /// Setting `density` to `$\frac{1}{3}$` reproduces the settings by
107        /// Achlioptas [^1]. If `density` is `None`, it is set to
108        /// `$\frac{1}{\sqrt{d}}$`,
109        /// the minimum density as recommended by Li et al [^2].
110        ///
111        ///
112        /// [^1]: Achlioptas, D. (2003). Database-friendly random projections:
113        ///       Johnson-Lindenstrauss with binary coins. *Journal of Computer
114        ///       and System Sciences*, 66(4), 671-687. Available from:
115        ///       [doi:10.1016/S0022-0000(03)00025-4](https://doi.org/10.1016/S0022-0000(03)00025-4).
116        ///
117        /// [^2]: Li, P., Hastie, T. J., and Church, K. W. (2006). Very sparse
118        ///       random projections. In *Proceedings of the 12th ACM SIGKDD
119        ///       international conference on Knowledge discovery and data
120        ///       mining (KDD '06)*. Association for Computing Machinery, New
121        ///       York, NY, USA, 287–296. Available from:
122        ///       [doi:10.1145/1150402.1150436](https://doi.org/10.1145/1150402.1150436).
123        #[serde(default, skip_serializing_if = "Option::is_none")]
124        density: Option<OpenClosedUnit<f64>>,
125    },
126}
127
128impl Codec for RandomProjectionCodec {
129    type Error = RandomProjectionCodecError;
130
131    fn encode(&self, data: AnyCowArray) -> Result<AnyArray, Self::Error> {
132        match data {
133            AnyCowArray::F32(data) => Ok(AnyArray::F32(
134                project_with_projection(data, self.seed, &self.reduction, &self.projection)?
135                    .into_dyn(),
136            )),
137            AnyCowArray::F64(data) => Ok(AnyArray::F64(
138                project_with_projection(data, self.seed, &self.reduction, &self.projection)?
139                    .into_dyn(),
140            )),
141            encoded => Err(RandomProjectionCodecError::UnsupportedDtype(
142                encoded.dtype(),
143            )),
144        }
145    }
146
147    fn decode(&self, encoded: AnyCowArray) -> Result<AnyArray, Self::Error> {
148        match encoded {
149            AnyCowArray::F32(encoded) => Ok(AnyArray::F32(
150                reconstruct_with_projection(encoded, self.seed, &self.projection)?.into_dyn(),
151            )),
152            AnyCowArray::F64(encoded) => Ok(AnyArray::F64(
153                reconstruct_with_projection(encoded, self.seed, &self.projection)?.into_dyn(),
154            )),
155            encoded => Err(RandomProjectionCodecError::UnsupportedDtype(
156                encoded.dtype(),
157            )),
158        }
159    }
160
161    fn decode_into(
162        &self,
163        encoded: AnyArrayView,
164        decoded: AnyArrayViewMut,
165    ) -> Result<(), Self::Error> {
166        match (encoded, decoded) {
167            (AnyArrayView::F32(encoded), AnyArrayViewMut::F32(decoded)) => {
168                reconstruct_into_with_projection(encoded, decoded, self.seed, &self.projection)
169            }
170            (AnyArrayView::F64(encoded), AnyArrayViewMut::F64(decoded)) => {
171                reconstruct_into_with_projection(encoded, decoded, self.seed, &self.projection)
172            }
173            (encoded @ (AnyArrayView::F32(_) | AnyArrayView::F64(_)), decoded) => {
174                Err(RandomProjectionCodecError::MismatchedDecodeIntoArray {
175                    source: AnyArrayAssignError::DTypeMismatch {
176                        src: encoded.dtype(),
177                        dst: decoded.dtype(),
178                    },
179                })
180            }
181            (encoded, _decoded) => Err(RandomProjectionCodecError::UnsupportedDtype(
182                encoded.dtype(),
183            )),
184        }
185    }
186}
187
188impl StaticCodec for RandomProjectionCodec {
189    const CODEC_ID: &'static str = "random-projection.rs";
190
191    type Config<'de> = Self;
192
193    fn from_config(config: Self::Config<'_>) -> Self {
194        config
195    }
196
197    fn get_config(&self) -> StaticCodecConfig<Self> {
198        StaticCodecConfig::from(self)
199    }
200}
201
202#[derive(Debug, Error)]
203/// Errors that may occur when applying the [`RandomProjectionCodec`].
204pub enum RandomProjectionCodecError {
205    /// [`RandomProjectionCodec`] does not support the dtype
206    #[error("RandomProjection does not support the dtype {0}")]
207    UnsupportedDtype(AnyArrayDType),
208    /// [`RandomProjectionCodec`] only supports matrix / 2d-shaped arrays
209    #[error("RandomProjection only supports matrix / 2d-shaped arrays")]
210    NonMatrixData {
211        /// The source of the error
212        #[from]
213        source: ShapeError,
214    },
215    /// [`RandomProjectionCodec`] does not support non-finite (infinite or NaN)
216    /// floating point data
217    #[error("RandomProjection does not support non-finite (infinite or NaN) floating point data")]
218    NonFiniteData,
219    /// [`RandomProjectionCodec`] cannot encode or decode from an array with
220    /// `$N$` samples to an array with a different number of samples
221    #[error("RandomProjection cannot encode or decode from an array with {input} samples to an array with {output} samples")]
222    NumberOfSamplesMismatch {
223        /// Number of samples `N` in the input array
224        input: usize,
225        /// Number of samples `N` in the output array
226        output: usize,
227    },
228    /// [`RandomProjectionCodec`] cannot decode from an array with zero
229    /// dimensionality `$K$`
230    #[error("RandomProjection cannot decode from an array with zero dimensionality `K`")]
231    ProjectedArrayZeroComponents,
232    /// [`RandomProjectionCodec`] cannot decode from an array with corrupted
233    /// dimensionality metadata
234    #[error("RandomProjection cannot decode from an array with corrupted dimensionality metadata")]
235    CorruptedNumberOfComponents,
236    /// [`RandomProjectionCodec`] cannot decode into an array with `$D$`
237    /// features that differs from the `$D$` stored in the encoded metadata
238    #[error("RandomProjection cannot decode into an array with {output} features that differs from the {metadata} features stored in the encoded metadata")]
239    NumberOfFeaturesMismatch {
240        /// Number of features `$D$` in the encoded array metadata
241        metadata: usize,
242        /// Number of features `$D$` in the decoded output array
243        output: usize,
244    },
245    /// [`RandomProjectionCodec`] cannot decode into the provided array
246    #[error("RandomProjection cannot decode into the provided array")]
247    MismatchedDecodeIntoArray {
248        /// The source of the error
249        #[from]
250        source: AnyArrayAssignError,
251    },
252}
253
254/// Applies random projection to the input `data` with the given `seed`,
255/// `reduction` method, and `projection` kind and returns the resulting
256/// projected array.
257///
258/// # Errors
259///
260/// Errors with
261/// - [`RandomProjectionCodecError::NonMatrixData`] if the input `data` is not
262///   a two-dimensional matrix
263/// - [`RandomProjectionCodecError::NonFiniteData`] if the input `data` or
264///   projected output contains non-finite data
265pub fn project_with_projection<T: FloatExt, S: Data<Elem = T>, D: Dimension>(
266    data: ArrayBase<S, D>,
267    seed: u64,
268    reduction: &RandomProjectionReduction,
269    projection: &RandomProjectionKind,
270) -> Result<Array<T, Ix2>, RandomProjectionCodecError> {
271    let data = data
272        .into_dimensionality()
273        .map_err(|err| RandomProjectionCodecError::NonMatrixData { source: err })?;
274
275    let (n, d) = data.dim();
276
277    let k = match reduction {
278        RandomProjectionReduction::JohnsonLindenstrauss { epsilon } => {
279            johnson_lindenstrauss_min_k(n, *epsilon)
280        }
281        RandomProjectionReduction::Explicit { k } => k.get(),
282    };
283
284    let mut projected = Array::<T, Ix2>::from_elem((n, k + 1), T::ZERO);
285
286    // stash the number of features `$d$` in an extra column
287    // this is quite inefficient but works for now
288    for p in projected.slice_mut(s!(.., k)) {
289        *p = T::from_usize(d);
290    }
291
292    match projection {
293        RandomProjectionKind::Gaussian => project_into(
294            data,
295            projected.slice_mut(s!(.., ..k)),
296            |x, y| gaussian_project(x, y, seed),
297            gaussian_normaliser(k),
298        ),
299        RandomProjectionKind::Sparse { density } => {
300            let density = density_or_ping_li_minimum(*density, d);
301            project_into(
302                data,
303                projected.slice_mut(s!(.., ..k)),
304                |x, y| sparse_project(x, y, density, seed),
305                sparse_normaliser(k, density),
306            )
307        }
308    }?;
309
310    Ok(projected)
311}
312
313#[expect(clippy::needless_pass_by_value)]
314/// Applies random projection to the input `data` and outputs into the
315/// `projected` array.
316///
317/// The random projection matrix is defined by the `projection` function
318/// `$(i, j) \rightarrow P[i, j]$` and a globally applied `normalizer` factor.
319///
320/// # Errors
321///
322/// Errors with
323/// - [`RandomProjectionCodecError::NumberOfSamplesMismatch`] if the input
324///   `data`'s number of samples doesn't match the `projected` array's number
325///   of samples
326/// - [`RandomProjectionCodecError::NonFiniteData`] if the input `data` or
327///   projected output contains non-finite data
328pub fn project_into<T: FloatExt, S: Data<Elem = T>>(
329    data: ArrayBase<S, Ix2>,
330    mut projected: ArrayViewMut<T, Ix2>,
331    projection: impl Fn(usize, usize) -> T,
332    normalizer: T,
333) -> Result<(), RandomProjectionCodecError> {
334    let (n, d) = data.dim();
335    let (n2, _k) = projected.dim();
336
337    if n2 != n {
338        return Err(RandomProjectionCodecError::NumberOfSamplesMismatch {
339            input: n,
340            output: n2,
341        });
342    }
343
344    let mut skip_projection_column = Vec::with_capacity(d);
345
346    for (j, projected_j) in projected.columns_mut().into_iter().enumerate() {
347        // materialize one column of the projection matrix
348        //   i.e. instead of A x B = C, compute A x [bs] = [cs].T
349        // the column is stored in a sparse representation [(l, p)]
350        //   where l is the index of the non-zero entry p
351        skip_projection_column.clear();
352        for l in 0..d {
353            let p = projection(l, j);
354            if !p.is_zero() {
355                skip_projection_column.push((l, p));
356            }
357        }
358
359        for (data_i, projected_ij) in data.rows().into_iter().zip(projected_j) {
360            let mut acc = T::ZERO;
361
362            #[expect(clippy::indexing_slicing)]
363            // data_i is of shape (d) and all l's are in 0..d
364            for &(l, p) in &skip_projection_column {
365                acc += data_i[l] * p;
366            }
367
368            *projected_ij = acc * normalizer;
369        }
370    }
371
372    if !Zip::from(projected).all(|x| x.is_finite()) {
373        return Err(RandomProjectionCodecError::NonFiniteData);
374    }
375
376    Ok(())
377}
378
379/// Applies the (approximate) inverse of random projection to the `projected`
380/// array.
381///
382/// The input data is reconstructed with the given `seed` and `projection`
383/// kind and returns the resulting reconstructed array.
384///
385/// # Errors
386///
387/// Errors with
388/// - [`RandomProjectionCodecError::NonMatrixData`] if the `projected` array is
389///   not a two-dimensional matrix
390/// - [`RandomProjectionCodecError::ProjectedArrayZeroComponents`] if the
391///   `projected` array is of shape `$(N, 0)$`
392/// - [`RandomProjectionCodecError::CorruptedNumberOfComponents`] if the
393///   `projected` array's dimensionality metadata is corrupted
394/// - [`RandomProjectionCodecError::NonFiniteData`] if the `projected` array or
395///   the reconstructed output contains non-finite data
396pub fn reconstruct_with_projection<T: FloatExt, S: Data<Elem = T>, D: Dimension>(
397    projected: ArrayBase<S, D>,
398    seed: u64,
399    projection: &RandomProjectionKind,
400) -> Result<Array<T, Ix2>, RandomProjectionCodecError> {
401    let projected = projected
402        .into_dimensionality()
403        .map_err(|err| RandomProjectionCodecError::NonMatrixData { source: err })?;
404
405    if projected.is_empty() {
406        return Ok(projected.to_owned());
407    }
408
409    let (_n, k): (usize, usize) = projected.dim();
410    let Some(k) = k.checked_sub(1) else {
411        return Err(RandomProjectionCodecError::ProjectedArrayZeroComponents);
412    };
413
414    // extract the number of features `$d$` from the extra column and check that
415    //  it has been preserved consistently across the column
416    let ds = projected.slice(s!(.., k));
417    let Ok(Some(d)) = ds.fold(Ok(None), |acc, d| match acc {
418        Ok(None) => Ok(Some(d.into_usize())),
419        Ok(Some(d2)) if d2 == d.into_usize() => Ok(Some(d2)),
420        _ => Err(()),
421    }) else {
422        return Err(RandomProjectionCodecError::CorruptedNumberOfComponents);
423    };
424
425    // extract the projected data, excluding the metadata column
426    let projected = projected.slice_move(s!(.., ..k));
427
428    match projection {
429        RandomProjectionKind::Gaussian => reconstruct(
430            projected,
431            d,
432            |x, y| gaussian_project(x, y, seed),
433            gaussian_normaliser(k),
434        ),
435        RandomProjectionKind::Sparse { density } => {
436            let density = density_or_ping_li_minimum(*density, d);
437            reconstruct(
438                projected,
439                d,
440                |x, y| sparse_project(x, y, density, seed),
441                sparse_normaliser(k, density),
442            )
443        }
444    }
445}
446
447#[expect(clippy::needless_pass_by_value)]
448/// Applies the (approximate) inverse of random projection to the `projected`
449/// array to reconstruct the input data with dimensionality `d` and returns the
450/// resulting reconstructed array.
451///
452/// The random projection matrix is defined by the `projection` function
453/// `$(i, j) \rightarrow P[i, j]$` and a globally applied `normalizer` factor.
454///
455/// # Errors
456///
457/// Errors with
458/// - [`RandomProjectionCodecError::NonFiniteData`] if the `projected` array or
459///   the reconstructed output contains non-finite data
460pub fn reconstruct<T: FloatExt, S: Data<Elem = T>>(
461    projected: ArrayBase<S, Ix2>,
462    d: usize,
463    projection: impl Fn(usize, usize) -> T,
464    normalizer: T,
465) -> Result<Array<T, Ix2>, RandomProjectionCodecError> {
466    if projected.is_empty() {
467        return Ok(projected.to_owned());
468    }
469
470    let (n, k) = projected.dim();
471
472    let mut reconstructed = Array::<T, Ix2>::from_elem((n, d), T::ZERO);
473
474    let mut skip_projection_row = Vec::with_capacity(k);
475
476    for (l, reconstructed_l) in reconstructed.columns_mut().into_iter().enumerate() {
477        // materialize one row of the projection matrix transpose
478        // i.e. instead of A x B = C, compute A x [bs] = [cs].T
479        // the row is stored in a sparse representation [(j, p)]
480        //   where j is the index of the non-zero entry p
481        skip_projection_row.clear();
482        for j in 0..k {
483            let p = projection(l, j);
484            if !p.is_zero() {
485                skip_projection_row.push((j, p));
486            }
487        }
488
489        for (projected_i, reconstructed_il) in projected.rows().into_iter().zip(reconstructed_l) {
490            let mut acc = T::ZERO;
491
492            #[expect(clippy::indexing_slicing)]
493            // projected_i is of shape (k) and all j's are in 0..k
494            for &(j, p) in &skip_projection_row {
495                acc += projected_i[j] * p;
496            }
497
498            *reconstructed_il = acc * normalizer;
499        }
500    }
501
502    if !Zip::from(&reconstructed).all(|x| x.is_finite()) {
503        return Err(RandomProjectionCodecError::NonFiniteData);
504    }
505
506    Ok(reconstructed)
507}
508
509/// Applies the (approximate) inverse of random projection to the `projected`
510/// array to reconstruct the input data with the given `seed` and `projection`
511/// kind and outputs into the `reconstructed` array.
512///
513/// # Errors
514///
515/// Errors with
516/// - [`RandomProjectionCodecError::NonMatrixData`] if the `projected` and
517///   `reconstructed` arrays are not two-dimensional matrices
518/// - [`RandomProjectionCodecError::NumberOfSamplesMismatch`] if the number of
519///   samples `$N$` of the `projected` array don't match the number of samples
520///   of the `reconstructed` array
521/// - [`RandomProjectionCodecError::ProjectedArrayZeroComponents`] if the
522///   `projected` array is of shape `$(N, 0)$`
523/// - [`RandomProjectionCodecError::CorruptedNumberOfComponents`] if the
524///   `projected` array's dimensionality metadata is corrupted
525/// - [`RandomProjectionCodecError::NumberOfFeaturesMismatch`] if the
526///   `reconstructed` array's dimensionality `$D$` does not match the
527///   `projected` array's dimensionality metadata
528/// - [`RandomProjectionCodecError::NonFiniteData`] if the `projected` array or
529///   the reconstructed output contains non-finite data
530pub fn reconstruct_into_with_projection<T: FloatExt, S: Data<Elem = T>, D: Dimension>(
531    projected: ArrayBase<S, D>,
532    reconstructed: ArrayViewMut<T, D>,
533    seed: u64,
534    projection: &RandomProjectionKind,
535) -> Result<(), RandomProjectionCodecError> {
536    let projected: ArrayBase<S, Ix2> = projected
537        .into_dimensionality()
538        .map_err(|err| RandomProjectionCodecError::NonMatrixData { source: err })?;
539    let reconstructed: ArrayViewMut<T, Ix2> = reconstructed
540        .into_dimensionality()
541        .map_err(|err| RandomProjectionCodecError::NonMatrixData { source: err })?;
542
543    let (n, k) = projected.dim();
544    let (n2, d2) = reconstructed.dim();
545
546    if n2 != n {
547        return Err(RandomProjectionCodecError::NumberOfSamplesMismatch {
548            input: n,
549            output: n2,
550        });
551    }
552
553    let Some(k) = k.checked_sub(1) else {
554        return Err(RandomProjectionCodecError::ProjectedArrayZeroComponents);
555    };
556
557    // extract the number of features `$d$` from the extra column and check that
558    //  it has been preserved consistently across the column
559    let ds = projected.slice(s!(.., k));
560    let Ok(Some(d)) = ds.fold(Ok(None), |acc, d| match acc {
561        Ok(None) => Ok(Some(d.into_usize())),
562        Ok(Some(d2)) if d2 == d.into_usize() => Ok(Some(d2)),
563        _ => Err(()),
564    }) else {
565        return Err(RandomProjectionCodecError::CorruptedNumberOfComponents);
566    };
567
568    if d2 != d {
569        return Err(RandomProjectionCodecError::NumberOfFeaturesMismatch {
570            metadata: d,
571            output: d2,
572        });
573    }
574
575    // extract the projected data, excluding the metadata column
576    let projected = projected.slice_move(s!(.., ..k));
577
578    match projection {
579        RandomProjectionKind::Gaussian => reconstruct_into(
580            projected,
581            reconstructed,
582            |x, y| gaussian_project(x, y, seed),
583            gaussian_normaliser(k),
584        ),
585        RandomProjectionKind::Sparse { density } => {
586            let density = density_or_ping_li_minimum(*density, d);
587            reconstruct_into(
588                projected,
589                reconstructed,
590                |x, y| sparse_project(x, y, density, seed),
591                sparse_normaliser(k, density),
592            )
593        }
594    }
595}
596
597#[expect(clippy::needless_pass_by_value)]
598/// Applies the (approximate) inverse of random projection to the `projected`
599/// array to reconstruct the input data outputs into the `reconstructed` array.
600///
601/// The random projection matrix is defined by the `projection` function
602/// `$(i, j) \rightarrow P[i, j]$` and a globally applied `normalizer` factor.
603///
604/// # Errors
605///
606/// Errors with
607/// - [`RandomProjectionCodecError::NumberOfSamplesMismatch`] if the number of
608///   samples `$N$` of the `projected` array don't match the number of samples
609///   of the `reconstructed` array
610/// - [`RandomProjectionCodecError::NonFiniteData`] if the `projected` array or
611///   the reconstructed output contains non-finite data
612pub fn reconstruct_into<T: FloatExt, S: Data<Elem = T>>(
613    projected: ArrayBase<S, Ix2>,
614    mut reconstructed: ArrayViewMut<T, Ix2>,
615    projection: impl Fn(usize, usize) -> T,
616    normalizer: T,
617) -> Result<(), RandomProjectionCodecError> {
618    let (n, k) = projected.dim();
619    let (n2, _d) = reconstructed.dim();
620
621    if n2 != n {
622        return Err(RandomProjectionCodecError::NumberOfSamplesMismatch {
623            input: n,
624            output: n2,
625        });
626    }
627
628    let mut skip_projection_row = Vec::with_capacity(k);
629
630    for (l, reconstructed_l) in reconstructed.columns_mut().into_iter().enumerate() {
631        // materialize one row of the projection matrix transpose
632        // i.e. instead of A x B = C, compute A x [bs] = [cs].T
633        // the row is stored in a sparse representation [(j, p)]
634        //   where j is the index of the non-zero entry p
635        skip_projection_row.clear();
636        for j in 0..k {
637            let p = projection(l, j);
638            if !p.is_zero() {
639                skip_projection_row.push((j, p));
640            }
641        }
642
643        for (projected_i, reconstructed_il) in projected.rows().into_iter().zip(reconstructed_l) {
644            let mut acc = T::ZERO;
645
646            #[expect(clippy::indexing_slicing)]
647            // projected_i is of shape (k) and all j's are in 0..k
648            for &(j, p) in &skip_projection_row {
649                acc += projected_i[j] * p;
650            }
651
652            *reconstructed_il = acc * normalizer;
653        }
654    }
655
656    if !Zip::from(reconstructed).all(|x| x.is_finite()) {
657        return Err(RandomProjectionCodecError::NonFiniteData);
658    }
659
660    Ok(())
661}
662
663/// Find a 'safe' number of components `$K$` to randomly project to.
664///
665/// The minimum number of components to guarantee the `$\epsilon$`-embedding is
666/// given by
667/// `$K \ge \frac{4 \cdot \ln(N)}{\frac{{\epsilon}^{2}}{2} - \frac{{\epsilon}^{3}}{3}}$`.
668///
669/// The implementation is adapted from [`sklearn`]'s.
670///
671/// [`sklearn`]: https://github.com/scikit-learn/scikit-learn/blob/3b39d7cb957ab781744b346c1848be9db3f4e221/sklearn/random_projection.py#L56-L142
672#[must_use]
673pub fn johnson_lindenstrauss_min_k(
674    n_samples: usize,
675    OpenClosedUnit(epsilon): OpenClosedUnit<f64>,
676) -> usize {
677    #[expect(clippy::suboptimal_flops)]
678    let denominator = (epsilon * epsilon * 0.5) - (epsilon * epsilon * epsilon / 3.0);
679    #[expect(clippy::cast_precision_loss)]
680    let min_k = (n_samples as f64).ln() * 4.0 / denominator;
681    #[expect(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
682    let min_k = min_k as usize;
683    min_k
684}
685
686/// Extract the provided `density` if it is `Some(_)`, or compute the minimum
687/// required density `$\frac{1}{\sqrt{d}}$` as recommended by Li et al [^3].
688///
689/// [^3]: Li, P., Hastie, T. J., and Church, K. W. (2006). Very sparse
690///       random projections. In *Proceedings of the 12th ACM SIGKDD
691///       international conference on Knowledge discovery and data
692///       mining (KDD '06)*. Association for Computing Machinery, New
693///       York, NY, USA, 287–296. Available from:
694///       [doi:10.1145/1150402.1150436](https://doi.org/10.1145/1150402.1150436).
695#[must_use]
696pub fn density_or_ping_li_minimum<T: FloatExt>(
697    density: Option<OpenClosedUnit<f64>>,
698    d: usize,
699) -> T {
700    match density {
701        Some(OpenClosedUnit(density)) => T::from_f64(density),
702        None => T::from_usize(d).sqrt().recip(),
703    }
704}
705
706/// Sample from `$\text{N}(0, 1)$` at the coordinate `$(x, y)$` with the random
707/// `seed`
708fn gaussian_project<T: FloatExt>(x: usize, y: usize, seed: u64) -> T {
709    let (ClosedOpenUnit(u0), OpenClosedUnit(u1)) = T::u01x2(hash_matrix_index(x, y, seed));
710
711    let theta = -T::TAU() * u0;
712    let r = (-T::TWO * u1.ln()).sqrt();
713
714    r * theta.sin()
715}
716
717fn gaussian_normaliser<T: FloatExt>(k: usize) -> T {
718    T::from_usize(k).sqrt().recip()
719}
720
721/// Sample from `${ -1, 0, +1 }$` with probabilities
722/// `${ 0.5 \cdot density, 1 - density, 0.5 \cdot density }$` at the coordinate
723/// `$(x, y)$` with the random `seed`
724fn sparse_project<T: FloatExt>(x: usize, y: usize, density: T, seed: u64) -> T {
725    let (ClosedOpenUnit(u0), _u1) = T::u01x2(hash_matrix_index(x, y, seed));
726
727    if u0 < (density * T::HALF) {
728        -T::ONE
729    } else if u0 < density {
730        T::ONE
731    } else {
732        T::ZERO
733    }
734}
735
736fn sparse_normaliser<T: FloatExt>(k: usize, density: T) -> T {
737    (T::from_usize(k) * density).recip().sqrt()
738}
739
740const fn hash_matrix_index(x: usize, y: usize, seed: u64) -> u64 {
741    seahash_diffuse(seahash_diffuse(x as u64) ^ seed ^ (y as u64))
742}
743
744const fn seahash_diffuse(mut x: u64) -> u64 {
745    // SeaHash diffusion function
746    // https://docs.rs/seahash/4.1.0/src/seahash/helper.rs.html#75-92
747
748    // These are derived from the PCG RNG's round. Thanks to @Veedrac for proposing
749    // this. The basic idea is that we use dynamic shifts, which are determined
750    // by the input itself. The shift is chosen by the higher bits, which means
751    // that changing those flips the lower bits, which scatters upwards because
752    // of the multiplication.
753
754    x = x.wrapping_mul(0x6eed_0e9d_a4d9_4a4f);
755
756    let a = x >> 32;
757    let b = x >> 60;
758
759    x ^= a >> b;
760
761    x = x.wrapping_mul(0x6eed_0e9d_a4d9_4a4f);
762
763    x
764}
765
766#[expect(clippy::derive_partial_eq_without_eq)] // floats are not Eq
767#[derive(Copy, Clone, PartialEq, PartialOrd, Hash)]
768/// Floating point number in `$[0.0, 1.0)$`
769pub struct ClosedOpenUnit<T: FloatExt>(T);
770
771#[expect(clippy::derive_partial_eq_without_eq)] // floats are not Eq
772#[derive(Copy, Clone, PartialEq, PartialOrd, Hash)]
773/// Floating point number in `$(0.0, 1.0]$`
774pub struct OpenClosedUnit<T: FloatExt>(T);
775
776impl Serialize for OpenClosedUnit<f64> {
777    fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
778        serializer.serialize_f64(self.0)
779    }
780}
781
782impl<'de> Deserialize<'de> for OpenClosedUnit<f64> {
783    fn deserialize<D: Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
784        let x = f64::deserialize(deserializer)?;
785
786        if x > 0.0 && x <= 1.0 {
787            Ok(Self(x))
788        } else {
789            Err(serde::de::Error::invalid_value(
790                serde::de::Unexpected::Float(x),
791                &"a value in (0.0, 1.0]",
792            ))
793        }
794    }
795}
796
797impl JsonSchema for OpenClosedUnit<f64> {
798    fn schema_name() -> Cow<'static, str> {
799        Cow::Borrowed("OpenClosedUnitF64")
800    }
801
802    fn schema_id() -> Cow<'static, str> {
803        Cow::Borrowed(concat!(module_path!(), "::", "OpenClosedUnit<f64>"))
804    }
805
806    fn json_schema(_gen: &mut SchemaGenerator) -> Schema {
807        json_schema!({
808            "type": "number",
809            "exclusiveMinimum": 0.0,
810            "maximum": 1.0,
811        })
812    }
813}
814
815/// Floating point types.
816pub trait FloatExt: Float + ConstZero + ConstOne + FloatConst + AddAssign {
817    /// `$0.5$`
818    const HALF: Self;
819    /// `$2.0$`
820    const TWO: Self;
821
822    /// Converts from a [`f64`].
823    #[must_use]
824    fn from_f64(x: f64) -> Self;
825
826    /// Converts from a [`usize`].
827    #[must_use]
828    fn from_usize(n: usize) -> Self;
829
830    /// Converts into a [`usize`].
831    #[must_use]
832    fn into_usize(self) -> usize;
833
834    /// Generates two uniform random numbers from a random `hash` value.
835    ///
836    /// The first is sampled from `$[0.0, 1.0)$`, the second from
837    /// `$(0.0, 1.0]$`.
838    #[must_use]
839    fn u01x2(hash: u64) -> (ClosedOpenUnit<Self>, OpenClosedUnit<Self>);
840}
841
842impl FloatExt for f32 {
843    const HALF: Self = 0.5;
844    const TWO: Self = 2.0;
845
846    #[expect(clippy::cast_possible_truncation)]
847    fn from_f64(x: f64) -> Self {
848        x as Self
849    }
850
851    #[expect(clippy::cast_precision_loss)]
852    fn from_usize(n: usize) -> Self {
853        n as Self
854    }
855
856    #[expect(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
857    fn into_usize(self) -> usize {
858        self as usize
859    }
860
861    fn u01x2(hash: u64) -> (ClosedOpenUnit<Self>, OpenClosedUnit<Self>) {
862        #[expect(clippy::cast_possible_truncation)] // split u64 into (u32, u32)
863        let (low, high) = (
864            (hash & u64::from(u32::MAX)) as u32,
865            ((hash >> 32) & u64::from(u32::MAX)) as u32,
866        );
867
868        // https://prng.di.unimi.it -> Generating uniform doubles in the unit interval [0.0, 1.0)
869        // the hash is shifted to only cover the mantissa
870        #[expect(clippy::cast_precision_loss)]
871        let u0 = ClosedOpenUnit(((high >> 8) as Self) * Self::from_bits(0x3380_0000_u32)); // 0x1.0p-24
872
873        // https://prng.di.unimi.it -> Generating uniform doubles in the unit interval (0.0, 1.0]
874        // the hash is shifted to only cover the mantissa
875        #[expect(clippy::cast_precision_loss)]
876        let u1 = OpenClosedUnit((((low >> 8) + 1) as Self) * Self::from_bits(0x3380_0000_u32)); // 0x1.0p-24
877
878        (u0, u1)
879    }
880}
881
882impl FloatExt for f64 {
883    const HALF: Self = 0.5;
884    const TWO: Self = 2.0;
885
886    fn from_f64(x: f64) -> Self {
887        x
888    }
889
890    #[expect(clippy::cast_precision_loss)]
891    fn from_usize(n: usize) -> Self {
892        n as Self
893    }
894
895    #[expect(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
896    fn into_usize(self) -> usize {
897        self as usize
898    }
899
900    fn u01x2(hash: u64) -> (ClosedOpenUnit<Self>, OpenClosedUnit<Self>) {
901        // https://prng.di.unimi.it -> Generating uniform doubles in the unit interval [0.0, 1.0)
902        // the hash is shifted to only cover the mantissa
903        #[expect(clippy::cast_precision_loss)]
904        let u0 =
905            ClosedOpenUnit(((hash >> 11) as Self) * Self::from_bits(0x3CA0_0000_0000_0000_u64)); // 0x1.0p-53
906
907        let hash = seahash_diffuse(hash + 1);
908
909        // https://prng.di.unimi.it -> Generating uniform doubles in the unit interval (0.0, 1.0]
910        // the hash is shifted to only cover the mantissa
911        #[expect(clippy::cast_precision_loss)]
912        let u1 = OpenClosedUnit(
913            (((hash >> 11) + 1) as Self) * Self::from_bits(0x3CA0_0000_0000_0000_u64),
914        ); // 0x1.0p-53
915
916        (u0, u1)
917    }
918}
919
920#[cfg(test)]
921#[expect(clippy::unwrap_used, clippy::expect_used)]
922mod tests {
923    use ndarray_rand::rand_distr::{Distribution, Normal};
924    use ndarray_rand::RandomExt;
925
926    use super::*;
927
928    #[test]
929    fn gaussian_f32() {
930        test_error_decline::<f32>(
931            (100, 100),
932            Normal::new(42.0, 24.0).unwrap(),
933            42,
934            RandomProjectionKind::Gaussian,
935        );
936    }
937
938    #[test]
939    fn gaussian_f64() {
940        test_error_decline::<f64>(
941            (100, 100),
942            Normal::new(42.0, 24.0).unwrap(),
943            42,
944            RandomProjectionKind::Gaussian,
945        );
946    }
947
948    #[test]
949    fn achlioptas_sparse_f32() {
950        test_error_decline::<f32>(
951            (100, 100),
952            Normal::new(42.0, 24.0).unwrap(),
953            42,
954            RandomProjectionKind::Sparse {
955                density: Some(OpenClosedUnit(1.0 / 3.0)),
956            },
957        );
958    }
959
960    #[test]
961    fn achlioptas_sparse_f64() {
962        test_error_decline::<f64>(
963            (100, 100),
964            Normal::new(42.0, 24.0).unwrap(),
965            42,
966            RandomProjectionKind::Sparse {
967                density: Some(OpenClosedUnit(1.0 / 3.0)),
968            },
969        );
970    }
971
972    #[test]
973    fn ping_li_sparse_f32() {
974        test_error_decline::<f32>(
975            (100, 100),
976            Normal::new(42.0, 24.0).unwrap(),
977            42,
978            RandomProjectionKind::Sparse { density: None },
979        );
980    }
981
982    #[test]
983    fn ping_li_sparse_f64() {
984        test_error_decline::<f64>(
985            (100, 100),
986            Normal::new(42.0, 24.0).unwrap(),
987            42,
988            RandomProjectionKind::Sparse { density: None },
989        );
990    }
991
992    #[expect(clippy::needless_pass_by_value)]
993    fn test_error_decline<T: FloatExt + std::fmt::Display>(
994        shape: (usize, usize),
995        distribution: impl Distribution<T>,
996        seed: u64,
997        projection: RandomProjectionKind,
998    ) {
999        let data = Array::<T, Ix2>::random(shape, distribution);
1000
1001        let mut max_rmse = rmse(
1002            &data,
1003            &roundtrip(&data, 42, OpenClosedUnit(1.0), &projection),
1004        );
1005
1006        for epsilon in [
1007            OpenClosedUnit(0.5),
1008            OpenClosedUnit(0.1),
1009            OpenClosedUnit(0.01),
1010        ] {
1011            let new_rmse = rmse(&data, &roundtrip(&data, seed, epsilon, &projection));
1012            assert!(
1013                new_rmse <= max_rmse,
1014                "{new_rmse} > {max_rmse} for {epsilon}",
1015                epsilon = epsilon.0
1016            );
1017            max_rmse = new_rmse;
1018        }
1019    }
1020
1021    fn roundtrip<T: FloatExt>(
1022        data: &Array<T, Ix2>,
1023        seed: u64,
1024        epsilon: OpenClosedUnit<f64>,
1025        projection: &RandomProjectionKind,
1026    ) -> Array<T, Ix2> {
1027        let projected = project_with_projection(
1028            data.view(),
1029            seed,
1030            &RandomProjectionReduction::JohnsonLindenstrauss { epsilon },
1031            projection,
1032        )
1033        .expect("projecting must not fail");
1034        let reconstructed = reconstruct_with_projection(projected, seed, projection)
1035            .expect("reconstruction must not fail");
1036        #[expect(clippy::let_and_return)]
1037        reconstructed
1038    }
1039
1040    fn rmse<T: FloatExt>(data: &Array<T, Ix2>, reconstructed: &Array<T, Ix2>) -> T {
1041        let mut err = T::ZERO;
1042
1043        for (&d, &r) in data.iter().zip(reconstructed) {
1044            err += (d - r) * (d - r);
1045        }
1046
1047        let mse = err * T::from_usize(data.len()).recip();
1048        mse.sqrt()
1049    }
1050}