numcodecs_random_projection/
lib.rs

1//! [![CI Status]][workflow] [![MSRV]][repo] [![Latest Version]][crates.io] [![Rust Doc Crate]][docs.rs] [![Rust Doc Main]][docs]
2//!
3//! [CI Status]: https://img.shields.io/github/actions/workflow/status/juntyr/numcodecs-rs/ci.yml?branch=main
4//! [workflow]: https://github.com/juntyr/numcodecs-rs/actions/workflows/ci.yml?query=branch%3Amain
5//!
6//! [MSRV]: https://img.shields.io/badge/MSRV-1.85.0-blue
7//! [repo]: https://github.com/juntyr/numcodecs-rs
8//!
9//! [Latest Version]: https://img.shields.io/crates/v/numcodecs-random-projection
10//! [crates.io]: https://crates.io/crates/numcodecs-random-projection
11//!
12//! [Rust Doc Crate]: https://img.shields.io/docsrs/numcodecs-random-projection
13//! [docs.rs]: https://docs.rs/numcodecs-random-projection/
14//!
15//! [Rust Doc Main]: https://img.shields.io/badge/docs-main-blue
16//! [docs]: https://juntyr.github.io/numcodecs-rs/numcodecs_random_projection
17//!
18//! Random Projection codec implementation for the [`numcodecs`] API.
19
20use std::{borrow::Cow, num::NonZeroUsize, ops::AddAssign};
21
22use ndarray::{Array, ArrayBase, ArrayViewMut, Data, Dimension, Ix2, ShapeError, Zip, s};
23use num_traits::{ConstOne, ConstZero, Float, FloatConst};
24use numcodecs::{
25    AnyArray, AnyArrayAssignError, AnyArrayDType, AnyArrayView, AnyArrayViewMut, AnyCowArray,
26    Codec, StaticCodec, StaticCodecConfig, StaticCodecVersion,
27};
28use schemars::{JsonSchema, Schema, SchemaGenerator, json_schema};
29use serde::{Deserialize, Deserializer, Serialize, Serializer};
30use thiserror::Error;
31
32#[cfg(test)]
33use ::serde_json as _;
34
35/// Codec that uses random projections to reduce the dimensionality of high-
36/// dimensional data to compress it.
37///
38/// A two-dimensional array of shape `$N \times D$` is encoded as n array of
39/// shape `$N \times K$`, where `$K$` is either set explicitly or chosen using
40/// the the Johnson-Lindenstrauss lemma. For `$K$` to be smaller than `$D$`,
41/// `$D$` must be quite large. Therefore, this codec should only applied on
42/// large datasets as it otherwise significantly inflates the data size instead
43/// of reducing it.
44///
45/// Choosing a lower distortion rate `epsilon` will improve the quality of the
46/// lossy compression, i.e. reduce the compression error, at the cost of
47/// increasing `$K$`.
48///
49/// This codec only supports finite floating point data.
50#[derive(Clone, Serialize, Deserialize, JsonSchema)]
51// FIXME: #[serde(deny_unknown_fields)]
52pub struct RandomProjectionCodec {
53    /// Seed for generating the random projection matrix
54    pub seed: u64,
55    /// Method with which the reduced dimensionality `$K$` is selected
56    #[serde(flatten)]
57    pub reduction: RandomProjectionReduction,
58    /// Projection kind that is used to generate the random projection matrix
59    #[serde(flatten)]
60    pub projection: RandomProjectionKind,
61    /// The codec's encoding format version. Do not provide this parameter explicitly.
62    #[serde(default, rename = "_version")]
63    pub version: StaticCodecVersion<0, 1, 0>,
64}
65
66/// Method with which the reduced dimensionality `$K$` is selected
67#[derive(Clone, Serialize, Deserialize, JsonSchema)]
68// FIXME: #[serde(deny_unknown_fields)]
69#[serde(tag = "reduction", rename_all = "kebab-case")]
70pub enum RandomProjectionReduction {
71    /// The reduced dimensionality `$K$` is derived from `epsilon`, as defined
72    /// by the Johnson-Lindenstrauss lemma.
73    JohnsonLindenstrauss {
74        /// Maximum distortion rate
75        epsilon: OpenClosedUnit<f64>,
76    },
77    /// The reduced dimensionality `$K$`, to which the data is projected, is
78    /// given explicitly.
79    Explicit {
80        /// Reduced dimensionality
81        k: NonZeroUsize,
82    },
83}
84
85/// Projection kind that is used to generate the random projection matrix
86#[derive(Clone, Serialize, Deserialize, JsonSchema)]
87// FIXME: #[serde(deny_unknown_fields)]
88#[serde(tag = "projection", rename_all = "kebab-case")]
89pub enum RandomProjectionKind {
90    /// The random projection matrix is dense and its components are sampled
91    /// from `$\text{N}\left( 0, \frac{1}{k} \right)$`
92    Gaussian,
93    /// The random projection matrix is sparse where only `density`% of entries
94    /// are non-zero.
95    ///
96    /// The matrix's components are sampled from
97    ///
98    /// - `$-\sqrt{\frac{1}{k \cdot density}}$` with probability
99    ///   `$0.5 \cdot density$`
100    /// - `$0$` with probability `$1 - density$`
101    /// - `$+\sqrt{\frac{1}{k \cdot density}}$` with probability
102    ///   `$0.5 \cdot density$`
103    Sparse {
104        /// The `density` of the sparse projection matrix.
105        ///
106        /// Setting `density` to `$\frac{1}{3}$` reproduces the settings by
107        /// Achlioptas [^1]. If `density` is `None`, it is set to
108        /// `$\frac{1}{\sqrt{d}}$`,
109        /// the minimum density as recommended by Li et al [^2].
110        ///
111        ///
112        /// [^1]: Achlioptas, D. (2003). Database-friendly random projections:
113        ///       Johnson-Lindenstrauss with binary coins. *Journal of Computer
114        ///       and System Sciences*, 66(4), 671-687. Available from:
115        ///       [doi:10.1016/S0022-0000(03)00025-4](https://doi.org/10.1016/S0022-0000(03)00025-4).
116        ///
117        /// [^2]: Li, P., Hastie, T. J., and Church, K. W. (2006). Very sparse
118        ///       random projections. In *Proceedings of the 12th ACM SIGKDD
119        ///       international conference on Knowledge discovery and data
120        ///       mining (KDD '06)*. Association for Computing Machinery, New
121        ///       York, NY, USA, 287–296. Available from:
122        ///       [doi:10.1145/1150402.1150436](https://doi.org/10.1145/1150402.1150436).
123        #[serde(default, skip_serializing_if = "Option::is_none")]
124        density: Option<OpenClosedUnit<f64>>,
125    },
126}
127
128impl Codec for RandomProjectionCodec {
129    type Error = RandomProjectionCodecError;
130
131    fn encode(&self, data: AnyCowArray) -> Result<AnyArray, Self::Error> {
132        match data {
133            AnyCowArray::F32(data) => Ok(AnyArray::F32(
134                project_with_projection(data, self.seed, &self.reduction, &self.projection)?
135                    .into_dyn(),
136            )),
137            AnyCowArray::F64(data) => Ok(AnyArray::F64(
138                project_with_projection(data, self.seed, &self.reduction, &self.projection)?
139                    .into_dyn(),
140            )),
141            encoded => Err(RandomProjectionCodecError::UnsupportedDtype(
142                encoded.dtype(),
143            )),
144        }
145    }
146
147    fn decode(&self, encoded: AnyCowArray) -> Result<AnyArray, Self::Error> {
148        match encoded {
149            AnyCowArray::F32(encoded) => Ok(AnyArray::F32(
150                reconstruct_with_projection(encoded, self.seed, &self.projection)?.into_dyn(),
151            )),
152            AnyCowArray::F64(encoded) => Ok(AnyArray::F64(
153                reconstruct_with_projection(encoded, self.seed, &self.projection)?.into_dyn(),
154            )),
155            encoded => Err(RandomProjectionCodecError::UnsupportedDtype(
156                encoded.dtype(),
157            )),
158        }
159    }
160
161    fn decode_into(
162        &self,
163        encoded: AnyArrayView,
164        decoded: AnyArrayViewMut,
165    ) -> Result<(), Self::Error> {
166        match (encoded, decoded) {
167            (AnyArrayView::F32(encoded), AnyArrayViewMut::F32(decoded)) => {
168                reconstruct_into_with_projection(encoded, decoded, self.seed, &self.projection)
169            }
170            (AnyArrayView::F64(encoded), AnyArrayViewMut::F64(decoded)) => {
171                reconstruct_into_with_projection(encoded, decoded, self.seed, &self.projection)
172            }
173            (encoded @ (AnyArrayView::F32(_) | AnyArrayView::F64(_)), decoded) => {
174                Err(RandomProjectionCodecError::MismatchedDecodeIntoArray {
175                    source: AnyArrayAssignError::DTypeMismatch {
176                        src: encoded.dtype(),
177                        dst: decoded.dtype(),
178                    },
179                })
180            }
181            (encoded, _decoded) => Err(RandomProjectionCodecError::UnsupportedDtype(
182                encoded.dtype(),
183            )),
184        }
185    }
186}
187
188impl StaticCodec for RandomProjectionCodec {
189    const CODEC_ID: &'static str = "random-projection.rs";
190
191    type Config<'de> = Self;
192
193    fn from_config(config: Self::Config<'_>) -> Self {
194        config
195    }
196
197    fn get_config(&self) -> StaticCodecConfig<Self> {
198        StaticCodecConfig::from(self)
199    }
200}
201
202#[derive(Debug, Error)]
203/// Errors that may occur when applying the [`RandomProjectionCodec`].
204pub enum RandomProjectionCodecError {
205    /// [`RandomProjectionCodec`] does not support the dtype
206    #[error("RandomProjection does not support the dtype {0}")]
207    UnsupportedDtype(AnyArrayDType),
208    /// [`RandomProjectionCodec`] only supports matrix / 2d-shaped arrays
209    #[error("RandomProjection only supports matrix / 2d-shaped arrays")]
210    NonMatrixData {
211        /// The source of the error
212        #[from]
213        source: ShapeError,
214    },
215    /// [`RandomProjectionCodec`] does not support non-finite (infinite or NaN)
216    /// floating point data
217    #[error("RandomProjection does not support non-finite (infinite or NaN) floating point data")]
218    NonFiniteData,
219    /// [`RandomProjectionCodec`] cannot encode or decode from an array with
220    /// `$N$` samples to an array with a different number of samples
221    #[error(
222        "RandomProjection cannot encode or decode from an array with {input} samples to an array with {output} samples"
223    )]
224    NumberOfSamplesMismatch {
225        /// Number of samples `N` in the input array
226        input: usize,
227        /// Number of samples `N` in the output array
228        output: usize,
229    },
230    /// [`RandomProjectionCodec`] cannot decode from an array with zero
231    /// dimensionality `$K$`
232    #[error("RandomProjection cannot decode from an array with zero dimensionality `K`")]
233    ProjectedArrayZeroComponents,
234    /// [`RandomProjectionCodec`] cannot decode from an array with corrupted
235    /// dimensionality metadata
236    #[error("RandomProjection cannot decode from an array with corrupted dimensionality metadata")]
237    CorruptedNumberOfComponents,
238    /// [`RandomProjectionCodec`] cannot decode into an array with `$D$`
239    /// features that differs from the `$D$` stored in the encoded metadata
240    #[error(
241        "RandomProjection cannot decode into an array with {output} features that differs from the {metadata} features stored in the encoded metadata"
242    )]
243    NumberOfFeaturesMismatch {
244        /// Number of features `$D$` in the encoded array metadata
245        metadata: usize,
246        /// Number of features `$D$` in the decoded output array
247        output: usize,
248    },
249    /// [`RandomProjectionCodec`] cannot decode into the provided array
250    #[error("RandomProjection cannot decode into the provided array")]
251    MismatchedDecodeIntoArray {
252        /// The source of the error
253        #[from]
254        source: AnyArrayAssignError,
255    },
256}
257
258/// Applies random projection to the input `data` with the given `seed`,
259/// `reduction` method, and `projection` kind and returns the resulting
260/// projected array.
261///
262/// # Errors
263///
264/// Errors with
265/// - [`RandomProjectionCodecError::NonMatrixData`] if the input `data` is not
266///   a two-dimensional matrix
267/// - [`RandomProjectionCodecError::NonFiniteData`] if the input `data` or
268///   projected output contains non-finite data
269pub fn project_with_projection<T: FloatExt, S: Data<Elem = T>, D: Dimension>(
270    data: ArrayBase<S, D>,
271    seed: u64,
272    reduction: &RandomProjectionReduction,
273    projection: &RandomProjectionKind,
274) -> Result<Array<T, Ix2>, RandomProjectionCodecError> {
275    let data = data
276        .into_dimensionality()
277        .map_err(|err| RandomProjectionCodecError::NonMatrixData { source: err })?;
278
279    let (n, d) = data.dim();
280
281    let k = match reduction {
282        RandomProjectionReduction::JohnsonLindenstrauss { epsilon } => {
283            johnson_lindenstrauss_min_k(n, *epsilon)
284        }
285        RandomProjectionReduction::Explicit { k } => k.get(),
286    };
287
288    let mut projected = Array::<T, Ix2>::from_elem((n, k + 1), T::ZERO);
289
290    // stash the number of features `$d$` in an extra column
291    // this is quite inefficient but works for now
292    for p in projected.slice_mut(s!(.., k)) {
293        *p = T::from_usize(d);
294    }
295
296    match projection {
297        RandomProjectionKind::Gaussian => project_into(
298            data,
299            projected.slice_mut(s!(.., ..k)),
300            |x, y| gaussian_project(x, y, seed),
301            gaussian_normaliser(k),
302        ),
303        RandomProjectionKind::Sparse { density } => {
304            let density = density_or_ping_li_minimum(*density, d);
305            project_into(
306                data,
307                projected.slice_mut(s!(.., ..k)),
308                |x, y| sparse_project(x, y, density, seed),
309                sparse_normaliser(k, density),
310            )
311        }
312    }?;
313
314    Ok(projected)
315}
316
317#[expect(clippy::needless_pass_by_value)]
318/// Applies random projection to the input `data` and outputs into the
319/// `projected` array.
320///
321/// The random projection matrix is defined by the `projection` function
322/// `$(i, j) \rightarrow P[i, j]$` and a globally applied `normalizer` factor.
323///
324/// # Errors
325///
326/// Errors with
327/// - [`RandomProjectionCodecError::NumberOfSamplesMismatch`] if the input
328///   `data`'s number of samples doesn't match the `projected` array's number
329///   of samples
330/// - [`RandomProjectionCodecError::NonFiniteData`] if the input `data` or
331///   projected output contains non-finite data
332pub fn project_into<T: FloatExt, S: Data<Elem = T>>(
333    data: ArrayBase<S, Ix2>,
334    mut projected: ArrayViewMut<T, Ix2>,
335    projection: impl Fn(usize, usize) -> T,
336    normalizer: T,
337) -> Result<(), RandomProjectionCodecError> {
338    let (n, d) = data.dim();
339    let (n2, _k) = projected.dim();
340
341    if n2 != n {
342        return Err(RandomProjectionCodecError::NumberOfSamplesMismatch {
343            input: n,
344            output: n2,
345        });
346    }
347
348    let mut skip_projection_column = Vec::with_capacity(d);
349
350    for (j, projected_j) in projected.columns_mut().into_iter().enumerate() {
351        // materialize one column of the projection matrix
352        //   i.e. instead of A x B = C, compute A x [bs] = [cs].T
353        // the column is stored in a sparse representation [(l, p)]
354        //   where l is the index of the non-zero entry p
355        skip_projection_column.clear();
356        for l in 0..d {
357            let p = projection(l, j);
358            if !p.is_zero() {
359                skip_projection_column.push((l, p));
360            }
361        }
362
363        for (data_i, projected_ij) in data.rows().into_iter().zip(projected_j) {
364            let mut acc = T::ZERO;
365
366            #[expect(clippy::indexing_slicing)]
367            // data_i is of shape (d) and all l's are in 0..d
368            for &(l, p) in &skip_projection_column {
369                acc += data_i[l] * p;
370            }
371
372            *projected_ij = acc * normalizer;
373        }
374    }
375
376    if !Zip::from(projected).all(|x| x.is_finite()) {
377        return Err(RandomProjectionCodecError::NonFiniteData);
378    }
379
380    Ok(())
381}
382
383/// Applies the (approximate) inverse of random projection to the `projected`
384/// array.
385///
386/// The input data is reconstructed with the given `seed` and `projection`
387/// kind and returns the resulting reconstructed array.
388///
389/// # Errors
390///
391/// Errors with
392/// - [`RandomProjectionCodecError::NonMatrixData`] if the `projected` array is
393///   not a two-dimensional matrix
394/// - [`RandomProjectionCodecError::ProjectedArrayZeroComponents`] if the
395///   `projected` array is of shape `$(N, 0)$`
396/// - [`RandomProjectionCodecError::CorruptedNumberOfComponents`] if the
397///   `projected` array's dimensionality metadata is corrupted
398/// - [`RandomProjectionCodecError::NonFiniteData`] if the `projected` array or
399///   the reconstructed output contains non-finite data
400pub fn reconstruct_with_projection<T: FloatExt, S: Data<Elem = T>, D: Dimension>(
401    projected: ArrayBase<S, D>,
402    seed: u64,
403    projection: &RandomProjectionKind,
404) -> Result<Array<T, Ix2>, RandomProjectionCodecError> {
405    let projected = projected
406        .into_dimensionality()
407        .map_err(|err| RandomProjectionCodecError::NonMatrixData { source: err })?;
408
409    if projected.is_empty() {
410        return Ok(projected.to_owned());
411    }
412
413    let (_n, k): (usize, usize) = projected.dim();
414    let Some(k) = k.checked_sub(1) else {
415        return Err(RandomProjectionCodecError::ProjectedArrayZeroComponents);
416    };
417
418    // extract the number of features `$d$` from the extra column and check that
419    //  it has been preserved consistently across the column
420    let ds = projected.slice(s!(.., k));
421    let Ok(Some(d)) = ds.fold(Ok(None), |acc, d| match acc {
422        Ok(None) => Ok(Some(d.into_usize())),
423        Ok(Some(d2)) if d2 == d.into_usize() => Ok(Some(d2)),
424        _ => Err(()),
425    }) else {
426        return Err(RandomProjectionCodecError::CorruptedNumberOfComponents);
427    };
428
429    // extract the projected data, excluding the metadata column
430    let projected = projected.slice_move(s!(.., ..k));
431
432    match projection {
433        RandomProjectionKind::Gaussian => reconstruct(
434            projected,
435            d,
436            |x, y| gaussian_project(x, y, seed),
437            gaussian_normaliser(k),
438        ),
439        RandomProjectionKind::Sparse { density } => {
440            let density = density_or_ping_li_minimum(*density, d);
441            reconstruct(
442                projected,
443                d,
444                |x, y| sparse_project(x, y, density, seed),
445                sparse_normaliser(k, density),
446            )
447        }
448    }
449}
450
451#[expect(clippy::needless_pass_by_value)]
452/// Applies the (approximate) inverse of random projection to the `projected`
453/// array to reconstruct the input data with dimensionality `d` and returns the
454/// resulting reconstructed array.
455///
456/// The random projection matrix is defined by the `projection` function
457/// `$(i, j) \rightarrow P[i, j]$` and a globally applied `normalizer` factor.
458///
459/// # Errors
460///
461/// Errors with
462/// - [`RandomProjectionCodecError::NonFiniteData`] if the `projected` array or
463///   the reconstructed output contains non-finite data
464pub fn reconstruct<T: FloatExt, S: Data<Elem = T>>(
465    projected: ArrayBase<S, Ix2>,
466    d: usize,
467    projection: impl Fn(usize, usize) -> T,
468    normalizer: T,
469) -> Result<Array<T, Ix2>, RandomProjectionCodecError> {
470    if projected.is_empty() {
471        return Ok(projected.to_owned());
472    }
473
474    let (n, k) = projected.dim();
475
476    let mut reconstructed = Array::<T, Ix2>::from_elem((n, d), T::ZERO);
477
478    let mut skip_projection_row = Vec::with_capacity(k);
479
480    for (l, reconstructed_l) in reconstructed.columns_mut().into_iter().enumerate() {
481        // materialize one row of the projection matrix transpose
482        // i.e. instead of A x B = C, compute A x [bs] = [cs].T
483        // the row is stored in a sparse representation [(j, p)]
484        //   where j is the index of the non-zero entry p
485        skip_projection_row.clear();
486        for j in 0..k {
487            let p = projection(l, j);
488            if !p.is_zero() {
489                skip_projection_row.push((j, p));
490            }
491        }
492
493        for (projected_i, reconstructed_il) in projected.rows().into_iter().zip(reconstructed_l) {
494            let mut acc = T::ZERO;
495
496            #[expect(clippy::indexing_slicing)]
497            // projected_i is of shape (k) and all j's are in 0..k
498            for &(j, p) in &skip_projection_row {
499                acc += projected_i[j] * p;
500            }
501
502            *reconstructed_il = acc * normalizer;
503        }
504    }
505
506    if !Zip::from(&reconstructed).all(|x| x.is_finite()) {
507        return Err(RandomProjectionCodecError::NonFiniteData);
508    }
509
510    Ok(reconstructed)
511}
512
513/// Applies the (approximate) inverse of random projection to the `projected`
514/// array to reconstruct the input data with the given `seed` and `projection`
515/// kind and outputs into the `reconstructed` array.
516///
517/// # Errors
518///
519/// Errors with
520/// - [`RandomProjectionCodecError::NonMatrixData`] if the `projected` and
521///   `reconstructed` arrays are not two-dimensional matrices
522/// - [`RandomProjectionCodecError::NumberOfSamplesMismatch`] if the number of
523///   samples `$N$` of the `projected` array don't match the number of samples
524///   of the `reconstructed` array
525/// - [`RandomProjectionCodecError::ProjectedArrayZeroComponents`] if the
526///   `projected` array is of shape `$(N, 0)$`
527/// - [`RandomProjectionCodecError::CorruptedNumberOfComponents`] if the
528///   `projected` array's dimensionality metadata is corrupted
529/// - [`RandomProjectionCodecError::NumberOfFeaturesMismatch`] if the
530///   `reconstructed` array's dimensionality `$D$` does not match the
531///   `projected` array's dimensionality metadata
532/// - [`RandomProjectionCodecError::NonFiniteData`] if the `projected` array or
533///   the reconstructed output contains non-finite data
534pub fn reconstruct_into_with_projection<T: FloatExt, S: Data<Elem = T>, D: Dimension>(
535    projected: ArrayBase<S, D>,
536    reconstructed: ArrayViewMut<T, D>,
537    seed: u64,
538    projection: &RandomProjectionKind,
539) -> Result<(), RandomProjectionCodecError> {
540    let projected: ArrayBase<S, Ix2> = projected
541        .into_dimensionality()
542        .map_err(|err| RandomProjectionCodecError::NonMatrixData { source: err })?;
543    let reconstructed: ArrayViewMut<T, Ix2> = reconstructed
544        .into_dimensionality()
545        .map_err(|err| RandomProjectionCodecError::NonMatrixData { source: err })?;
546
547    let (n, k) = projected.dim();
548    let (n2, d2) = reconstructed.dim();
549
550    if n2 != n {
551        return Err(RandomProjectionCodecError::NumberOfSamplesMismatch {
552            input: n,
553            output: n2,
554        });
555    }
556
557    let Some(k) = k.checked_sub(1) else {
558        return Err(RandomProjectionCodecError::ProjectedArrayZeroComponents);
559    };
560
561    // extract the number of features `$d$` from the extra column and check that
562    //  it has been preserved consistently across the column
563    let ds = projected.slice(s!(.., k));
564    let Ok(Some(d)) = ds.fold(Ok(None), |acc, d| match acc {
565        Ok(None) => Ok(Some(d.into_usize())),
566        Ok(Some(d2)) if d2 == d.into_usize() => Ok(Some(d2)),
567        _ => Err(()),
568    }) else {
569        return Err(RandomProjectionCodecError::CorruptedNumberOfComponents);
570    };
571
572    if d2 != d {
573        return Err(RandomProjectionCodecError::NumberOfFeaturesMismatch {
574            metadata: d,
575            output: d2,
576        });
577    }
578
579    // extract the projected data, excluding the metadata column
580    let projected = projected.slice_move(s!(.., ..k));
581
582    match projection {
583        RandomProjectionKind::Gaussian => reconstruct_into(
584            projected,
585            reconstructed,
586            |x, y| gaussian_project(x, y, seed),
587            gaussian_normaliser(k),
588        ),
589        RandomProjectionKind::Sparse { density } => {
590            let density = density_or_ping_li_minimum(*density, d);
591            reconstruct_into(
592                projected,
593                reconstructed,
594                |x, y| sparse_project(x, y, density, seed),
595                sparse_normaliser(k, density),
596            )
597        }
598    }
599}
600
601#[expect(clippy::needless_pass_by_value)]
602/// Applies the (approximate) inverse of random projection to the `projected`
603/// array to reconstruct the input data outputs into the `reconstructed` array.
604///
605/// The random projection matrix is defined by the `projection` function
606/// `$(i, j) \rightarrow P[i, j]$` and a globally applied `normalizer` factor.
607///
608/// # Errors
609///
610/// Errors with
611/// - [`RandomProjectionCodecError::NumberOfSamplesMismatch`] if the number of
612///   samples `$N$` of the `projected` array don't match the number of samples
613///   of the `reconstructed` array
614/// - [`RandomProjectionCodecError::NonFiniteData`] if the `projected` array or
615///   the reconstructed output contains non-finite data
616pub fn reconstruct_into<T: FloatExt, S: Data<Elem = T>>(
617    projected: ArrayBase<S, Ix2>,
618    mut reconstructed: ArrayViewMut<T, Ix2>,
619    projection: impl Fn(usize, usize) -> T,
620    normalizer: T,
621) -> Result<(), RandomProjectionCodecError> {
622    let (n, k) = projected.dim();
623    let (n2, _d) = reconstructed.dim();
624
625    if n2 != n {
626        return Err(RandomProjectionCodecError::NumberOfSamplesMismatch {
627            input: n,
628            output: n2,
629        });
630    }
631
632    let mut skip_projection_row = Vec::with_capacity(k);
633
634    for (l, reconstructed_l) in reconstructed.columns_mut().into_iter().enumerate() {
635        // materialize one row of the projection matrix transpose
636        // i.e. instead of A x B = C, compute A x [bs] = [cs].T
637        // the row is stored in a sparse representation [(j, p)]
638        //   where j is the index of the non-zero entry p
639        skip_projection_row.clear();
640        for j in 0..k {
641            let p = projection(l, j);
642            if !p.is_zero() {
643                skip_projection_row.push((j, p));
644            }
645        }
646
647        for (projected_i, reconstructed_il) in projected.rows().into_iter().zip(reconstructed_l) {
648            let mut acc = T::ZERO;
649
650            #[expect(clippy::indexing_slicing)]
651            // projected_i is of shape (k) and all j's are in 0..k
652            for &(j, p) in &skip_projection_row {
653                acc += projected_i[j] * p;
654            }
655
656            *reconstructed_il = acc * normalizer;
657        }
658    }
659
660    if !Zip::from(reconstructed).all(|x| x.is_finite()) {
661        return Err(RandomProjectionCodecError::NonFiniteData);
662    }
663
664    Ok(())
665}
666
667/// Find a 'safe' number of components `$K$` to randomly project to.
668///
669/// The minimum number of components to guarantee the `$\epsilon$`-embedding is
670/// given by
671/// `$K \ge \frac{4 \cdot \ln(N)}{\frac{{\epsilon}^{2}}{2} - \frac{{\epsilon}^{3}}{3}}$`.
672///
673/// The implementation is adapted from [`sklearn`]'s.
674///
675/// [`sklearn`]: https://github.com/scikit-learn/scikit-learn/blob/3b39d7cb957ab781744b346c1848be9db3f4e221/sklearn/random_projection.py#L56-L142
676#[must_use]
677pub fn johnson_lindenstrauss_min_k(
678    n_samples: usize,
679    OpenClosedUnit(epsilon): OpenClosedUnit<f64>,
680) -> usize {
681    #[expect(clippy::suboptimal_flops)]
682    let denominator = (epsilon * epsilon * 0.5) - (epsilon * epsilon * epsilon / 3.0);
683    #[expect(clippy::cast_precision_loss)]
684    let min_k = (n_samples as f64).ln() * 4.0 / denominator;
685    #[expect(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
686    let min_k = min_k as usize;
687    min_k
688}
689
690/// Extract the provided `density` if it is `Some(_)`, or compute the minimum
691/// required density `$\frac{1}{\sqrt{d}}$` as recommended by Li et al [^3].
692///
693/// [^3]: Li, P., Hastie, T. J., and Church, K. W. (2006). Very sparse
694///       random projections. In *Proceedings of the 12th ACM SIGKDD
695///       international conference on Knowledge discovery and data
696///       mining (KDD '06)*. Association for Computing Machinery, New
697///       York, NY, USA, 287–296. Available from:
698///       [doi:10.1145/1150402.1150436](https://doi.org/10.1145/1150402.1150436).
699#[must_use]
700pub fn density_or_ping_li_minimum<T: FloatExt>(
701    density: Option<OpenClosedUnit<f64>>,
702    d: usize,
703) -> T {
704    match density {
705        Some(OpenClosedUnit(density)) => T::from_f64(density),
706        None => T::from_usize(d).sqrt().recip(),
707    }
708}
709
710/// Sample from `$\text{N}(0, 1)$` at the coordinate `$(x, y)$` with the random
711/// `seed`
712fn gaussian_project<T: FloatExt>(x: usize, y: usize, seed: u64) -> T {
713    let (ClosedOpenUnit(u0), OpenClosedUnit(u1)) = T::u01x2(hash_matrix_index(x, y, seed));
714
715    let theta = -T::TAU() * u0;
716    let r = (-T::TWO * u1.ln()).sqrt();
717
718    r * theta.sin()
719}
720
721fn gaussian_normaliser<T: FloatExt>(k: usize) -> T {
722    T::from_usize(k).sqrt().recip()
723}
724
725/// Sample from `${ -1, 0, +1 }$` with probabilities
726/// `${ 0.5 \cdot density, 1 - density, 0.5 \cdot density }$` at the coordinate
727/// `$(x, y)$` with the random `seed`
728fn sparse_project<T: FloatExt>(x: usize, y: usize, density: T, seed: u64) -> T {
729    let (ClosedOpenUnit(u0), _u1) = T::u01x2(hash_matrix_index(x, y, seed));
730
731    if u0 < (density * T::HALF) {
732        -T::ONE
733    } else if u0 < density {
734        T::ONE
735    } else {
736        T::ZERO
737    }
738}
739
740fn sparse_normaliser<T: FloatExt>(k: usize, density: T) -> T {
741    (T::from_usize(k) * density).recip().sqrt()
742}
743
744const fn hash_matrix_index(x: usize, y: usize, seed: u64) -> u64 {
745    seahash_diffuse(seahash_diffuse(x as u64) ^ seed ^ (y as u64))
746}
747
748const fn seahash_diffuse(mut x: u64) -> u64 {
749    // SeaHash diffusion function
750    // https://docs.rs/seahash/4.1.0/src/seahash/helper.rs.html#75-92
751
752    // These are derived from the PCG RNG's round. Thanks to @Veedrac for proposing
753    // this. The basic idea is that we use dynamic shifts, which are determined
754    // by the input itself. The shift is chosen by the higher bits, which means
755    // that changing those flips the lower bits, which scatters upwards because
756    // of the multiplication.
757
758    x = x.wrapping_mul(0x6eed_0e9d_a4d9_4a4f);
759
760    let a = x >> 32;
761    let b = x >> 60;
762
763    x ^= a >> b;
764
765    x = x.wrapping_mul(0x6eed_0e9d_a4d9_4a4f);
766
767    x
768}
769
770#[expect(clippy::derive_partial_eq_without_eq)] // floats are not Eq
771#[derive(Copy, Clone, PartialEq, PartialOrd, Hash)]
772/// Floating point number in `$[0.0, 1.0)$`
773pub struct ClosedOpenUnit<T: FloatExt>(T);
774
775#[expect(clippy::derive_partial_eq_without_eq)] // floats are not Eq
776#[derive(Copy, Clone, PartialEq, PartialOrd, Hash)]
777/// Floating point number in `$(0.0, 1.0]$`
778pub struct OpenClosedUnit<T: FloatExt>(T);
779
780impl Serialize for OpenClosedUnit<f64> {
781    fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
782        serializer.serialize_f64(self.0)
783    }
784}
785
786impl<'de> Deserialize<'de> for OpenClosedUnit<f64> {
787    fn deserialize<D: Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
788        let x = f64::deserialize(deserializer)?;
789
790        if x > 0.0 && x <= 1.0 {
791            Ok(Self(x))
792        } else {
793            Err(serde::de::Error::invalid_value(
794                serde::de::Unexpected::Float(x),
795                &"a value in (0.0, 1.0]",
796            ))
797        }
798    }
799}
800
801impl JsonSchema for OpenClosedUnit<f64> {
802    fn schema_name() -> Cow<'static, str> {
803        Cow::Borrowed("OpenClosedUnitF64")
804    }
805
806    fn schema_id() -> Cow<'static, str> {
807        Cow::Borrowed(concat!(module_path!(), "::", "OpenClosedUnit<f64>"))
808    }
809
810    fn json_schema(_gen: &mut SchemaGenerator) -> Schema {
811        json_schema!({
812            "type": "number",
813            "exclusiveMinimum": 0.0,
814            "maximum": 1.0,
815        })
816    }
817}
818
819/// Floating point types.
820pub trait FloatExt: Float + ConstZero + ConstOne + FloatConst + AddAssign {
821    /// `$0.5$`
822    const HALF: Self;
823    /// `$2.0$`
824    const TWO: Self;
825
826    /// Converts from a [`f64`].
827    #[must_use]
828    fn from_f64(x: f64) -> Self;
829
830    /// Converts from a [`usize`].
831    #[must_use]
832    fn from_usize(n: usize) -> Self;
833
834    /// Converts into a [`usize`].
835    #[must_use]
836    fn into_usize(self) -> usize;
837
838    /// Generates two uniform random numbers from a random `hash` value.
839    ///
840    /// The first is sampled from `$[0.0, 1.0)$`, the second from
841    /// `$(0.0, 1.0]$`.
842    #[must_use]
843    fn u01x2(hash: u64) -> (ClosedOpenUnit<Self>, OpenClosedUnit<Self>);
844}
845
846impl FloatExt for f32 {
847    const HALF: Self = 0.5;
848    const TWO: Self = 2.0;
849
850    #[expect(clippy::cast_possible_truncation)]
851    fn from_f64(x: f64) -> Self {
852        x as Self
853    }
854
855    #[expect(clippy::cast_precision_loss)]
856    fn from_usize(n: usize) -> Self {
857        n as Self
858    }
859
860    #[expect(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
861    fn into_usize(self) -> usize {
862        self as usize
863    }
864
865    fn u01x2(hash: u64) -> (ClosedOpenUnit<Self>, OpenClosedUnit<Self>) {
866        #[expect(clippy::cast_possible_truncation)] // split u64 into (u32, u32)
867        let (low, high) = (
868            (hash & u64::from(u32::MAX)) as u32,
869            ((hash >> 32) & u64::from(u32::MAX)) as u32,
870        );
871
872        // https://prng.di.unimi.it -> Generating uniform doubles in the unit interval [0.0, 1.0)
873        // the hash is shifted to only cover the mantissa
874        #[expect(clippy::cast_precision_loss)]
875        let u0 = ClosedOpenUnit(((high >> 8) as Self) * Self::from_bits(0x3380_0000_u32)); // 0x1.0p-24
876
877        // https://prng.di.unimi.it -> Generating uniform doubles in the unit interval (0.0, 1.0]
878        // the hash is shifted to only cover the mantissa
879        #[expect(clippy::cast_precision_loss)]
880        let u1 = OpenClosedUnit((((low >> 8) + 1) as Self) * Self::from_bits(0x3380_0000_u32)); // 0x1.0p-24
881
882        (u0, u1)
883    }
884}
885
886impl FloatExt for f64 {
887    const HALF: Self = 0.5;
888    const TWO: Self = 2.0;
889
890    fn from_f64(x: f64) -> Self {
891        x
892    }
893
894    #[expect(clippy::cast_precision_loss)]
895    fn from_usize(n: usize) -> Self {
896        n as Self
897    }
898
899    #[expect(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
900    fn into_usize(self) -> usize {
901        self as usize
902    }
903
904    fn u01x2(hash: u64) -> (ClosedOpenUnit<Self>, OpenClosedUnit<Self>) {
905        // https://prng.di.unimi.it -> Generating uniform doubles in the unit interval [0.0, 1.0)
906        // the hash is shifted to only cover the mantissa
907        #[expect(clippy::cast_precision_loss)]
908        let u0 =
909            ClosedOpenUnit(((hash >> 11) as Self) * Self::from_bits(0x3CA0_0000_0000_0000_u64)); // 0x1.0p-53
910
911        let hash = seahash_diffuse(hash + 1);
912
913        // https://prng.di.unimi.it -> Generating uniform doubles in the unit interval (0.0, 1.0]
914        // the hash is shifted to only cover the mantissa
915        #[expect(clippy::cast_precision_loss)]
916        let u1 = OpenClosedUnit(
917            (((hash >> 11) + 1) as Self) * Self::from_bits(0x3CA0_0000_0000_0000_u64),
918        ); // 0x1.0p-53
919
920        (u0, u1)
921    }
922}
923
924#[cfg(test)]
925#[expect(clippy::unwrap_used, clippy::expect_used)]
926mod tests {
927    use ndarray_rand::RandomExt;
928    use ndarray_rand::rand_distr::{Distribution, Normal};
929
930    use super::*;
931
932    #[test]
933    fn gaussian_f32() {
934        test_error_decline::<f32>(
935            (100, 100),
936            Normal::new(42.0, 24.0).unwrap(),
937            42,
938            RandomProjectionKind::Gaussian,
939        );
940    }
941
942    #[test]
943    fn gaussian_f64() {
944        test_error_decline::<f64>(
945            (100, 100),
946            Normal::new(42.0, 24.0).unwrap(),
947            42,
948            RandomProjectionKind::Gaussian,
949        );
950    }
951
952    #[test]
953    fn achlioptas_sparse_f32() {
954        test_error_decline::<f32>(
955            (100, 100),
956            Normal::new(42.0, 24.0).unwrap(),
957            42,
958            RandomProjectionKind::Sparse {
959                density: Some(OpenClosedUnit(1.0 / 3.0)),
960            },
961        );
962    }
963
964    #[test]
965    fn achlioptas_sparse_f64() {
966        test_error_decline::<f64>(
967            (100, 100),
968            Normal::new(42.0, 24.0).unwrap(),
969            42,
970            RandomProjectionKind::Sparse {
971                density: Some(OpenClosedUnit(1.0 / 3.0)),
972            },
973        );
974    }
975
976    #[test]
977    fn ping_li_sparse_f32() {
978        test_error_decline::<f32>(
979            (100, 100),
980            Normal::new(42.0, 24.0).unwrap(),
981            42,
982            RandomProjectionKind::Sparse { density: None },
983        );
984    }
985
986    #[test]
987    fn ping_li_sparse_f64() {
988        test_error_decline::<f64>(
989            (100, 100),
990            Normal::new(42.0, 24.0).unwrap(),
991            42,
992            RandomProjectionKind::Sparse { density: None },
993        );
994    }
995
996    #[expect(clippy::needless_pass_by_value)]
997    fn test_error_decline<T: FloatExt + std::fmt::Display>(
998        shape: (usize, usize),
999        distribution: impl Distribution<T>,
1000        seed: u64,
1001        projection: RandomProjectionKind,
1002    ) {
1003        let data = Array::<T, Ix2>::random(shape, distribution);
1004
1005        let mut max_rmse = rmse(
1006            &data,
1007            &roundtrip(&data, 42, OpenClosedUnit(1.0), &projection),
1008        );
1009
1010        for epsilon in [
1011            OpenClosedUnit(0.5),
1012            OpenClosedUnit(0.1),
1013            OpenClosedUnit(0.01),
1014        ] {
1015            let new_rmse = rmse(&data, &roundtrip(&data, seed, epsilon, &projection));
1016            assert!(
1017                new_rmse <= max_rmse,
1018                "{new_rmse} > {max_rmse} for {epsilon}",
1019                epsilon = epsilon.0
1020            );
1021            max_rmse = new_rmse;
1022        }
1023    }
1024
1025    fn roundtrip<T: FloatExt>(
1026        data: &Array<T, Ix2>,
1027        seed: u64,
1028        epsilon: OpenClosedUnit<f64>,
1029        projection: &RandomProjectionKind,
1030    ) -> Array<T, Ix2> {
1031        let projected = project_with_projection(
1032            data.view(),
1033            seed,
1034            &RandomProjectionReduction::JohnsonLindenstrauss { epsilon },
1035            projection,
1036        )
1037        .expect("projecting must not fail");
1038        let reconstructed = reconstruct_with_projection(projected, seed, projection)
1039            .expect("reconstruction must not fail");
1040        #[expect(clippy::let_and_return)]
1041        reconstructed
1042    }
1043
1044    fn rmse<T: FloatExt>(data: &Array<T, Ix2>, reconstructed: &Array<T, Ix2>) -> T {
1045        let mut err = T::ZERO;
1046
1047        for (&d, &r) in data.iter().zip(reconstructed) {
1048            err += (d - r) * (d - r);
1049        }
1050
1051        let mse = err * T::from_usize(data.len()).recip();
1052        mse.sqrt()
1053    }
1054}
numcodecs_random_projection/lib.rs

numcodecs_random_projection/
lib.rs