Initial commit
This commit is contained in:
commit
fa8eae3146
17 changed files with 141328 additions and 0 deletions
315
src/transforms.rs
Normal file
315
src/transforms.rs
Normal file
|
|
@ -0,0 +1,315 @@
|
|||
use crate::float::Float;
|
||||
use crate::linear_algebra::{ColumnEfficientMatrix, Vector};
|
||||
use rand_distr::Normal;
|
||||
|
||||
/// Half the squared Euclidean distance between `xs` and `ys`.
///
/// Returns `0.5 * Σ (x_i - y_i)^2`; the `0.5` factor makes the gradient
/// exactly `x - y` (see `gradient_l2_error_mut`).
pub fn l2_error(xs: &[Float], ys: &[Float]) -> Float {
    let sum_of_squares: Float = xs
        .iter()
        .zip(ys)
        .map(|(x, y)| {
            let d = x - y;
            d * d
        })
        .sum();
    0.5 * sum_of_squares
}
|
||||
|
||||
/// Gradient of `l2_error` with respect to `xs`, written into `out`.
///
/// Since `l2_error = 0.5 * Σ (x_i - y_i)^2`, the partial derivative for
/// component `i` is simply `x_i - y_i`.
///
/// # Panics
/// Panics if `out` is shorter than the number of `(x, y)` pairs — and does
/// so *before* writing anything, instead of panicking mid-loop after a
/// partial mutation of `out` like the previous indexed version.
pub fn gradient_l2_error_mut(xs: &[Float], ys: &[Float], out: &mut [Float]) {
    let n = xs.len().min(ys.len());
    // Fail fast; this single check also lets the compiler elide the
    // per-element bounds checks that `out[i]` paid on every iteration.
    assert!(out.len() >= n, "output buffer too short for l2 gradient");
    for ((o, x), y) in out[..n].iter_mut().zip(xs).zip(ys) {
        *o = x - y;
    }
}
|
||||
|
||||
fn sigmoid(x: Float) -> Float {
|
||||
1.0 / (1.0 + (-x).exp())
|
||||
}
|
||||
|
||||
/// Activation of a sigmoid unit: the neuron's output is the logistic
/// sigmoid of its potential (pre-activation value). Thin alias kept so the
/// transform code reads in terms of potentials/activations.
fn activation_of_sigmoid_potential(potential: Float) -> Float {
    sigmoid(potential)
}
|
||||
|
||||
fn vectorized_sigmoid_activation_of_potential(potentials: &[Float], out: &mut [Float]) {
|
||||
for (potential, y) in potentials.iter().zip(out) {
|
||||
*y = activation_of_sigmoid_potential(*potential)
|
||||
}
|
||||
}
|
||||
|
||||
fn derivative_of_sigmoid_activation_of_potential(potential: Float) -> Float {
|
||||
let s = sigmoid(potential);
|
||||
s * (1.0 - s)
|
||||
}
|
||||
|
||||
fn vectorized_gradient_of_sigmoid_activation_of_potential_mut(
|
||||
potentials: &[Float],
|
||||
derivatives_state: &mut [Float],
|
||||
output_gradient: &[Float],
|
||||
input_gradient: &mut [Float],
|
||||
) {
|
||||
use crate::linear_algebra::DiagonalMatrix;
|
||||
for (potential, state) in potentials.iter().zip(derivatives_state.iter_mut()) {
|
||||
*state = derivative_of_sigmoid_activation_of_potential(*potential)
|
||||
}
|
||||
DiagonalMatrix::new(derivatives_state).apply_mut(output_gradient, input_gradient);
|
||||
}
|
||||
|
||||
// =====cross-entropy=====
|
||||
// takes in two probability distributions
|
||||
fn cross_entropy(p: &[Float], q: &[Float]) -> Float {
|
||||
let mut result = 0.0;
|
||||
for (x, y) in p.iter().zip(q) {
|
||||
result += y * x.ln()
|
||||
}
|
||||
-result
|
||||
}
|
||||
|
||||
// Second probability distribution is deterministic,
// i.e. it deterministically outputs the same value.
/// Cross-entropy against a one-hot distribution concentrated at index `q`:
/// reduces to `−ln(p[q])`. Panics if `q >= p.len()`.
fn cross_entropy_simple(p: &[Float], q: usize) -> Float {
    -p[q].ln()
}
|
||||
|
||||
/// Gradient of `cross_entropy` with respect to its first argument `p`:
/// `out[i] = −q[i] / p[i]`.
fn cross_entropy_derivative_mut(p: &[Float], q: &[Float], out: &mut [Float]) {
    for ((c, a), b) in out.iter_mut().zip(p).zip(q) {
        *c = -b / a;
    }
}
|
||||
|
||||
// Second probability distribution is deterministic,
|
||||
// i.e. it deterministically outputs the same value.
|
||||
pub fn cross_entropy_derivative_simple(p: &[Float], q: usize, out: &mut [Float]) {
|
||||
// TODO: Do we really need to reset everything to besides the q-th index?
|
||||
for a in out.iter_mut() {
|
||||
*a = 0.0;
|
||||
}
|
||||
out[q] = -1.0 / p[q];
|
||||
}
|
||||
|
||||
fn softmax_mut(input: &[Float], out: &mut [Float]) {
|
||||
let mut s = 0.0;
|
||||
for (x, y) in input.iter().zip(out.iter_mut()) {
|
||||
let e = x.exp();
|
||||
*y = e;
|
||||
s += e
|
||||
}
|
||||
|
||||
for y in out {
|
||||
*y /= s
|
||||
}
|
||||
}
|
||||
|
||||
fn softmax_gradient_mut(
|
||||
softmax_output: &[Float],
|
||||
gradient_output: &[Float],
|
||||
gradient_input: &mut [Float],
|
||||
) {
|
||||
for (j, dx) in gradient_input.iter_mut().enumerate() {
|
||||
*dx = 0.0;
|
||||
for (i, dy) in gradient_output.iter().enumerate() {
|
||||
// Note that the gradient matrix is symmetric, so don't worry about the order of
|
||||
// indices
|
||||
*dx += softmax_output[i] * (if i == j { 1.0 } else { 0.0 } - softmax_output[j]) * dy
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// relu
|
||||
fn relu_mut(input: &[Float], out: &mut [Float]) {
|
||||
for (x, y) in input.iter().zip(out.iter_mut()) {
|
||||
*y = x.max(0.0)
|
||||
}
|
||||
}
|
||||
|
||||
/// Backpropagates through ReLU: the gradient passes through wherever the
/// input was strictly positive and is zeroed elsewhere.
fn relu_gradient_mut(input: &[Float], gradient_output: &[Float], gradient_input: &mut [Float]) {
    for ((x, dx), dy) in input.iter().zip(gradient_input).zip(gradient_output) {
        *dx = if *x > 0.0 { *dy } else { 0.0 };
    }
}
|
||||
|
||||
// =====sigmoid=====
/// Fully-connected layer followed by an element-wise sigmoid activation.
///
/// Owns its scratch buffers so forward/backward passes allocate nothing.
#[derive(Debug)]
pub struct SigmoidTransform {
    pub weight: ColumnEfficientMatrix,
    // potential = W[input], cached by `output_mut` for the backward pass.
    potential_vector: Vector,
    // Scratch holding σ'(potential); filled by `potential_gradient_mut`.
    derivatives_state: Vector,
    // Gradient of the loss w.r.t. the potential; filled by `potential_gradient_mut`.
    potential_gradient: Vector,

    // Accumulated (added-into, not overwritten) gradient w.r.t. `weight`.
    pub weight_gradient: ColumnEfficientMatrix,
}
|
||||
|
||||
impl SigmoidTransform {
    /// Creates a layer mapping `input_dimension` → `output_dimension`.
    ///
    /// Weights are drawn from a normal distribution with mean 0 and
    /// standard deviation `1/sqrt(input_dimension)`; scratch buffers start
    /// zeroed.
    pub fn new(input_dimension: usize, output_dimension: usize) -> Self {
        let mean = 0.0;
        let std_dev = 1.0 / (input_dimension as Float).sqrt();
        let normal_distr = Normal::new(mean, std_dev).unwrap();

        Self {
            weight: ColumnEfficientMatrix::random_with_normal_distribution(
                input_dimension,
                output_dimension,
                normal_distr,
            ),
            potential_vector: Vector::zero(output_dimension),
            derivatives_state: Vector::zero(output_dimension), // TODO: Can I get rid of this?
            potential_gradient: Vector::zero(output_dimension),

            weight_gradient: ColumnEfficientMatrix::zero(input_dimension, output_dimension),
        }
    }

    /// Forward pass: `output = σ(W · input)`.
    /// Caches the potential vector for the subsequent backward pass.
    pub fn output_mut(&mut self, input: &[Float], output: &mut [Float]) {
        self.weight.apply_mut(input, &mut self.potential_vector[..]); // potential = W[input]
        vectorized_sigmoid_activation_of_potential(&self.potential_vector[..], output);
        // y = f(potential)
    }

    // Note below that (1) and (2) are independent, but they both depend on (0).

    // (0)
    /// Backward pass, step (0): converts the gradient w.r.t. this layer's
    /// output into the gradient w.r.t. its potential. Call after
    /// `output_mut` and before steps (1) and (2).
    pub fn potential_gradient_mut(&mut self, output_gradient: &[Float]) {
        // updates the potential gradient
        vectorized_gradient_of_sigmoid_activation_of_potential_mut(
            &self.potential_vector[..],
            &mut self.derivatives_state[..],
            output_gradient,
            &mut self.potential_gradient[..],
        ); // potential_gradient = grad[f](potential)[output_gradient]
    }

    // Note that it makes sense to have the two gradients split, since for the input layer we will
    // not need to compute the input gradient, only the weight gradient is important.
    // WARNING: You need to call `potential_gradient_mut` before using the below function
    // (1)
    /// Backward pass, step (1): gradient w.r.t. this layer's input — what
    /// the previous layer receives as its output gradient.
    pub fn gradient_with_respect_to_input_mut(&self, input_gradient: &mut [Float]) {
        // updates the input gradient
        //
        // Note that the first column of `self.weight` is the bias,
        // and the previous layer doesn't care about its gradient.
        // So we just return the gradient below the first component of the input
        // by dropping the bias column.
        self.weight
            .drop_first_column_coapply_mut(&self.potential_gradient[..], input_gradient);
        // transpose[T without the first column][potential_gradient]
    }

    // Note that the proof ensures that the `potential_gradient` has been updated
    // WARNING: You need to call `potential_gradient_mut` before using the below function
    // (2)
    /// Backward pass, step (2): accumulates (adds, does not overwrite) the
    /// outer product `potential_gradient ⊗ input` into `weight_gradient`.
    pub fn add_gradient_with_respect_to_weights_mut(&mut self, input: &[Float]) {
        use crate::linear_algebra::VectorTensorCovectorMatrix;

        let matrix = VectorTensorCovectorMatrix::new(&self.potential_gradient[..], input); // grad[f](potential)[output_grad] **tensor** input
        matrix.add_to_mut(&mut self.weight_gradient);
    }
}
|
||||
|
||||
// =====softmax=====
/// Fully-connected layer followed by a softmax over its outputs.
///
/// Owns its scratch buffers so forward/backward passes allocate nothing.
#[derive(Debug)]
pub struct SoftmaxTransform {
    pub weight: ColumnEfficientMatrix,
    // potential = W[input], cached by `output_mut`.
    potential_vector: Vector,
    softmax_output: Vector, // Used for computation of the softmax gradient
    // Gradient of the loss w.r.t. the potential; filled by `potential_gradient_mut`.
    potential_gradient: Vector,

    // Accumulated (added-into, not overwritten) gradient w.r.t. `weight`.
    pub weight_gradient: ColumnEfficientMatrix,
}
|
||||
|
||||
impl SoftmaxTransform {
    /// Creates a layer mapping `input_dimension` → `output_dimension`.
    ///
    /// Weights are drawn from a normal distribution with mean 0 and
    /// standard deviation `1/sqrt(input_dimension)`; scratch buffers start
    /// zeroed.
    pub fn new(input_dimension: usize, output_dimension: usize) -> Self {
        let mean = 0.0;
        let std_dev = 1.0 / (input_dimension as Float).sqrt();
        let normal_distr = Normal::new(mean, std_dev).unwrap();

        Self {
            weight: ColumnEfficientMatrix::random_with_normal_distribution(
                input_dimension,
                output_dimension,
                normal_distr,
            ),
            potential_vector: Vector::zero(output_dimension),
            softmax_output: Vector::zero(output_dimension),
            potential_gradient: Vector::zero(output_dimension),

            weight_gradient: ColumnEfficientMatrix::zero(input_dimension, output_dimension),
        }
    }

    /// Forward pass: `output = softmax(W · input)`.
    /// Also caches a copy of the softmax output, which the backward pass
    /// needs to build the softmax Jacobian.
    pub fn output_mut(&mut self, input: &[Float], output: &mut [Float]) {
        self.weight.apply_mut(input, &mut self.potential_vector[..]); // potential = W[input]
        softmax_mut(&self.potential_vector[..], output); // y = f(potential)
        self.softmax_output.copy_from_slice(output)
    }

    /// Backward pass, step (0): converts the gradient w.r.t. this layer's
    /// output into the gradient w.r.t. its potential. Call after
    /// `output_mut` and before the two gradient methods below.
    pub fn potential_gradient_mut(&mut self, output_gradient: &[Float]) {
        softmax_gradient_mut(
            &self.softmax_output[..],
            output_gradient,
            &mut self.potential_gradient[..],
        ); // potential_gradient = grad[softmax](potential)[output_gradient]
    }

    /// Backward pass: gradient w.r.t. this layer's input. The first weight
    /// column (the bias) is dropped since the previous layer has no use for
    /// its gradient. Requires `potential_gradient_mut` to have been called.
    pub fn gradient_with_respect_to_input_mut(&self, input_gradient: &mut [Float]) {
        self.weight
            .drop_first_column_coapply_mut(&self.potential_gradient[..], input_gradient);
        // transpose[T without the first column][potential_gradient]
    }

    /// Backward pass: accumulates (adds, does not overwrite) the outer
    /// product `potential_gradient ⊗ input` into `weight_gradient`.
    /// Requires `potential_gradient_mut` to have been called.
    pub fn add_gradient_with_respect_to_weights_mut(&mut self, input: &[Float]) {
        use crate::linear_algebra::VectorTensorCovectorMatrix;

        let matrix = VectorTensorCovectorMatrix::new(&self.potential_gradient[..], input); // grad[f](potential)[output_grad] **tensor** input
        matrix.add_to_mut(&mut self.weight_gradient);
    }
}
|
||||
|
||||
/// Fully-connected layer followed by an element-wise ReLU activation.
///
/// Owns its scratch buffers so forward/backward passes allocate nothing.
/// Unlike the sigmoid layer, no separate derivatives buffer is needed:
/// the ReLU gradient is read directly off the cached potential vector.
#[derive(Debug)]
pub struct ReluTransform {
    pub weight: ColumnEfficientMatrix,
    // potential = W[input], cached by `output_mut` for the backward pass.
    potential_vector: Vector,
    // Gradient of the loss w.r.t. the potential; filled by `potential_gradient_mut`.
    potential_gradient: Vector,

    // Accumulated (added-into, not overwritten) gradient w.r.t. `weight`.
    pub weight_gradient: ColumnEfficientMatrix,
}
|
||||
|
||||
impl ReluTransform {
    /// Creates a layer mapping `input_dimension` → `output_dimension`.
    ///
    /// Weights are drawn from a normal distribution with mean 0 and
    /// standard deviation `1/sqrt(input_dimension)`; scratch buffers start
    /// zeroed.
    pub fn new(input_dimension: usize, output_dimension: usize) -> Self {
        let mean = 0.0;
        let std_dev = 1.0 / (input_dimension as Float).sqrt();
        let normal_distr = Normal::new(mean, std_dev).unwrap();

        Self {
            weight: ColumnEfficientMatrix::random_with_normal_distribution(
                input_dimension,
                output_dimension,
                normal_distr,
            ),
            potential_vector: Vector::zero(output_dimension),
            potential_gradient: Vector::zero(output_dimension),

            weight_gradient: ColumnEfficientMatrix::zero(input_dimension, output_dimension),
        }
    }

    /// Forward pass: `output = relu(W · input)`.
    /// Caches the potential vector for the subsequent backward pass.
    pub fn output_mut(&mut self, input: &[Float], output: &mut [Float]) {
        self.weight.apply_mut(input, &mut self.potential_vector[..]); // potential = W[input]
        relu_mut(&self.potential_vector[..], output); // y = f(potential)
    }

    /// Backward pass, step (0): converts the gradient w.r.t. this layer's
    /// output into the gradient w.r.t. its potential. Call after
    /// `output_mut` and before the two gradient methods below.
    pub fn potential_gradient_mut(&mut self, output_gradient: &[Float]) {
        relu_gradient_mut(
            &self.potential_vector[..],
            output_gradient,
            &mut self.potential_gradient[..],
        ); // potential_gradient = grad[relu](potential)[output_gradient]
    }

    /// Backward pass: gradient w.r.t. this layer's input. The first weight
    /// column (the bias) is dropped since the previous layer has no use for
    /// its gradient. Requires `potential_gradient_mut` to have been called.
    pub fn gradient_with_respect_to_input_mut(&self, input_gradient: &mut [Float]) {
        self.weight
            .drop_first_column_coapply_mut(&self.potential_gradient[..], input_gradient);
        // transpose[T without the first column][potential_gradient]
    }

    /// Backward pass: accumulates (adds, does not overwrite) the outer
    /// product `potential_gradient ⊗ input` into `weight_gradient`.
    /// Requires `potential_gradient_mut` to have been called.
    pub fn add_gradient_with_respect_to_weights_mut(&mut self, input: &[Float]) {
        use crate::linear_algebra::VectorTensorCovectorMatrix;

        let matrix = VectorTensorCovectorMatrix::new(&self.potential_gradient[..], input); // grad[f](potential)[output_grad] **tensor** input
        matrix.add_to_mut(&mut self.weight_gradient);
    }
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue