use crate::float::Float; use crate::linear_algebra::{ColumnEfficientMatrix, Vector}; use rand_distr::Normal; pub fn l2_error(xs: &[Float], ys: &[Float]) -> Float { // err-squared let mut result = 0.0; for (x, y) in xs.iter().zip(ys) { let s = x - y; result += s * s; } 0.5 * result } pub fn gradient_l2_error_mut(xs: &[Float], ys: &[Float], out: &mut [Float]) { for (i, (x, y)) in xs.iter().zip(ys).enumerate() { out[i] = x - y; } } fn sigmoid(x: Float) -> Float { 1.0 / (1.0 + (-x).exp()) } fn activation_of_sigmoid_potential(potential: Float) -> Float { sigmoid(potential) } fn vectorized_sigmoid_activation_of_potential(potentials: &[Float], out: &mut [Float]) { for (potential, y) in potentials.iter().zip(out) { *y = activation_of_sigmoid_potential(*potential) } } fn derivative_of_sigmoid_activation_of_potential(potential: Float) -> Float { let s = sigmoid(potential); s * (1.0 - s) } fn vectorized_gradient_of_sigmoid_activation_of_potential_mut( potentials: &[Float], derivatives_state: &mut [Float], output_gradient: &[Float], input_gradient: &mut [Float], ) { use crate::linear_algebra::DiagonalMatrix; for (potential, state) in potentials.iter().zip(derivatives_state.iter_mut()) { *state = derivative_of_sigmoid_activation_of_potential(*potential) } DiagonalMatrix::new(derivatives_state).apply_mut(output_gradient, input_gradient); } // =====cross-entropy===== // takes in two probability distributions fn cross_entropy(p: &[Float], q: &[Float]) -> Float { let mut result = 0.0; for (x, y) in p.iter().zip(q) { result += y * x.ln() } -result } // Second probability distribution is deterministic, // i.e. it deterministically outputs the same value. fn cross_entropy_simple(p: &[Float], q: usize) -> Float { -p[q].ln() } fn cross_entropy_derivative_mut(p: &[Float], q: &[Float], out: &mut [Float]) { for ((a, b), c) in p.iter().zip(q).zip(out) { *c = -b / a } } // Second probability distribution is deterministic, // i.e. it deterministically outputs the same value. pub fn cross_entropy_derivative_simple(p: &[Float], q: usize, out: &mut [Float]) { // TODO: Do we really need to reset everything to besides the q-th index? for a in out.iter_mut() { *a = 0.0; } out[q] = -1.0 / p[q]; } fn softmax_mut(input: &[Float], out: &mut [Float]) { let mut s = 0.0; for (x, y) in input.iter().zip(out.iter_mut()) { let e = x.exp(); *y = e; s += e } for y in out { *y /= s } } fn softmax_gradient_mut( softmax_output: &[Float], gradient_output: &[Float], gradient_input: &mut [Float], ) { for (j, dx) in gradient_input.iter_mut().enumerate() { *dx = 0.0; for (i, dy) in gradient_output.iter().enumerate() { // Note that the gradient matrix is symmetric, so don't worry about the order of // indices *dx += softmax_output[i] * (if i == j { 1.0 } else { 0.0 } - softmax_output[j]) * dy } } } // relu fn relu_mut(input: &[Float], out: &mut [Float]) { for (x, y) in input.iter().zip(out.iter_mut()) { *y = x.max(0.0) } } fn relu_gradient_mut(input: &[Float], gradient_output: &[Float], gradient_input: &mut [Float]) { for ((dy, dx), x) in gradient_output.iter().zip(gradient_input).zip(input) { *dx = if *x > 0.0 { *dy } else { 0.0 } } } // =====sigmoid===== #[derive(Debug)] pub struct SigmoidTransform { pub weight: ColumnEfficientMatrix, potential_vector: Vector, derivatives_state: Vector, potential_gradient: Vector, pub weight_gradient: ColumnEfficientMatrix, } impl SigmoidTransform { pub fn new(input_dimension: usize, output_dimension: usize) -> Self { let mean = 0.0; let std_dev = 1.0 / (input_dimension as Float).sqrt(); let normal_distr = Normal::new(mean, std_dev).unwrap(); Self { weight: ColumnEfficientMatrix::random_with_normal_distribution( input_dimension, output_dimension, normal_distr, ), potential_vector: Vector::zero(output_dimension), derivatives_state: Vector::zero(output_dimension), // TODO: Can I get rid of this? potential_gradient: Vector::zero(output_dimension), weight_gradient: ColumnEfficientMatrix::zero(input_dimension, output_dimension), } } pub fn output_mut(&mut self, input: &[Float], output: &mut [Float]) { self.weight.apply_mut(input, &mut self.potential_vector[..]); // potential = W[input] vectorized_sigmoid_activation_of_potential(&self.potential_vector[..], output); // y = f(potential) } // Note below that (1) and (2) are independent, but they both depend on (0). // (0) pub fn potential_gradient_mut(&mut self, output_gradient: &[Float]) { // updates the potential gradient vectorized_gradient_of_sigmoid_activation_of_potential_mut( &self.potential_vector[..], &mut self.derivatives_state[..], output_gradient, &mut self.potential_gradient[..], ); // potential_gradient = grad[f](potential)[output_gradient] } // Note that it makes sense to have the two gradients split, since for the input layer we will // not need to compute the input gradient, only the weight gradient is important. // WARNING: You need to call `potential_gradient_mut` before using the below function // (1) pub fn gradient_with_respect_to_input_mut(&self, input_gradient: &mut [Float]) { // updates the input gradient // // Note that the first column of `self.weight` is the bias, // and the previous layer doesn't care about its gradient. // So we just return the gradient below the first component of the input // by dropping the bias column. self.weight .drop_first_column_coapply_mut(&self.potential_gradient[..], input_gradient); // transpose[T without the first column][potential_gradient] } // Note that the proof ensures that the `potential_gradient` has been updated // WARNING: You need to call `potential_gradient_mut` before using the below function // (2) pub fn add_gradient_with_respect_to_weights_mut(&mut self, input: &[Float]) { use crate::linear_algebra::VectorTensorCovectorMatrix; let matrix = VectorTensorCovectorMatrix::new(&self.potential_gradient[..], input); // grad[f](potential)[output_grad] **tensor** input matrix.add_to_mut(&mut self.weight_gradient); } } // =====softmax===== #[derive(Debug)] pub struct SoftmaxTransform { pub weight: ColumnEfficientMatrix, potential_vector: Vector, softmax_output: Vector, // Used for computation of the softmax gradient potential_gradient: Vector, pub weight_gradient: ColumnEfficientMatrix, } impl SoftmaxTransform { pub fn new(input_dimension: usize, output_dimension: usize) -> Self { let mean = 0.0; let std_dev = 1.0 / (input_dimension as Float).sqrt(); let normal_distr = Normal::new(mean, std_dev).unwrap(); Self { weight: ColumnEfficientMatrix::random_with_normal_distribution( input_dimension, output_dimension, normal_distr, ), potential_vector: Vector::zero(output_dimension), softmax_output: Vector::zero(output_dimension), potential_gradient: Vector::zero(output_dimension), weight_gradient: ColumnEfficientMatrix::zero(input_dimension, output_dimension), } } pub fn output_mut(&mut self, input: &[Float], output: &mut [Float]) { self.weight.apply_mut(input, &mut self.potential_vector[..]); // potential = W[input] softmax_mut(&self.potential_vector[..], output); // y = f(potential) self.softmax_output.copy_from_slice(output) } pub fn potential_gradient_mut(&mut self, output_gradient: &[Float]) { softmax_gradient_mut( &self.softmax_output[..], output_gradient, &mut self.potential_gradient[..], ); // potential_gradient = grad[softmax](potential)[output_gradient] } pub fn gradient_with_respect_to_input_mut(&self, input_gradient: &mut [Float]) { self.weight .drop_first_column_coapply_mut(&self.potential_gradient[..], input_gradient); // transpose[T without the first column][potential_gradient] } pub fn add_gradient_with_respect_to_weights_mut(&mut self, input: &[Float]) { use crate::linear_algebra::VectorTensorCovectorMatrix; let matrix = VectorTensorCovectorMatrix::new(&self.potential_gradient[..], input); // grad[f](potential)[output_grad] **tensor** input matrix.add_to_mut(&mut self.weight_gradient); } } #[derive(Debug)] pub struct ReluTransform { pub weight: ColumnEfficientMatrix, potential_vector: Vector, potential_gradient: Vector, pub weight_gradient: ColumnEfficientMatrix, } impl ReluTransform { pub fn new(input_dimension: usize, output_dimension: usize) -> Self { let mean = 0.0; let std_dev = 1.0 / (input_dimension as Float).sqrt(); let normal_distr = Normal::new(mean, std_dev).unwrap(); Self { weight: ColumnEfficientMatrix::random_with_normal_distribution( input_dimension, output_dimension, normal_distr, ), potential_vector: Vector::zero(output_dimension), potential_gradient: Vector::zero(output_dimension), weight_gradient: ColumnEfficientMatrix::zero(input_dimension, output_dimension), } } pub fn output_mut(&mut self, input: &[Float], output: &mut [Float]) { self.weight.apply_mut(input, &mut self.potential_vector[..]); // potential = W[input] relu_mut(&self.potential_vector[..], output); // y = f(potential) } pub fn potential_gradient_mut(&mut self, output_gradient: &[Float]) { relu_gradient_mut( &self.potential_vector[..], output_gradient, &mut self.potential_gradient[..], ); // potential_gradient = grad[softmax](potential)[output_gradient] } pub fn gradient_with_respect_to_input_mut(&self, input_gradient: &mut [Float]) { self.weight .drop_first_column_coapply_mut(&self.potential_gradient[..], input_gradient); // transpose[T without the first column][potential_gradient] } pub fn add_gradient_with_respect_to_weights_mut(&mut self, input: &[Float]) { use crate::linear_algebra::VectorTensorCovectorMatrix; let matrix = VectorTensorCovectorMatrix::new(&self.potential_gradient[..], input); // grad[f](potential)[output_grad] **tensor** input matrix.add_to_mut(&mut self.weight_gradient); } }