Initial commit
This commit is contained in:
commit
fa8eae3146
17 changed files with 141328 additions and 0 deletions
315
src/transforms.rs
Normal file
315
src/transforms.rs
Normal file
|
|
@ -0,0 +1,315 @@
|
|||
use crate::float::Float;
|
||||
use crate::linear_algebra::{ColumnEfficientMatrix, Vector};
|
||||
use rand_distr::Normal;
|
||||
|
||||
/// Half the squared Euclidean distance between `xs` and `ys`.
///
/// Returns `0.5 * Σ (x_i - y_i)^2`; the `0.5` factor makes the gradient
/// exactly `x - y` (see `gradient_l2_error_mut`).
pub fn l2_error(xs: &[Float], ys: &[Float]) -> Float {
    let sum_of_squares: Float = xs
        .iter()
        .zip(ys)
        .map(|(x, y)| {
            let d = x - y;
            d * d
        })
        .sum();
    0.5 * sum_of_squares
}
|
||||
|
||||
/// Gradient of `l2_error` with respect to `xs`, written into `out`.
///
/// Since `l2_error = 0.5 * Σ (x_i - y_i)^2`, the partial derivative for
/// component `i` is simply `x_i - y_i`.
///
/// # Panics
/// Panics if `out` is shorter than the number of `(x, y)` pairs — and does
/// so *before* writing anything, instead of panicking mid-loop after a
/// partial mutation of `out` like the previous indexed version.
pub fn gradient_l2_error_mut(xs: &[Float], ys: &[Float], out: &mut [Float]) {
    let n = xs.len().min(ys.len());
    // Fail fast; this single check also lets the compiler elide the
    // per-element bounds checks that `out[i]` paid on every iteration.
    assert!(out.len() >= n, "output buffer too short for l2 gradient");
    for ((o, x), y) in out[..n].iter_mut().zip(xs).zip(ys) {
        *o = x - y;
    }
}
|
||||
|
||||
fn sigmoid(x: Float) -> Float {
|
||||
1.0 / (1.0 + (-x).exp())
|
||||
}
|
||||
|
||||
/// Activation of a sigmoid unit: the neuron's output is the logistic
/// sigmoid of its potential (pre-activation value). Thin alias kept so the
/// transform code reads in terms of potentials/activations.
fn activation_of_sigmoid_potential(potential: Float) -> Float {
    sigmoid(potential)
}
|
||||
|
||||
fn vectorized_sigmoid_activation_of_potential(potentials: &[Float], out: &mut [Float]) {
|
||||
for (potential, y) in potentials.iter().zip(out) {
|
||||
*y = activation_of_sigmoid_potential(*potential)
|
||||
}
|
||||
}
|
||||
|
||||
fn derivative_of_sigmoid_activation_of_potential(potential: Float) -> Float {
|
||||
let s = sigmoid(potential);
|
||||
s * (1.0 - s)
|
||||
}
|
||||
|
||||
fn vectorized_gradient_of_sigmoid_activation_of_potential_mut(
|
||||
potentials: &[Float],
|
||||
derivatives_state: &mut [Float],
|
||||
output_gradient: &[Float],
|
||||
input_gradient: &mut [Float],
|
||||
) {
|
||||
use crate::linear_algebra::DiagonalMatrix;
|
||||
for (potential, state) in potentials.iter().zip(derivatives_state.iter_mut()) {
|
||||
*state = derivative_of_sigmoid_activation_of_potential(*potential)
|
||||
}
|
||||
DiagonalMatrix::new(derivatives_state).apply_mut(output_gradient, input_gradient);
|
||||
}
|
||||
|
||||
// =====cross-entropy=====
|
||||
// takes in two probability distributions
|
||||
fn cross_entropy(p: &[Float], q: &[Float]) -> Float {
|
||||
let mut result = 0.0;
|
||||
for (x, y) in p.iter().zip(q) {
|
||||
result += y * x.ln()
|
||||
}
|
||||
-result
|
||||
}
|
||||
|
||||
// Second probability distribution is deterministic,
// i.e. it deterministically outputs the same value.
/// Cross-entropy against a one-hot distribution concentrated at index `q`:
/// reduces to `−ln(p[q])`. Panics if `q >= p.len()`.
fn cross_entropy_simple(p: &[Float], q: usize) -> Float {
    -p[q].ln()
}
|
||||
|
||||
/// Gradient of `cross_entropy` with respect to its first argument `p`:
/// `out[i] = −q[i] / p[i]`.
fn cross_entropy_derivative_mut(p: &[Float], q: &[Float], out: &mut [Float]) {
    for ((c, a), b) in out.iter_mut().zip(p).zip(q) {
        *c = -b / a;
    }
}
|
||||
|
||||
// Second probability distribution is deterministic,
|
||||
// i.e. it deterministically outputs the same value.
|
||||
pub fn cross_entropy_derivative_simple(p: &[Float], q: usize, out: &mut [Float]) {
|
||||
// TODO: Do we really need to reset everything to besides the q-th index?
|
||||
for a in out.iter_mut() {
|
||||
*a = 0.0;
|
||||
}
|
||||
out[q] = -1.0 / p[q];
|
||||
}
|
||||
|
||||
fn softmax_mut(input: &[Float], out: &mut [Float]) {
|
||||
let mut s = 0.0;
|
||||
for (x, y) in input.iter().zip(out.iter_mut()) {
|
||||
let e = x.exp();
|
||||
*y = e;
|
||||
s += e
|
||||
}
|
||||
|
||||
for y in out {
|
||||
*y /= s
|
||||
}
|
||||
}
|
||||
|
||||
fn softmax_gradient_mut(
|
||||
softmax_output: &[Float],
|
||||
gradient_output: &[Float],
|
||||
gradient_input: &mut [Float],
|
||||
) {
|
||||
for (j, dx) in gradient_input.iter_mut().enumerate() {
|
||||
*dx = 0.0;
|
||||
for (i, dy) in gradient_output.iter().enumerate() {
|
||||
// Note that the gradient matrix is symmetric, so don't worry about the order of
|
||||
// indices
|
||||
*dx += softmax_output[i] * (if i == j { 1.0 } else { 0.0 } - softmax_output[j]) * dy
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// relu
|
||||
fn relu_mut(input: &[Float], out: &mut [Float]) {
|
||||
for (x, y) in input.iter().zip(out.iter_mut()) {
|
||||
*y = x.max(0.0)
|
||||
}
|
||||
}
|
||||
|
||||
/// Backpropagates through ReLU: the gradient passes through wherever the
/// input was strictly positive and is zeroed elsewhere.
fn relu_gradient_mut(input: &[Float], gradient_output: &[Float], gradient_input: &mut [Float]) {
    for ((x, dx), dy) in input.iter().zip(gradient_input).zip(gradient_output) {
        *dx = if *x > 0.0 { *dy } else { 0.0 };
    }
}
|
||||
|
||||
// =====sigmoid=====
/// Fully-connected layer followed by an element-wise sigmoid activation.
///
/// Owns its scratch buffers so forward/backward passes allocate nothing.
#[derive(Debug)]
pub struct SigmoidTransform {
    pub weight: ColumnEfficientMatrix,
    // potential = W[input], cached by `output_mut` for the backward pass.
    potential_vector: Vector,
    // Scratch holding σ'(potential); filled by `potential_gradient_mut`.
    derivatives_state: Vector,
    // Gradient of the loss w.r.t. the potential; filled by `potential_gradient_mut`.
    potential_gradient: Vector,

    // Accumulated (added-into, not overwritten) gradient w.r.t. `weight`.
    pub weight_gradient: ColumnEfficientMatrix,
}
|
||||
|
||||
impl SigmoidTransform {
    /// Creates a layer mapping `input_dimension` → `output_dimension`.
    ///
    /// Weights are drawn from a normal distribution with mean 0 and
    /// standard deviation `1/sqrt(input_dimension)`; scratch buffers start
    /// zeroed.
    pub fn new(input_dimension: usize, output_dimension: usize) -> Self {
        let mean = 0.0;
        let std_dev = 1.0 / (input_dimension as Float).sqrt();
        let normal_distr = Normal::new(mean, std_dev).unwrap();

        Self {
            weight: ColumnEfficientMatrix::random_with_normal_distribution(
                input_dimension,
                output_dimension,
                normal_distr,
            ),
            potential_vector: Vector::zero(output_dimension),
            derivatives_state: Vector::zero(output_dimension), // TODO: Can I get rid of this?
            potential_gradient: Vector::zero(output_dimension),

            weight_gradient: ColumnEfficientMatrix::zero(input_dimension, output_dimension),
        }
    }

    /// Forward pass: `output = σ(W · input)`.
    /// Caches the potential vector for the subsequent backward pass.
    pub fn output_mut(&mut self, input: &[Float], output: &mut [Float]) {
        self.weight.apply_mut(input, &mut self.potential_vector[..]); // potential = W[input]
        vectorized_sigmoid_activation_of_potential(&self.potential_vector[..], output);
        // y = f(potential)
    }

    // Note below that (1) and (2) are independent, but they both depend on (0).

    // (0)
    /// Backward pass, step (0): converts the gradient w.r.t. this layer's
    /// output into the gradient w.r.t. its potential. Call after
    /// `output_mut` and before steps (1) and (2).
    pub fn potential_gradient_mut(&mut self, output_gradient: &[Float]) {
        // updates the potential gradient
        vectorized_gradient_of_sigmoid_activation_of_potential_mut(
            &self.potential_vector[..],
            &mut self.derivatives_state[..],
            output_gradient,
            &mut self.potential_gradient[..],
        ); // potential_gradient = grad[f](potential)[output_gradient]
    }

    // Note that it makes sense to have the two gradients split, since for the input layer we will
    // not need to compute the input gradient, only the weight gradient is important.
    // WARNING: You need to call `potential_gradient_mut` before using the below function
    // (1)
    /// Backward pass, step (1): gradient w.r.t. this layer's input — what
    /// the previous layer receives as its output gradient.
    pub fn gradient_with_respect_to_input_mut(&self, input_gradient: &mut [Float]) {
        // updates the input gradient
        //
        // Note that the first column of `self.weight` is the bias,
        // and the previous layer doesn't care about its gradient.
        // So we just return the gradient below the first component of the input
        // by dropping the bias column.
        self.weight
            .drop_first_column_coapply_mut(&self.potential_gradient[..], input_gradient);
        // transpose[T without the first column][potential_gradient]
    }

    // Note that the proof ensures that the `potential_gradient` has been updated
    // WARNING: You need to call `potential_gradient_mut` before using the below function
    // (2)
    /// Backward pass, step (2): accumulates (adds, does not overwrite) the
    /// outer product `potential_gradient ⊗ input` into `weight_gradient`.
    pub fn add_gradient_with_respect_to_weights_mut(&mut self, input: &[Float]) {
        use crate::linear_algebra::VectorTensorCovectorMatrix;

        let matrix = VectorTensorCovectorMatrix::new(&self.potential_gradient[..], input); // grad[f](potential)[output_grad] **tensor** input
        matrix.add_to_mut(&mut self.weight_gradient);
    }
}
|
||||
|
||||
// =====softmax=====
/// Fully-connected layer followed by a softmax over its outputs.
///
/// Owns its scratch buffers so forward/backward passes allocate nothing.
#[derive(Debug)]
pub struct SoftmaxTransform {
    pub weight: ColumnEfficientMatrix,
    // potential = W[input], cached by `output_mut`.
    potential_vector: Vector,
    softmax_output: Vector, // Used for computation of the softmax gradient
    // Gradient of the loss w.r.t. the potential; filled by `potential_gradient_mut`.
    potential_gradient: Vector,

    // Accumulated (added-into, not overwritten) gradient w.r.t. `weight`.
    pub weight_gradient: ColumnEfficientMatrix,
}
|
||||
|
||||
impl SoftmaxTransform {
    /// Creates a layer mapping `input_dimension` → `output_dimension`.
    ///
    /// Weights are drawn from a normal distribution with mean 0 and
    /// standard deviation `1/sqrt(input_dimension)`; scratch buffers start
    /// zeroed.
    pub fn new(input_dimension: usize, output_dimension: usize) -> Self {
        let mean = 0.0;
        let std_dev = 1.0 / (input_dimension as Float).sqrt();
        let normal_distr = Normal::new(mean, std_dev).unwrap();

        Self {
            weight: ColumnEfficientMatrix::random_with_normal_distribution(
                input_dimension,
                output_dimension,
                normal_distr,
            ),
            potential_vector: Vector::zero(output_dimension),
            softmax_output: Vector::zero(output_dimension),
            potential_gradient: Vector::zero(output_dimension),

            weight_gradient: ColumnEfficientMatrix::zero(input_dimension, output_dimension),
        }
    }

    /// Forward pass: `output = softmax(W · input)`.
    /// Also caches a copy of the softmax output, which the backward pass
    /// needs to build the softmax Jacobian.
    pub fn output_mut(&mut self, input: &[Float], output: &mut [Float]) {
        self.weight.apply_mut(input, &mut self.potential_vector[..]); // potential = W[input]
        softmax_mut(&self.potential_vector[..], output); // y = f(potential)
        self.softmax_output.copy_from_slice(output)
    }

    /// Backward pass, step (0): converts the gradient w.r.t. this layer's
    /// output into the gradient w.r.t. its potential. Call after
    /// `output_mut` and before the two gradient methods below.
    pub fn potential_gradient_mut(&mut self, output_gradient: &[Float]) {
        softmax_gradient_mut(
            &self.softmax_output[..],
            output_gradient,
            &mut self.potential_gradient[..],
        ); // potential_gradient = grad[softmax](potential)[output_gradient]
    }

    /// Backward pass: gradient w.r.t. this layer's input. The first weight
    /// column (the bias) is dropped since the previous layer has no use for
    /// its gradient. Requires `potential_gradient_mut` to have been called.
    pub fn gradient_with_respect_to_input_mut(&self, input_gradient: &mut [Float]) {
        self.weight
            .drop_first_column_coapply_mut(&self.potential_gradient[..], input_gradient);
        // transpose[T without the first column][potential_gradient]
    }

    /// Backward pass: accumulates (adds, does not overwrite) the outer
    /// product `potential_gradient ⊗ input` into `weight_gradient`.
    /// Requires `potential_gradient_mut` to have been called.
    pub fn add_gradient_with_respect_to_weights_mut(&mut self, input: &[Float]) {
        use crate::linear_algebra::VectorTensorCovectorMatrix;

        let matrix = VectorTensorCovectorMatrix::new(&self.potential_gradient[..], input); // grad[f](potential)[output_grad] **tensor** input
        matrix.add_to_mut(&mut self.weight_gradient);
    }
}
|
||||
|
||||
/// Fully-connected layer followed by an element-wise ReLU activation.
///
/// Owns its scratch buffers so forward/backward passes allocate nothing.
/// Unlike the sigmoid layer, no separate derivatives buffer is needed:
/// the ReLU gradient is read directly off the cached potential vector.
#[derive(Debug)]
pub struct ReluTransform {
    pub weight: ColumnEfficientMatrix,
    // potential = W[input], cached by `output_mut` for the backward pass.
    potential_vector: Vector,
    // Gradient of the loss w.r.t. the potential; filled by `potential_gradient_mut`.
    potential_gradient: Vector,

    // Accumulated (added-into, not overwritten) gradient w.r.t. `weight`.
    pub weight_gradient: ColumnEfficientMatrix,
}
|
||||
|
||||
impl ReluTransform {
    /// Creates a layer mapping `input_dimension` → `output_dimension`.
    ///
    /// Weights are drawn from a normal distribution with mean 0 and
    /// standard deviation `1/sqrt(input_dimension)`; scratch buffers start
    /// zeroed.
    pub fn new(input_dimension: usize, output_dimension: usize) -> Self {
        let mean = 0.0;
        let std_dev = 1.0 / (input_dimension as Float).sqrt();
        let normal_distr = Normal::new(mean, std_dev).unwrap();

        Self {
            weight: ColumnEfficientMatrix::random_with_normal_distribution(
                input_dimension,
                output_dimension,
                normal_distr,
            ),
            potential_vector: Vector::zero(output_dimension),
            potential_gradient: Vector::zero(output_dimension),

            weight_gradient: ColumnEfficientMatrix::zero(input_dimension, output_dimension),
        }
    }

    /// Forward pass: `output = relu(W · input)`.
    /// Caches the potential vector for the subsequent backward pass.
    pub fn output_mut(&mut self, input: &[Float], output: &mut [Float]) {
        self.weight.apply_mut(input, &mut self.potential_vector[..]); // potential = W[input]
        relu_mut(&self.potential_vector[..], output); // y = f(potential)
    }

    /// Backward pass, step (0): converts the gradient w.r.t. this layer's
    /// output into the gradient w.r.t. its potential. Call after
    /// `output_mut` and before the two gradient methods below.
    pub fn potential_gradient_mut(&mut self, output_gradient: &[Float]) {
        relu_gradient_mut(
            &self.potential_vector[..],
            output_gradient,
            &mut self.potential_gradient[..],
        ); // potential_gradient = grad[relu](potential)[output_gradient]
    }

    /// Backward pass: gradient w.r.t. this layer's input. The first weight
    /// column (the bias) is dropped since the previous layer has no use for
    /// its gradient. Requires `potential_gradient_mut` to have been called.
    pub fn gradient_with_respect_to_input_mut(&self, input_gradient: &mut [Float]) {
        self.weight
            .drop_first_column_coapply_mut(&self.potential_gradient[..], input_gradient);
        // transpose[T without the first column][potential_gradient]
    }

    /// Backward pass: accumulates (adds, does not overwrite) the outer
    /// product `potential_gradient ⊗ input` into `weight_gradient`.
    /// Requires `potential_gradient_mut` to have been called.
    pub fn add_gradient_with_respect_to_weights_mut(&mut self, input: &[Float]) {
        use crate::linear_algebra::VectorTensorCovectorMatrix;

        let matrix = VectorTensorCovectorMatrix::new(&self.potential_gradient[..], input); // grad[f](potential)[output_grad] **tensor** input
        matrix.add_to_mut(&mut self.weight_gradient);
    }
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue