Initial commit

This commit is contained in:
Yura Dupyn 2026-03-20 11:48:07 +01:00
commit fa8eae3146
17 changed files with 141328 additions and 0 deletions

1
.gitignore vendored Normal file
View file

@ -0,0 +1 @@
target/

108
Cargo.lock generated Normal file
View file

@ -0,0 +1,108 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3
[[package]]
name = "autocfg"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
[[package]]
name = "cfg-if"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "getrandom"
version = "0.2.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fe9006bed769170c11f845cf00c7c1e9092aeb3f268e007c3e760ac68008070f"
dependencies = [
"cfg-if",
"libc",
"wasi",
]
[[package]]
name = "libc"
version = "0.2.150"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "89d92a4743f9a61002fae18374ed11e7973f530cb3a3255fb354818118b2203c"
[[package]]
name = "libm"
version = "0.2.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058"
[[package]]
name = "num-traits"
version = "0.2.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "39e3200413f237f41ab11ad6d161bc7239c84dcb631773ccd7de3dfe4b5c267c"
dependencies = [
"autocfg",
"libm",
]
[[package]]
name = "ppv-lite86"
version = "0.2.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"
[[package]]
name = "pv021_project"
version = "0.1.0"
dependencies = [
"rand",
"rand_distr",
]
[[package]]
name = "rand"
version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
dependencies = [
"libc",
"rand_chacha",
"rand_core",
]
[[package]]
name = "rand_chacha"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
dependencies = [
"ppv-lite86",
"rand_core",
]
[[package]]
name = "rand_core"
version = "0.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
dependencies = [
"getrandom",
]
[[package]]
name = "rand_distr"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32cb0b9bc82b0a0876c2dd994a7e7a2683d3e7390ca40e6886785ef0c7e3ee31"
dependencies = [
"num-traits",
"rand",
]
[[package]]
name = "wasi"
version = "0.11.0+wasi-snapshot-preview1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"

10
Cargo.toml Normal file
View file

@ -0,0 +1,10 @@
[package]
name = "pv021_project"
version = "0.1.0"
edition = "2018"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
rand = "0.8.5"
rand_distr = "0.4.3"

BIN
OVERVIEW.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 649 KiB

16
README.md Normal file
View file

@ -0,0 +1,16 @@
Rust Deep Learning
School Project I did for a Neural Networks Class at the end of 2023:
- Implements a neural net (backpropagation / multilayer perceptron) in Rust
- Constraint: Can't use any Linear Algebra libraries or frameworks
- Training/Dataset: Fashion-MNIST dataset (achieves about 91% accuracy in less than 10 min of training)
See `OVERVIEW.png` for the underlying math I came up with to do backprop and organize memory.
# Dataset
Fashion MNIST (https://arxiv.org/pdf/1708.07747.pdf). Dataset of images consisting of a training set of 60,000 examples and a test set of 10,000 examples. Each example is a 28x28 grayscale image, associated with a label from 10 classes. The dataset is in CSV format:
- `fashion_mnist_train_vectors.csv` - training input vectors
- `fashion_mnist_test_vectors.csv` - testing input vectors
- `fashion_mnist_train_labels.csv` - training labels
- `fashion_mnist_test_labels.csv` - testing labels

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

16
src/env.rs Normal file
View file

@ -0,0 +1,16 @@
// Image geometry of the Fashion-MNIST inputs: 28x28 grayscale pixels per image.
pub const NUMBER_OF_PIXELS_PER_IMAGE: usize = 28 * 28;
// (labels path, vectors path) pairs for the test / training CSV files.
pub const TEST_CSVS: (&str, &str) = (
    "data/fashion_mnist_test_labels.csv",
    "data/fashion_mnist_test_vectors.csv",
);
pub const TRAIN_CSVS: (&str, &str) = (
    "data/fashion_mnist_train_labels.csv",
    "data/fashion_mnist_train_vectors.csv",
);
// Output files written after training: one predicted label per line.
pub const TEST_PREDICTIONS_CSV: &str = "test_predictions.csv";
pub const TRAIN_PREDICTIONS_CSV: &str = "train_predictions.csv";
// Output files for the full per-class probability distributions.
pub const TEST_DISTRIBUTIONS_CSV: &str = "test_distributions.csv";
pub const TRAIN_DISTRIBUTIONS_CSV: &str = "train_distributions.csv";

1
src/float.rs Normal file
View file

@ -0,0 +1 @@
// Scalar type used throughout the whole network; change it here to switch
// precision globally. (Original author's note: f64 vs f32 made no observable
// difference in accuracy.)
pub type Float = f32; // f64 or f32, doesn't seem to make any difference

305
src/linear_algebra.rs Normal file
View file

@ -0,0 +1,305 @@
mod raw_operation {
    use crate::float::Float;

    /// Overwrites every component of `out` with 0.
    pub fn zero(out: &mut [Float]) {
        out.iter_mut().for_each(|y| *y = 0.0);
    }

    /// Dot product over the common prefix of `xs` and `ys`.
    pub fn inner_product(xs: &[Float], ys: &[Float]) -> Float {
        xs.iter().zip(ys).map(|(x, y)| x * y).sum()
    }

    /// Componentwise scaling: `out = k * xs`.
    pub fn scale(xs: &[Float], k: Float, out: &mut [Float]) {
        for (y, x) in out.iter_mut().zip(xs) {
            *y = x * k;
        }
    }

    /// Componentwise sum: `out = xs + ys`.
    pub fn add(xs: &[Float], ys: &[Float], out: &mut [Float]) {
        for ((z, x), y) in out.iter_mut().zip(xs).zip(ys) {
            *z = x + y;
        }
    }

    /// Componentwise difference: `out = xs - ys`.
    pub fn sub(xs: &[Float], ys: &[Float], out: &mut [Float]) {
        for ((z, x), y) in out.iter_mut().zip(xs).zip(ys) {
            *z = x - y;
        }
    }
}
use crate::float::Float;
use rand_distr::{Distribution, Normal};
use std::iter::FromIterator;
use std::ops::{Index, IndexMut};
use std::slice::{Iter, IterMut, SliceIndex};
// Thin newtype over `Vec<Float>` so slice indexing and iteration can be
// forwarded while keeping a domain-specific vector type.
#[derive(Clone, Debug)]
pub struct Vector(Vec<Float>);
// Forward every flavour of slice indexing (`v[i]`, `v[a..b]`, `v[..]`, ...)
// straight to the underlying `Vec`.
impl<Idx> Index<Idx> for Vector
where
    Idx: SliceIndex<[Float]>,
{
    type Output = Idx::Output;
    fn index(&self, index: Idx) -> &Self::Output {
        &self.0[index]
    }
}
impl<Idx> IndexMut<Idx> for Vector
where
    Idx: SliceIndex<[Float]>,
{
    fn index_mut(&mut self, index: Idx) -> &mut Self::Output {
        &mut self.0[index]
    }
}
impl FromIterator<Float> for Vector {
    /// Collects an iterator of scalars into a `Vector`.
    fn from_iter<I: IntoIterator<Item = Float>>(iter: I) -> Self {
        Vector(iter.into_iter().collect())
    }
}
impl Vector {
    /// Immutable iterator over the components.
    pub fn iter(&self) -> Iter<'_, Float> {
        self.0.iter()
    }
    /// Mutable iterator over the components.
    pub fn iter_mut(&mut self) -> IterMut<'_, Float> {
        self.0.iter_mut()
    }
    /// Borrows the components as a slice.
    pub fn as_slice(&self) -> &[Float] {
        self.0.as_slice()
    }
    /// Borrows the components as a mutable slice.
    pub fn as_mut_slice(&mut self) -> &mut [Float] {
        self.0.as_mut_slice()
    }
    /// Copies `src` into this vector; panics if the lengths differ.
    pub fn copy_from_slice(&mut self, src: &[Float]) {
        self.0.copy_from_slice(src)
    }
    /// Consumes the wrapper and returns the underlying `Vec`.
    pub fn to_vec(self) -> Vec<Float> {
        self.0
    }
}
impl Vector {
    /// Wraps an owned `Vec` of scalars.
    pub fn new(vector: Vec<Float>) -> Self {
        Self(vector)
    }
    /// All-zero vector of the given dimension.
    pub fn zero(size: usize) -> Self {
        Self(vec![0.0; size])
    }
    /// Componentwise sum: `out = self + w`.
    pub fn add_mut(&self, w: &[Float], out: &mut [Float]) {
        raw_operation::add(&self[..], w, out)
    }
    /// Componentwise difference: `out = self - w`.
    ///
    /// BUGFIX: this previously delegated to `raw_operation::add`, so it
    /// silently computed a sum instead of a difference.
    pub fn sub_mut(&self, w: &[Float], out: &mut [Float]) {
        raw_operation::sub(&self[..], w, out)
    }
    /// Componentwise scaling: `out = k * self`.
    pub fn scale_mut(&self, k: Float, out: &mut [Float]) {
        raw_operation::scale(&self[..], k, out)
    }
    /// Dot product of `self` with `w`.
    pub fn inner_product(&self, w: &[Float]) -> Float {
        raw_operation::inner_product(&self[..], w)
    }
}
// Column-major matrix: `components` stores each column contiguously
// (`output_dimension` entries per column), which makes applying the
// transpose (`coapply_mut`) a sequence of cache-friendly inner products.
#[derive(Clone, Debug)]
pub struct ColumnEfficientMatrix {
    pub input_dimension: usize,  // number of columns
    pub output_dimension: usize, // number of rows
    pub components: Vec<Float>,
}
// Indexing is `(column, row)`, i.e. `(input index, output index)`.
impl Index<(usize, usize)> for ColumnEfficientMatrix {
    type Output = Float;
    fn index(&self, (column, row): (usize, usize)) -> &Self::Output {
        &self.components[column * self.output_dimension + row]
    }
}
impl IndexMut<(usize, usize)> for ColumnEfficientMatrix {
    fn index_mut(&mut self, (column, row): (usize, usize)) -> &mut Self::Output {
        &mut self.components[column * self.output_dimension + row]
    }
}
impl ColumnEfficientMatrix {
    /// Builds a matrix from row-major data: `rows[j][i]` is row `j`, column `i`.
    /// The data is transposed into the internal column-major storage.
    /// Assumes every row has the same length as `rows[0]` (panics otherwise).
    pub fn from_rows(rows: Vec<Vec<Float>>) -> Self {
        let output_dimension = rows.len();
        if output_dimension == 0 {
            Self {
                input_dimension: 0,
                output_dimension: 0,
                components: vec![],
            }
        } else {
            let input_dimension = rows[0].len();
            let mut components = Vec::with_capacity(input_dimension * output_dimension);
            for i in 0..input_dimension {
                for j in 0..output_dimension {
                    components.push(rows[j][i])
                }
            }
            Self {
                input_dimension,
                output_dimension,
                components,
            }
        }
    }
    /// Zero matrix of the given shape.
    pub fn zero(input_dimension: usize, output_dimension: usize) -> ColumnEfficientMatrix {
        Self {
            input_dimension,
            output_dimension,
            components: vec![0.0; input_dimension * output_dimension],
        }
    }
    /// Matrix with entries sampled i.i.d. from the given normal distribution.
    pub fn random_with_normal_distribution(
        input_dimension: usize,
        output_dimension: usize,
        distribution: Normal<Float>,
    ) -> ColumnEfficientMatrix {
        // PERF: acquire the thread-local RNG handle once instead of once per
        // sample (the original called `rand::thread_rng()` inside the loop).
        let mut rng = rand::thread_rng();
        let mut components = Vec::with_capacity(input_dimension * output_dimension);
        for _ in 0..input_dimension * output_dimension {
            components.push(distribution.sample(&mut rng))
        }
        Self {
            input_dimension,
            output_dimension,
            components,
        }
    }
    /// Resets every entry to zero (reused between training batches).
    pub fn zero_mut(&mut self) {
        for a in &mut self.components {
            *a = 0.0
        }
    }
    /// `c = self + k * b`, componentwise over the flat storage.
    pub fn add_scaled_mut(&self, k: Float, b: &Self, c: &mut Self) {
        for ((a, b), c) in self
            .components
            .iter()
            .zip(&b.components)
            .zip(&mut c.components)
        {
            *c = *a + *b * k
        }
    }
    /// `self += k * b`, componentwise.
    pub fn add_to_self_scaled_mut(&mut self, k: Float, b: &Self) {
        for (a, b) in self.components.iter_mut().zip(&b.components) {
            *a += *b * k
        }
    }
    /// Matrix-vector product: `out = self * v`.
    pub fn apply_mut(&self, v: &[Float], out: &mut [Float]) {
        for j in 0..self.output_dimension {
            let mut result: Float = 0.0;
            for i in 0..self.input_dimension {
                result += self[(i, j)] * v[i]
            }
            out[j] = result;
        }
    }
    // Apply the transpose: `out = transpose(self) * w`. Each column is stored
    // contiguously, so this is one inner product per column slice.
    pub fn coapply_mut(&self, w: &[Float], out: &mut [Float]) {
        for i in 0..self.input_dimension {
            let start_index = i * self.output_dimension;
            out[i] = raw_operation::inner_product(
                &self.components[start_index..start_index + self.output_dimension],
                w,
            );
        }
    }
    /// Like `coapply_mut`, but skips column 0 (the bias column) and writes the
    /// results shifted down by one index.
    pub fn drop_first_column_coapply_mut(&self, w: &[Float], out: &mut [Float]) {
        for i in 1..self.input_dimension {
            let start_index = i * self.output_dimension;
            out[i - 1] = raw_operation::inner_product(
                &self.components[start_index..start_index + self.output_dimension],
                w,
            );
        }
    }
}
// A square matrix that is zero off the diagonal, viewed through a borrowed
// slice of its diagonal entries.
#[derive(Clone, Debug)]
pub struct DiagonalMatrix<'a> {
    pub diagonal: &'a [Float],
}
impl<'a> DiagonalMatrix<'a> {
    /// Views `diagonal` as the diagonal of a square matrix.
    pub fn new(diagonal: &'a [Float]) -> DiagonalMatrix<'a> {
        Self { diagonal }
    }
    /// Componentwise product: `out[i] = diagonal[i] * v[i]`.
    pub fn apply_mut(&self, v: &[Float], out: &mut [Float]) {
        let products = self.diagonal.iter().zip(v).map(|(d, x)| d * x);
        for (i, p) in products.enumerate() {
            out[i] = p;
        }
    }
}
// Given vector `w : W` and a covector `f : V -> Float`,
// the linear map `w tensor f : V -> W` computes `v : V ~> f(v) * w`
// (a rank-one matrix represented by its two factors, no allocation needed).
#[derive(Clone, Debug)]
pub struct VectorTensorCovectorMatrix<'a> {
    pub output_vector: &'a [Float], // w : W
    pub input_covector: &'a [Float], // f : V -> Float
}
impl<'a> VectorTensorCovectorMatrix<'a> {
    /// Borrows the two factors of the rank-one matrix `w tensor f`.
    pub fn new(
        output_vector: &'a [Float],
        input_covector: &'a [Float],
    ) -> VectorTensorCovectorMatrix<'a> {
        Self {
            output_vector,
            input_covector,
        }
    }
    /// `out = f(v) * w` — one inner product followed by one scaling pass.
    pub fn apply_mut(&self, v: &[Float], out: &mut [Float]) {
        let scalar = raw_operation::inner_product(self.input_covector, v);
        raw_operation::scale(self.output_vector, scalar, out)
    }
    /// Accumulates the rank-one matrix into `matrix`:
    /// `matrix[(i, j)] += input_covector[i] * output_vector[j]`.
    pub fn add_to_mut(&self, matrix: &mut ColumnEfficientMatrix) {
        // TODO: Surely this can be optimized by iterating over the columns of the matrix directly
        for (i, x) in self.input_covector.iter().enumerate() {
            for (j, y) in self.output_vector.iter().enumerate() {
                matrix[(i, j)] += x * y
            }
        }
    }
}

76
src/main.rs Normal file
View file

@ -0,0 +1,76 @@
mod env;
mod float;
mod linear_algebra;
mod neural_network;
mod preprocessing;
mod transforms;
use preprocessing::{dataset_from_file, export_distributions_to, export_to};
use std::io;
use crate::neural_network::{NNPoint, NeuralNetworkInTraining, NeuralNetworkParameters};
// Entry point: load Fashion-MNIST CSVs, train the network, report accuracy,
// and export per-point predictions for both datasets.
fn main() -> Result<(), io::Error> {
    // The 60k training rows are split 50k/10k into train/validation;
    // the test set is the separate 10k-row file.
    let size_of_full_training_dataset = 60000;
    let size_of_training_dataset = 50000;
    let size_of_validation_dataset = 10000;
    let size_of_testing_dataset = 10000;
    println!("Importing datasets...");
    let full_training_dataset: Vec<NNPoint> =
        dataset_from_file(env::TRAIN_CSVS, size_of_full_training_dataset)?;
    // First 50k points are used for gradient updates...
    let mut training_dataset: Vec<NNPoint> = {
        let mut training_dataset = Vec::with_capacity(size_of_training_dataset);
        training_dataset.extend_from_slice(&full_training_dataset[0..size_of_training_dataset]);
        training_dataset
    };
    // ...and the remaining 10k are held out to track validation accuracy.
    let validation_dataset: Vec<NNPoint> = {
        let mut validation_dataset = Vec::with_capacity(size_of_validation_dataset);
        validation_dataset.extend_from_slice(
            &full_training_dataset
                [size_of_training_dataset..size_of_training_dataset + size_of_validation_dataset],
        );
        validation_dataset
    };
    let test_dataset: Vec<NNPoint> = dataset_from_file(env::TEST_CSVS, size_of_testing_dataset)?;
    // Architectures tried during development, kept for reference:
    // let mut nn = NeuralNetworkInTraining::new(vec![env::NUMBER_OF_PIXELS_PER_IMAGE, 70, 10]);
    // let mut nn = NeuralNetworkInTraining::new(vec![env::NUMBER_OF_PIXELS_PER_IMAGE, 150, 10]);
    // let mut nn = NeuralNetworkInTraining::new(vec![env::NUMBER_OF_PIXELS_PER_IMAGE, 60, 40, 10]);
    // let mut nn = NeuralNetworkInTraining::new(vec![env::NUMBER_OF_PIXELS_PER_IMAGE, 150, 10]);
    // let mut nn = NeuralNetworkInTraining::new(vec![env::NUMBER_OF_PIXELS_PER_IMAGE, 60, 40, 10]); // batch=20, rate=2 is pretty good. Seems to reliably reach 89%
    // let mut nn = NeuralNetworkInTraining::new(vec![env::NUMBER_OF_PIXELS_PER_IMAGE, 80, 60, 40, 10]); // batch=20, rate=2, pretty good like 88 % then suddenly drops to 10%
    // let mut nn = NeuralNetworkInTraining::new(vec![env::NUMBER_OF_PIXELS_PER_IMAGE, 150, 150, 10]); // this gets me over 90%, nice
    let mut nn = NeuralNetworkInTraining::new(vec![env::NUMBER_OF_PIXELS_PER_IMAGE, 150, 150, 10]);
    println!("Begin training");
    let params = NeuralNetworkParameters {
        epochs: 30,
        batch_size: 30,
        learning_rate: 2.00,
    };
    nn.train(params, &mut training_dataset, Some(&validation_dataset));
    // nn.train(params, &mut training_dataset, None);
    nn.show_accuracy_on(&full_training_dataset, "training");
    nn.show_accuracy_on(&test_dataset, "test");
    let predictions_on_full_training_set = nn.test(&full_training_dataset);
    let predictions_on_test_set = nn.test(&test_dataset);
    println!("Exporting to {}", env::TRAIN_PREDICTIONS_CSV);
    export_to(
        &predictions_on_full_training_set,
        env::TRAIN_PREDICTIONS_CSV,
    )?;
    println!("Exporting to {}", env::TEST_PREDICTIONS_CSV);
    export_to(&predictions_on_test_set, env::TEST_PREDICTIONS_CSV)?;
    // TODO: Comment this out
    // let distributions_on_full_training_set = nn.test_distributions(&full_training_dataset);
    // let distributions_on_test_set = nn.test_distributions(&test_dataset);
    // export_distributions_to(&distributions_on_full_training_set, env::TRAIN_DISTRIBUTIONS_CSV)?;
    // export_distributions_to(&distributions_on_test_set, env::TEST_DISTRIBUTIONS_CSV)?;
    Ok(())
}

336
src/neural_network.rs Normal file
View file

@ -0,0 +1,336 @@
use crate::float::Float;
use crate::linear_algebra::Vector;
use crate::transforms;
use crate::transforms::{ReluTransform, SoftmaxTransform};
// One labelled training/test example.
#[derive(Debug, Clone)]
pub struct NNPoint {
    // Class index, widened from the CSV's u8 label.
    label: usize,
    // Pixel values after `preprocessing::normalize_input` (zero-mean,
    // scaled by the root of the summed squares).
    normalized_image: Vec<Float>,
}
impl NNPoint {
    /// Wraps a raw label and an already-normalized image.
    pub fn new(label: u8, normalized_image: Vec<Float>) -> Self {
        Self {
            label: label as usize,
            normalized_image,
        }
    }
}
// The network plus all scratch buffers needed for forward and backward
// passes, preallocated once so training never allocates per point.
#[derive(Debug)]
pub struct NeuralNetworkInTraining {
    neurons_per_layer: Vec<usize>, // e.g. [784, 150, 150, 10]
    inputs: Vec<Vector>, // Each input will start with 1
    output: Vector, // final-layer probabilities (softmax output)
    input_gradients: Vec<Vector>, // per-layer error gradients, reused each pass
    transforms: Vec<ReluTransform>, // hidden layers
    output_transform: SoftmaxTransform, // final classification layer
}
// Hyper-parameters for one training run.
#[derive(Debug, Copy, Clone)]
pub struct NeuralNetworkParameters {
    pub epochs: usize,
    pub batch_size: usize,
    pub learning_rate: Float, // initial rate; decayed each epoch during training
}
impl NeuralNetworkParameters {
    /// One-line human-readable summary used in training progress logs.
    fn show(&self) -> String {
        format!(
            "epoch = {}, batch = {}, rate = {}",
            self.epochs, self.batch_size, self.learning_rate
        )
    }
}
impl NeuralNetworkInTraining {
    /// Allocates all layers, scratch buffers and randomly initialized
    /// transforms for the given architecture. Currently panics (`todo!`)
    /// when fewer than two layers are requested.
    pub fn new(neurons_per_layer: Vec<usize>) -> Self {
        // e.g. neurons_per_layer = [699, 79, 49, 19, 14, 10]
        // By a layer here we mean a collection of neurons that's between two neighbouring Transforms or the initial input or final input neurons.
        // Note that there are N + 1 layers where N is the number of Transforms used.
        if neurons_per_layer.len() < 2 {
            todo!()
        }
        let neurons_per_layer_except_last: Vec<usize> = neurons_per_layer
            .clone()
            .into_iter()
            .rev()
            .skip(1)
            .rev()
            .collect();
        let neurons_per_layer_except_first: Vec<usize> =
            neurons_per_layer.clone().into_iter().skip(1).collect();
        // Convention: the first component should always be 1.0 - this allows the first
        // column of the weight matrix to be interpreted as bias.
        let inputs: Vec<Vector> = neurons_per_layer_except_last
            .iter()
            .map(|neuron_count| {
                let mut v = Vector::zero(neuron_count + 1);
                v[0] = 1.0;
                v
            })
            .collect();
        let input_gradients: Vec<Vector> = neurons_per_layer_except_first
            .iter()
            .map(|neuron_count| Vector::zero(*neuron_count))
            .collect();
        // One ReLU transform between each pair of consecutive hidden layers;
        // the extra +1 input accounts for the constant bias component.
        let transforms: Vec<ReluTransform> = neurons_per_layer_except_last
            .iter()
            .zip(neurons_per_layer_except_last.iter().skip(1))
            .map(|(input_neuron_count, output_neuron_count)| {
                ReluTransform::new(*input_neuron_count + 1, *output_neuron_count)
            })
            .collect();
        let neurons_in_last_layer = *neurons_per_layer.last().unwrap();
        let neurons_in_next_to_last_layer = *neurons_per_layer_except_last.last().unwrap();
        let output = Vector::zero(neurons_in_last_layer);
        let output_transform =
            SoftmaxTransform::new(neurons_in_next_to_last_layer + 1, neurons_in_last_layer);
        Self {
            neurons_per_layer,
            inputs,
            output,
            input_gradients,
            transforms,
            output_transform,
        }
    }
    // Forward pass. You need to initialize inputs[0] before use.
    fn output_mut(&mut self) {
        // The following iterates over pairs of neighbouring inputs where we
        // have a mutable reference to both of them.
        // With Rust borrow-checking rules this can't be done directly,
        // so I had to resort to `split_at_mut(k)`
        for i in 0..self.inputs.len() - 1 {
            let (left_inputs, right_inputs) = self.inputs.split_at_mut(i + 1);
            let transform = &mut self.transforms[i];
            let input = &left_inputs[i][..];
            // Skip component 0: it stays fixed at 1.0 (the bias convention).
            let output = &mut right_inputs[0][1..];
            transform.output_mut(input, output);
        }
        self.output_transform.output_mut(
            &self.inputs[self.inputs.len() - 1][..],
            &mut self.output[..],
        );
    }
    // Backward pass: propagates gradients layer by layer and accumulates
    // weight gradients. Initialize input_gradients[-1] with error gradient before use.
    fn update_weights_mut(&mut self) {
        let last_index = self.input_gradients.len() - 1;
        // TODO: Last layer is no different. This should be part of the same loop
        {
            let transform = &mut self.output_transform;
            transform.potential_gradient_mut(&self.input_gradients[last_index][..]);
            transform
                .gradient_with_respect_to_input_mut(&mut self.input_gradients[last_index - 1][..]);
            transform.add_gradient_with_respect_to_weights_mut(&self.inputs[last_index][..]);
        }
        // Hidden layers, walked backwards; same split_at_mut trick as in
        // `output_mut` to hold two gradient buffers at once.
        for i in (0..self.input_gradients.len() - 2).rev() {
            let (left_input_gradient, right_input_gradient) =
                self.input_gradients.split_at_mut(i + 1);
            let transform = &mut self.transforms[i + 1];
            let left_grad = &mut left_input_gradient[i][..];
            let right_grad = &right_input_gradient[0][..];
            let input = &self.inputs[i + 1][..];
            transform.potential_gradient_mut(right_grad);
            transform.gradient_with_respect_to_input_mut(left_grad);
            transform.add_gradient_with_respect_to_weights_mut(input);
        }
        {
            let transform = &mut self.transforms[0];
            transform.potential_gradient_mut(&self.input_gradients[0][..]);
            transform.add_gradient_with_respect_to_weights_mut(&self.inputs[0][..]);
            // Note that we are not computing gradient with respect to input, since this is the
            // first layer and we don't care about changes to input, only to weights.
        }
    }
    // One full forward + backward pass for a single training point.
    fn forward_and_backwards_mut(&mut self, point: &NNPoint) {
        self.inputs[0][1..].copy_from_slice(&point.normalized_image[..]);
        self.output_mut();
        let last_index = self.input_gradients.len() - 1;
        // WARNING: Use proper error function
        // transforms::gradient_error_mut(&self.output[..], desired_output, &mut self.input_gradients[last_index][..]);
        transforms::cross_entropy_derivative_simple(
            &self.output[..],
            point.label,
            &mut self.input_gradients[last_index][..],
        );
        self.update_weights_mut()
    }
    fn iterate_over_batch_mut(&mut self, learning_rate: Float, batch: &[NNPoint]) {
        // iterates over the batch, while updating gradient of weights.
        for point in batch {
            self.forward_and_backwards_mut(point);
        }
        // Update the current weights by the opposite of the epsilon / batch_size *weight_gradients
        let batch_size = batch.len() as Float;
        let epsilon = -learning_rate / batch_size;
        for transform in &mut self.transforms {
            transform
                .weight
                .add_to_self_scaled_mut(epsilon, &transform.weight_gradient);
            // Resets the weight gradient to zero so it can be used in next batch.
            transform.weight_gradient.zero_mut();
        }
    }
    // One epoch: shuffle the training set, then run mini-batch SGD over it.
    fn iterate_over_epoch(
        &mut self,
        training_set: &mut [NNPoint],
        batch_size: usize,
        learning_rate: Float,
    ) {
        use rand::seq::SliceRandom;
        use rand::thread_rng;
        training_set.shuffle(&mut thread_rng()); // Shuffling is linear in the size of the slice
        for batch in training_set.chunks(batch_size) {
            self.iterate_over_batch_mut(learning_rate, batch);
        }
    }
    /// Runs the full training loop, printing accuracy (when a testing set
    /// is supplied), the decayed learning rate and elapsed time per epoch.
    pub fn train(
        &mut self,
        parameters: NeuralNetworkParameters,
        training_set: &mut [NNPoint],
        testing_set: Option<&[NNPoint]>,
    ) {
        // Helper: measure and log accuracy on the held-out set, if any.
        fn test(
            nn: &mut NeuralNetworkInTraining,
            parameters: NeuralNetworkParameters,
            testing_set: Option<&[NNPoint]>,
            accuracy_per_epoch: &mut Vec<f32>,
        ) {
            if let Some(testing_set) = testing_set {
                let accuracy = nn.accuracy(testing_set);
                accuracy_per_epoch.push(accuracy);
                println!();
                println!("{}", parameters.show());
                println!("{:?}", nn.neurons_per_layer);
                println!("{:?}", accuracy_per_epoch);
            }
        }
        // Exponential learning-rate decay: rate shrinks by 10x every 20 epochs.
        fn next_learning_rate(initial_learning_rate: Float, epoch: usize) -> Float {
            // initial_learning_rate / (1.0 + epoch as Float / 30.0)
            // initial_learning_rate * (0.1 as Float).powf(epoch as Float / 50.0)
            initial_learning_rate * (0.1 as Float).powf(epoch as Float / 20.0)
        }
        use std::time::Instant;
        let now = Instant::now();
        let number_of_epochs = parameters.epochs;
        let batch_size = parameters.batch_size;
        let mut learning_rate = parameters.learning_rate;
        let mut accuracy_per_epoch = Vec::with_capacity(number_of_epochs + 1);
        test(self, parameters, testing_set, &mut accuracy_per_epoch);
        for epoch in 0..number_of_epochs {
            println!("Epoch {}/{}", epoch + 1, number_of_epochs);
            println!("Current learning rate = {}", learning_rate);
            self.iterate_over_epoch(training_set, batch_size, learning_rate);
            test(self, parameters, testing_set, &mut accuracy_per_epoch);
            learning_rate = next_learning_rate(parameters.learning_rate, epoch);
            let elapsed = now.elapsed();
            let total_seconds = elapsed.as_secs();
            let minutes = total_seconds / 60;
            let seconds = total_seconds % 60;
            println!("Duration of training: {} min {} sec", minutes, seconds);
        }
    }
    /// Prints the accuracy achieved on `testing_set`, prefixed by a name.
    pub fn show_accuracy_on(&mut self, testing_set: &[NNPoint], dataset_name: &str) {
        println!(
            "{} dataset accuracy: {:?}",
            dataset_name,
            self.accuracy(testing_set)
        );
    }
    /// Argmax over the current output distribution; panics (`todo!`) only
    /// if the output vector is empty.
    pub fn output_label(&self) -> usize {
        let mut state: Option<(usize, Float)> = None;
        for (i, y) in self.output.iter().enumerate() {
            match state {
                Some((_, max_so_far)) => {
                    if *y > max_so_far {
                        state = Some((i, *y))
                    }
                }
                None => state = Some((i, *y)),
            }
        }
        match state {
            Some((label, _)) => label,
            None => {
                todo!()
            }
        }
    }
    /// Fraction of `testing_set` classified correctly. Runs forward passes
    /// only; weights are untouched.
    pub fn accuracy(&mut self, testing_set: &[NNPoint]) -> f32 {
        let mut num_of_correct_classifications = 0;
        for point in testing_set {
            self.inputs[0][1..].copy_from_slice(&point.normalized_image[..]);
            self.output_mut(); // This doesn't change the weights
            let neural_network_produced_label = self.output_label();
            if point.label == neural_network_produced_label {
                num_of_correct_classifications += 1
            }
            // println!("max-label: {}, desired-label: {}, prob-distr: {:?}", neural_network_produced_label, point.label, self.output);
        }
        num_of_correct_classifications as f32 / testing_set.len() as f32
    }
    /// Predicted label for every point of `dataset`, in order.
    pub fn test(&mut self, dataset: &[NNPoint]) -> Vec<usize> {
        dataset
            .iter()
            .map(|point| {
                self.inputs[0][1..].copy_from_slice(&point.normalized_image[..]);
                self.output_mut();
                self.output_label()
            })
            .collect()
    }
    /// Full output probability distribution for every point of `dataset`.
    pub fn test_distributions(&mut self, dataset: &[NNPoint]) -> Vec<Vec<Float>> {
        dataset
            .iter()
            .map(|point| {
                self.inputs[0][1..].copy_from_slice(&point.normalized_image[..]);
                self.output_mut();
                self.output.clone().to_vec()
            })
            .collect()
    }
}

121
src/preprocessing.rs Normal file
View file

@ -0,0 +1,121 @@
use crate::env;
use crate::float::Float;
use crate::neural_network::NNPoint;
use std::fs::File;
use std::io;
use std::io::{BufRead, BufReader, BufWriter, Write};
// Ways a dataset CSV line can be malformed (distinct from real I/O failures).
#[derive(Debug)]
enum ParsingError {
    CouldNotParseLabel,  // label line is not a valid u8
    CouldNotParseVector, // a pixel value is not a valid u8
    ImageHasWrongSize,   // row length != NUMBER_OF_PIXELS_PER_IMAGE
}
/// Reads paired label/vector CSV lines into `NNPoint`s, taking at most
/// `number_of_points` of them.
///
/// The nested result separates two failure modes: the outer `io::Error` is a
/// genuine I/O failure, the inner `ParsingError` means a line was malformed.
fn parse_dataset(
    labels_buffer: impl BufRead,
    vectors_buffer: impl BufRead,
    number_of_points: usize,
) -> Result<Result<Vec<NNPoint>, ParsingError>, io::Error> {
    let mut output = vec![];
    for (label_result, vector_result) in labels_buffer
        .lines()
        .zip(vectors_buffer.lines())
        .take(number_of_points)
    {
        let label_str = label_result?;
        let image_str = vector_result?;
        let label: u8 = match label_str.parse() {
            Ok(label) => label,
            Err(_) => return Ok(Err(ParsingError::CouldNotParseLabel)),
        };
        // Raw pixel bytes; normalized to Floats only once the row is validated.
        let mut image: Vec<u8> = vec![];
        for str in image_str.split(',') {
            match str.parse() {
                Ok(pixel_value) => {
                    image.push(pixel_value);
                }
                Err(_) => return Ok(Err(ParsingError::CouldNotParseVector)),
            }
        }
        if image.len() != env::NUMBER_OF_PIXELS_PER_IMAGE {
            return Ok(Err(ParsingError::ImageHasWrongSize));
        }
        output.push(NNPoint::new(label, normalize_input(&image)));
    }
    Ok(Ok(output))
}
/// Mean of the raw pixel values.
fn average(vec: &[u8]) -> Float {
    // u32 cannot overflow here: the sum is bounded by
    // NUMBER_OF_PIXELS_PER_IMAGE * 255 ~~ 200 k.
    let sum: u32 = vec.iter().map(|&x| x as u32).sum();
    (sum as Float) / (vec.len() as Float)
}
/// Sum of squared components. Assumes the mean of `vec` is already 0, so
/// this is the variance scaled by the vector length.
fn variance(vec: &[Float]) -> Float {
    vec.iter().map(|x| x * x).sum()
}
/// Shifts a raw image to zero mean and divides by the root of the summed
/// squares of the centered pixels.
///
/// Guards against a zero denominator (a perfectly uniform image): in that
/// case the centered, all-zero vector is returned as-is instead of
/// propagating NaNs into the network.
fn normalize_input(vec: &[u8]) -> Vec<Float> {
    let average = average(vec);
    let mut result: Vec<Float> = vec.iter().map(|x| (*x as Float) - average).collect();
    let stddev = variance(&result).sqrt();
    // BUGFIX: dividing by a zero stddev turned every component of a constant
    // image into NaN; skip the division in that degenerate case.
    if stddev > 0.0 {
        for i in &mut result {
            *i /= stddev;
        }
    }
    result
}
/// Opens the `(labels, vectors)` CSV pair and parses at most
/// `number_of_points` points from it.
///
/// # Errors
/// Propagates any underlying I/O error; a malformed line is reported as an
/// `io::Error` of kind `InvalidData` (the original implementation panicked
/// with `todo!()` here, which made a bad data file abort the program with
/// an unhelpful "not yet implemented" message).
pub fn dataset_from_file(
    (labels_file_path, vectors_file_path): (&str, &str),
    number_of_points: usize,
) -> Result<Vec<NNPoint>, io::Error> {
    let (labels_file, vectors_file): (File, File) = (
        File::open(labels_file_path)?,
        File::open(vectors_file_path)?,
    );
    match parse_dataset(
        BufReader::new(labels_file),
        BufReader::new(vectors_file),
        number_of_points,
    )? {
        Ok(points) => Ok(points),
        Err(parsing_error) => Err(io::Error::new(
            io::ErrorKind::InvalidData,
            format!("could not parse dataset: {:?}", parsing_error),
        )),
    }
}
/// Writes one predicted label per line to `file_path`.
///
/// # Errors
/// Returns any I/O error from creating, writing, or flushing the file.
pub fn export_to(outputs: &[usize], file_path: &str) -> Result<(), io::Error> {
    let mut file = BufWriter::new(File::create(file_path)?);
    for x in outputs {
        writeln!(file, "{}", x)?
    }
    // Flush explicitly: BufWriter's Drop flushes too, but silently discards
    // any error, so a failed write could previously go unnoticed.
    file.flush()?;
    Ok(())
}
/// Writes one probability distribution per line, formatted as
/// `[p0, p1, ..., ]` — the trailing ", " before the closing bracket is kept
/// for backward compatibility with existing consumers of these files.
///
/// # Errors
/// Returns any I/O error from creating, writing, or flushing the file.
pub fn export_distributions_to(outputs: &[Vec<Float>], file_path: &str) -> Result<(), io::Error> {
    let mut file = BufWriter::new(File::create(file_path)?);
    for ps in outputs {
        let mut s = String::new();
        for p in ps {
            let p_str = format!("{}, ", p);
            s += &p_str
        }
        writeln!(file, "[{}]", s)?
    }
    // Flush explicitly so write errors surface instead of being swallowed
    // by BufWriter's Drop.
    file.flush()?;
    Ok(())
}

315
src/transforms.rs Normal file
View file

@ -0,0 +1,315 @@
use crate::float::Float;
use crate::linear_algebra::{ColumnEfficientMatrix, Vector};
use rand_distr::Normal;
/// Half the squared Euclidean distance between `xs` and `ys`.
pub fn l2_error(xs: &[Float], ys: &[Float]) -> Float {
    let sum_of_squares: Float = xs
        .iter()
        .zip(ys)
        .map(|(x, y)| {
            let d = x - y;
            d * d
        })
        .sum();
    0.5 * sum_of_squares
}
/// Gradient of `l2_error` with respect to `xs`: `out = xs - ys`.
pub fn gradient_l2_error_mut(xs: &[Float], ys: &[Float], out: &mut [Float]) {
    for (i, d) in xs.iter().zip(ys).map(|(x, y)| x - y).enumerate() {
        out[i] = d;
    }
}
/// Logistic function 1 / (1 + e^{-x}).
fn sigmoid(x: Float) -> Float {
    1.0 / (1.0 + (-x).exp())
}
/// Activation used for sigmoid neurons (alias for `sigmoid`).
fn activation_of_sigmoid_potential(potential: Float) -> Float {
    sigmoid(potential)
}
/// Applies the sigmoid activation componentwise: `out = sigmoid(potentials)`.
fn vectorized_sigmoid_activation_of_potential(potentials: &[Float], out: &mut [Float]) {
    for (potential, y) in potentials.iter().zip(out) {
        *y = activation_of_sigmoid_potential(*potential)
    }
}
/// sigma'(x) = sigma(x) * (1 - sigma(x)).
fn derivative_of_sigmoid_activation_of_potential(potential: Float) -> Float {
    let s = sigmoid(potential);
    s * (1.0 - s)
}
/// Chain rule through the sigmoid: fills `derivatives_state` with
/// sigma'(potential), then computes
/// `input_gradient = diag(derivatives_state) * output_gradient`.
fn vectorized_gradient_of_sigmoid_activation_of_potential_mut(
    potentials: &[Float],
    derivatives_state: &mut [Float],
    output_gradient: &[Float],
    input_gradient: &mut [Float],
) {
    use crate::linear_algebra::DiagonalMatrix;
    for (potential, state) in potentials.iter().zip(derivatives_state.iter_mut()) {
        *state = derivative_of_sigmoid_activation_of_potential(*potential)
    }
    DiagonalMatrix::new(derivatives_state).apply_mut(output_gradient, input_gradient);
}
// =====cross-entropy=====
/// Cross-entropy of two probability distributions: -sum(q_i * ln(p_i)).
fn cross_entropy(p: &[Float], q: &[Float]) -> Float {
    let weighted_log_sum: Float = p.iter().zip(q).map(|(x, y)| y * x.ln()).sum();
    -weighted_log_sum
}
/// Cross-entropy against a deterministic second distribution, i.e. one that
/// puts all its mass on index `q`.
fn cross_entropy_simple(p: &[Float], q: usize) -> Float {
    -p[q].ln()
}
/// Gradient of `cross_entropy` with respect to `p`: `out_i = -q_i / p_i`.
fn cross_entropy_derivative_mut(p: &[Float], q: &[Float], out: &mut [Float]) {
    for ((c, a), b) in out.iter_mut().zip(p).zip(q) {
        *c = -b / a
    }
}
/// Gradient of `cross_entropy_simple` with respect to `p`: zero everywhere
/// except `-1 / p[q]` at index `q`.
pub fn cross_entropy_derivative_simple(p: &[Float], q: usize, out: &mut [Float]) {
    // `out` is a reused scratch buffer, so every component must be reset —
    // stale values from the previous call would otherwise leak through.
    out.fill(0.0);
    out[q] = -1.0 / p[q];
}
fn softmax_mut(input: &[Float], out: &mut [Float]) {
let mut s = 0.0;
for (x, y) in input.iter().zip(out.iter_mut()) {
let e = x.exp();
*y = e;
s += e
}
for y in out {
*y /= s
}
}
/// Backpropagation through softmax using the full Jacobian:
/// d softmax_i / d x_j = softmax_i * (delta_ij - softmax_j).
fn softmax_gradient_mut(
    softmax_output: &[Float],
    gradient_output: &[Float],
    gradient_input: &mut [Float],
) {
    for (j, dx) in gradient_input.iter_mut().enumerate() {
        *dx = 0.0;
        for (i, dy) in gradient_output.iter().enumerate() {
            // Note that the gradient matrix is symmetric, so don't worry about the order of
            // indices
            *dx += softmax_output[i] * (if i == j { 1.0 } else { 0.0 } - softmax_output[j]) * dy
        }
    }
}
// relu
/// Rectified linear unit, componentwise: `out[i] = max(input[i], 0)`.
fn relu_mut(input: &[Float], out: &mut [Float]) {
    for (y, x) in out.iter_mut().zip(input) {
        *y = x.max(0.0)
    }
}
/// Backward pass of ReLU: the output gradient passes through wherever the
/// forward input was strictly positive and is zeroed everywhere else.
fn relu_gradient_mut(input: &[Float], gradient_output: &[Float], gradient_input: &mut [Float]) {
    for ((dx, dy), x) in gradient_input.iter_mut().zip(gradient_output).zip(input) {
        *dx = if *x > 0.0 { *dy } else { 0.0 }
    }
}
// =====sigmoid=====
// A fully-connected layer with sigmoid activation. Scratch buffers are
// preallocated so forward and backward passes never allocate.
#[derive(Debug)]
pub struct SigmoidTransform {
    pub weight: ColumnEfficientMatrix, // first column acts as the bias
    potential_vector: Vector,          // W * input, before the activation
    derivatives_state: Vector,         // sigma'(potential), backprop scratch
    potential_gradient: Vector,        // error gradient w.r.t. the potential
    pub weight_gradient: ColumnEfficientMatrix, // accumulated over a batch
}
impl SigmoidTransform {
    /// Allocates the layer with weights drawn from N(0, 1/input_dimension)
    /// (Xavier-style scaling: std dev = 1 / sqrt(fan-in)).
    pub fn new(input_dimension: usize, output_dimension: usize) -> Self {
        let mean = 0.0;
        let std_dev = 1.0 / (input_dimension as Float).sqrt();
        let normal_distr = Normal::new(mean, std_dev).unwrap();
        Self {
            weight: ColumnEfficientMatrix::random_with_normal_distribution(
                input_dimension,
                output_dimension,
                normal_distr,
            ),
            potential_vector: Vector::zero(output_dimension),
            derivatives_state: Vector::zero(output_dimension), // TODO: Can I get rid of this?
            potential_gradient: Vector::zero(output_dimension),
            weight_gradient: ColumnEfficientMatrix::zero(input_dimension, output_dimension),
        }
    }
    /// Forward pass: `output = sigmoid(W * input)`. The potential is cached
    /// for the backward pass.
    pub fn output_mut(&mut self, input: &[Float], output: &mut [Float]) {
        self.weight.apply_mut(input, &mut self.potential_vector[..]); // potential = W[input]
        vectorized_sigmoid_activation_of_potential(&self.potential_vector[..], output);
        // y = f(potential)
    }
    // Note below that (1) and (2) are independent, but they both depend on (0).
    // (0)
    pub fn potential_gradient_mut(&mut self, output_gradient: &[Float]) {
        // updates the potential gradient
        vectorized_gradient_of_sigmoid_activation_of_potential_mut(
            &self.potential_vector[..],
            &mut self.derivatives_state[..],
            output_gradient,
            &mut self.potential_gradient[..],
        ); // potential_gradient = grad[f](potential)[output_gradient]
    }
    // Note that it makes sense to have the two gradients split, since for the input layer we will
    // not need to compute the input gradient, only the weight gradient is important.
    // WARNING: You need to call `potential_gradient_mut` before using the below function
    // (1)
    pub fn gradient_with_respect_to_input_mut(&self, input_gradient: &mut [Float]) {
        // updates the input gradient
        //
        // Note that the first column of `self.weight` is the bias,
        // and the previous layer doesn't care about its gradient.
        // So we just return the gradient below the first component of the input
        // by dropping the bias column.
        self.weight
            .drop_first_column_coapply_mut(&self.potential_gradient[..], input_gradient);
        // transpose[T without the first column][potential_gradient]
    }
    // Note that the proof ensures that the `potential_gradient` has been updated
    // WARNING: You need to call `potential_gradient_mut` before using the below function
    // (2)
    pub fn add_gradient_with_respect_to_weights_mut(&mut self, input: &[Float]) {
        use crate::linear_algebra::VectorTensorCovectorMatrix;
        let matrix = VectorTensorCovectorMatrix::new(&self.potential_gradient[..], input); // grad[f](potential)[output_grad] **tensor** input
        matrix.add_to_mut(&mut self.weight_gradient);
    }
}
// =====softmax=====
/// Fully connected layer with a softmax activation.
///
/// All buffers are preallocated so the forward and backward passes
/// allocate nothing per sample.
#[derive(Debug)]
pub struct SoftmaxTransform {
    /// Weight matrix `W`; its first column is dropped when propagating
    /// the gradient back to the input (bias column — see
    /// `gradient_with_respect_to_input_mut`).
    pub weight: ColumnEfficientMatrix,
    /// Scratch buffer holding the pre-activation `W[input]`.
    potential_vector: Vector,
    softmax_output: Vector, // Used for computation of the softmax gradient
    /// Gradient of the loss with respect to the pre-activation.
    potential_gradient: Vector,
    /// Accumulated gradient of the loss with respect to `weight`.
    pub weight_gradient: ColumnEfficientMatrix,
}
impl SoftmaxTransform {
    /// Creates a softmax layer with weights sampled from a normal
    /// distribution with mean 0 and standard deviation 1/sqrt(input_dimension),
    /// and zero-initialized scratch and gradient buffers.
    pub fn new(input_dimension: usize, output_dimension: usize) -> Self {
        let std_dev = 1.0 / (input_dimension as Float).sqrt();
        let distribution = Normal::new(0.0, std_dev).unwrap();
        let weight = ColumnEfficientMatrix::random_with_normal_distribution(
            input_dimension,
            output_dimension,
            distribution,
        );
        Self {
            weight,
            potential_vector: Vector::zero(output_dimension),
            softmax_output: Vector::zero(output_dimension),
            potential_gradient: Vector::zero(output_dimension),
            weight_gradient: ColumnEfficientMatrix::zero(input_dimension, output_dimension),
        }
    }
    /// Forward pass: `output = softmax(W[input])`. A copy of the output is
    /// kept in `self.softmax_output`, since the softmax gradient is computed
    /// from the outputs rather than from the potentials.
    pub fn output_mut(&mut self, input: &[Float], output: &mut [Float]) {
        // potential = W[input]
        self.weight.apply_mut(input, &mut self.potential_vector[..]);
        // output = softmax(potential)
        softmax_mut(&self.potential_vector[..], output);
        self.softmax_output.copy_from_slice(output);
    }
    /// Pulls `output_gradient` back through the softmax:
    /// `potential_gradient = grad[softmax](potential)[output_gradient]`.
    ///
    /// Must be called before either of the two gradient methods below,
    /// which both read `self.potential_gradient`.
    pub fn potential_gradient_mut(&mut self, output_gradient: &[Float]) {
        softmax_gradient_mut(
            &self.softmax_output[..],
            output_gradient,
            &mut self.potential_gradient[..],
        );
    }
    /// Writes the gradient with respect to the layer input, dropping the
    /// first (bias) column of the weight matrix since the previous layer
    /// does not need its gradient.
    ///
    /// WARNING: call `potential_gradient_mut` before this method.
    pub fn gradient_with_respect_to_input_mut(&self, input_gradient: &mut [Float]) {
        // transpose[W without the first column][potential_gradient]
        self.weight
            .drop_first_column_coapply_mut(&self.potential_gradient[..], input_gradient);
    }
    /// Accumulates the outer product `potential_gradient (tensor) input`
    /// into `self.weight_gradient`.
    ///
    /// WARNING: call `potential_gradient_mut` before this method.
    pub fn add_gradient_with_respect_to_weights_mut(&mut self, input: &[Float]) {
        use crate::linear_algebra::VectorTensorCovectorMatrix;
        let outer_product = VectorTensorCovectorMatrix::new(&self.potential_gradient[..], input);
        outer_product.add_to_mut(&mut self.weight_gradient);
    }
}
/// Fully connected layer with a ReLU activation.
///
/// All buffers are preallocated so the forward and backward passes
/// allocate nothing per sample.
#[derive(Debug)]
pub struct ReluTransform {
    /// Weight matrix `W`; its first column is dropped when propagating
    /// the gradient back to the input (bias column — see
    /// `gradient_with_respect_to_input_mut`).
    pub weight: ColumnEfficientMatrix,
    /// Scratch buffer holding the pre-activation `W[input]`.
    potential_vector: Vector,
    /// Gradient of the loss with respect to the pre-activation.
    potential_gradient: Vector,
    /// Accumulated gradient of the loss with respect to `weight`.
    pub weight_gradient: ColumnEfficientMatrix,
}
impl ReluTransform {
    /// Creates a ReLU layer with weights sampled from a normal
    /// distribution with mean 0 and standard deviation 1/sqrt(input_dimension),
    /// and zero-initialized scratch and gradient buffers.
    pub fn new(input_dimension: usize, output_dimension: usize) -> Self {
        let std_dev = 1.0 / (input_dimension as Float).sqrt();
        let distribution = Normal::new(0.0, std_dev).unwrap();
        let weight = ColumnEfficientMatrix::random_with_normal_distribution(
            input_dimension,
            output_dimension,
            distribution,
        );
        Self {
            weight,
            potential_vector: Vector::zero(output_dimension),
            potential_gradient: Vector::zero(output_dimension),
            weight_gradient: ColumnEfficientMatrix::zero(input_dimension, output_dimension),
        }
    }
    /// Forward pass: `output = relu(W[input])`. The pre-activation is kept
    /// in `self.potential_vector` for the backward pass.
    pub fn output_mut(&mut self, input: &[Float], output: &mut [Float]) {
        // potential = W[input]
        self.weight.apply_mut(input, &mut self.potential_vector[..]);
        // output = relu(potential)
        relu_mut(&self.potential_vector[..], output);
    }
    /// Pulls `output_gradient` back through the ReLU activation:
    /// `potential_gradient = grad[relu](potential)[output_gradient]`.
    ///
    /// Must be called before either of the two gradient methods below,
    /// which both read `self.potential_gradient`.
    pub fn potential_gradient_mut(&mut self, output_gradient: &[Float]) {
        relu_gradient_mut(
            &self.potential_vector[..],
            output_gradient,
            &mut self.potential_gradient[..],
        );
    }
    /// Writes the gradient with respect to the layer input, dropping the
    /// first (bias) column of the weight matrix since the previous layer
    /// does not need its gradient.
    ///
    /// WARNING: call `potential_gradient_mut` before this method.
    pub fn gradient_with_respect_to_input_mut(&self, input_gradient: &mut [Float]) {
        // transpose[W without the first column][potential_gradient]
        self.weight
            .drop_first_column_coapply_mut(&self.potential_gradient[..], input_gradient);
    }
    /// Accumulates the outer product `potential_gradient (tensor) input`
    /// into `self.weight_gradient`.
    ///
    /// WARNING: call `potential_gradient_mut` before this method.
    pub fn add_gradient_with_respect_to_weights_mut(&mut self, input: &[Float]) {
        use crate::linear_algebra::VectorTensorCovectorMatrix;
        let outer_product = VectorTensorCovectorMatrix::new(&self.potential_gradient[..], input);
        outer_product.add_to_mut(&mut self.weight_gradient);
    }
}

# ===== tmp_repl.txt (23 lines) — frequently used commands =====
cargo init --edition 2018
cargo add rand
cargo add rand_distr
cargo run --release
module add rust
cargo build
cargo fmt
# linter
cargo clippy -- -D warnings
python3 evaluator/evaluate.py test_predictions.csv data/fashion_mnist_test_labels.csv
python3 evaluator/evaluate.py train_predictions.csv data/fashion_mnist_train_labels.csv