Initial commit
This commit is contained in:
commit
fa8eae3146
17 changed files with 141328 additions and 0 deletions
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
|
|
@ -0,0 +1 @@
|
|||
target/
|
||||
108
Cargo.lock
generated
Normal file
108
Cargo.lock
generated
Normal file
|
|
@ -0,0 +1,108 @@
|
|||
# This file is automatically @generated by Cargo.
|
||||
# It is not intended for manual editing.
|
||||
version = 3
|
||||
|
||||
[[package]]
|
||||
name = "autocfg"
|
||||
version = "1.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
|
||||
|
||||
[[package]]
|
||||
name = "cfg-if"
|
||||
version = "1.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
|
||||
|
||||
[[package]]
|
||||
name = "getrandom"
|
||||
version = "0.2.11"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fe9006bed769170c11f845cf00c7c1e9092aeb3f268e007c3e760ac68008070f"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"libc",
|
||||
"wasi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "libc"
|
||||
version = "0.2.150"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "89d92a4743f9a61002fae18374ed11e7973f530cb3a3255fb354818118b2203c"
|
||||
|
||||
[[package]]
|
||||
name = "libm"
|
||||
version = "0.2.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058"
|
||||
|
||||
[[package]]
|
||||
name = "num-traits"
|
||||
version = "0.2.17"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "39e3200413f237f41ab11ad6d161bc7239c84dcb631773ccd7de3dfe4b5c267c"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
"libm",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ppv-lite86"
|
||||
version = "0.2.17"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"
|
||||
|
||||
[[package]]
|
||||
name = "pv021_project"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"rand",
|
||||
"rand_distr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand"
|
||||
version = "0.8.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"rand_chacha",
|
||||
"rand_core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_chacha"
|
||||
version = "0.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
|
||||
dependencies = [
|
||||
"ppv-lite86",
|
||||
"rand_core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_core"
|
||||
version = "0.6.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
|
||||
dependencies = [
|
||||
"getrandom",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_distr"
|
||||
version = "0.4.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "32cb0b9bc82b0a0876c2dd994a7e7a2683d3e7390ca40e6886785ef0c7e3ee31"
|
||||
dependencies = [
|
||||
"num-traits",
|
||||
"rand",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wasi"
|
||||
version = "0.11.0+wasi-snapshot-preview1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
|
||||
10
Cargo.toml
Normal file
10
Cargo.toml
Normal file
|
|
@ -0,0 +1,10 @@
|
|||
[package]
|
||||
name = "pv021_project"
|
||||
version = "0.1.0"
|
||||
edition = "2018"
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
rand = "0.8.5"
|
||||
rand_distr = "0.4.3"
|
||||
BIN
OVERVIEW.png
Normal file
BIN
OVERVIEW.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 649 KiB |
16
README.md
Normal file
16
README.md
Normal file
|
|
@ -0,0 +1,16 @@
|
|||
Rust Deep Learning
|
||||
|
||||
School Project I did for a Neural Networks Class at the end of 2023:
|
||||
- Implements a neural net (backpropagation / multilayer perceptron) in Rust
|
||||
- Constraint: Can't use any Linear Algebra libraries or frameworks
|
||||
- Training/Dataset: Fashion-MNIST dataset (achieves about 91% accuracy in less than 10 min of training)
|
||||
|
||||
See `OVERVIEW.png` for the underlying math I came up with to do backprop and organize memory.
|
||||
|
||||
# Dataset
|
||||
Fashion MNIST (https://arxiv.org/pdf/1708.07747.pdf). Dataset of images ‒ consisting of a training set of 60,000 examples and a test set of 10,000 examples. Each example is a 28x28 grayscale image, associated with a label from 10 classes. The dataset is in CSV format:
|
||||
- `fashion_mnist_train_vectors.csv` - training input vectors
|
||||
- `fashion_mnist_test_vectors.csv` - testing input vectors
|
||||
- `fashion_mnist_train_labels.csv` - training labels
|
||||
- `fashion_mnist_test_labels.csv` - testing labels
|
||||
|
||||
10000
data/fashion_mnist_test_labels.csv
Normal file
10000
data/fashion_mnist_test_labels.csv
Normal file
File diff suppressed because it is too large
Load diff
10000
data/fashion_mnist_test_vectors.csv
Normal file
10000
data/fashion_mnist_test_vectors.csv
Normal file
File diff suppressed because it is too large
Load diff
60000
data/fashion_mnist_train_labels.csv
Normal file
60000
data/fashion_mnist_train_labels.csv
Normal file
File diff suppressed because it is too large
Load diff
60000
data/fashion_mnist_train_vectors.csv
Normal file
60000
data/fashion_mnist_train_vectors.csv
Normal file
File diff suppressed because it is too large
Load diff
16
src/env.rs
Normal file
16
src/env.rs
Normal file
|
|
@ -0,0 +1,16 @@
|
|||
pub const NUMBER_OF_PIXELS_PER_IMAGE: usize = 28 * 28;
|
||||
|
||||
pub const TEST_CSVS: (&str, &str) = (
|
||||
"data/fashion_mnist_test_labels.csv",
|
||||
"data/fashion_mnist_test_vectors.csv",
|
||||
);
|
||||
pub const TRAIN_CSVS: (&str, &str) = (
|
||||
"data/fashion_mnist_train_labels.csv",
|
||||
"data/fashion_mnist_train_vectors.csv",
|
||||
);
|
||||
|
||||
pub const TEST_PREDICTIONS_CSV: &str = "test_predictions.csv";
|
||||
pub const TRAIN_PREDICTIONS_CSV: &str = "train_predictions.csv";
|
||||
|
||||
pub const TEST_DISTRIBUTIONS_CSV: &str = "test_distributions.csv";
|
||||
pub const TRAIN_DISTRIBUTIONS_CSV: &str = "train_distributions.csv";
|
||||
1
src/float.rs
Normal file
1
src/float.rs
Normal file
|
|
@ -0,0 +1 @@
|
|||
pub type Float = f32; // f64 or f32, doesn't seem to make any difference
|
||||
305
src/linear_algebra.rs
Normal file
305
src/linear_algebra.rs
Normal file
|
|
@ -0,0 +1,305 @@
|
|||
mod raw_operation {
|
||||
use crate::float::Float;
|
||||
|
||||
pub fn zero(out: &mut [Float]) {
|
||||
for y in out {
|
||||
*y = 0.0
|
||||
}
|
||||
}
|
||||
|
||||
pub fn inner_product(xs: &[Float], ys: &[Float]) -> Float {
|
||||
let mut result: Float = 0.0;
|
||||
for (x, y) in xs.iter().zip(ys) {
|
||||
result += x * y;
|
||||
}
|
||||
result
|
||||
}
|
||||
|
||||
pub fn scale(xs: &[Float], k: Float, out: &mut [Float]) {
|
||||
for (x, y) in xs.iter().zip(out) {
|
||||
*y = *x * k
|
||||
}
|
||||
}
|
||||
|
||||
pub fn add(xs: &[Float], ys: &[Float], out: &mut [Float]) {
|
||||
for ((x, y), z) in xs.iter().zip(ys).zip(out) {
|
||||
*z = *x + *y
|
||||
}
|
||||
}
|
||||
|
||||
pub fn sub(xs: &[Float], ys: &[Float], out: &mut [Float]) {
|
||||
for ((x, y), z) in xs.iter().zip(ys).zip(out) {
|
||||
*z = *x - *y
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
use crate::float::Float;
|
||||
use rand_distr::{Distribution, Normal};
|
||||
use std::iter::FromIterator;
|
||||
use std::ops::{Index, IndexMut};
|
||||
use std::slice::{Iter, IterMut, SliceIndex};
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct Vector(Vec<Float>);
|
||||
|
||||
impl<Idx> Index<Idx> for Vector
|
||||
where
|
||||
Idx: SliceIndex<[Float]>,
|
||||
{
|
||||
type Output = Idx::Output;
|
||||
|
||||
fn index(&self, index: Idx) -> &Self::Output {
|
||||
&self.0[index]
|
||||
}
|
||||
}
|
||||
|
||||
impl<Idx> IndexMut<Idx> for Vector
|
||||
where
|
||||
Idx: SliceIndex<[Float]>,
|
||||
{
|
||||
fn index_mut(&mut self, index: Idx) -> &mut Self::Output {
|
||||
&mut self.0[index]
|
||||
}
|
||||
}
|
||||
|
||||
impl FromIterator<Float> for Vector {
|
||||
fn from_iter<I: IntoIterator<Item = Float>>(iter: I) -> Self {
|
||||
let mut v = vec![];
|
||||
for x in iter {
|
||||
v.push(x)
|
||||
}
|
||||
Vector(v)
|
||||
}
|
||||
}
|
||||
|
||||
impl Vector {
|
||||
pub fn iter(&self) -> Iter<'_, Float> {
|
||||
self.0.iter()
|
||||
}
|
||||
|
||||
pub fn iter_mut(&mut self) -> IterMut<'_, Float> {
|
||||
self.0.iter_mut()
|
||||
}
|
||||
|
||||
pub fn as_slice(&self) -> &[Float] {
|
||||
self.0.as_slice()
|
||||
}
|
||||
|
||||
pub fn as_mut_slice(&mut self) -> &mut [Float] {
|
||||
self.0.as_mut_slice()
|
||||
}
|
||||
|
||||
pub fn copy_from_slice(&mut self, src: &[Float]) {
|
||||
self.0.copy_from_slice(src)
|
||||
}
|
||||
|
||||
pub fn to_vec(self) -> Vec<Float> {
|
||||
self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl Vector {
|
||||
pub fn new(vector: Vec<Float>) -> Self {
|
||||
Self(vector)
|
||||
}
|
||||
|
||||
pub fn zero(size: usize) -> Self {
|
||||
Self(vec![0.0; size])
|
||||
}
|
||||
|
||||
pub fn add_mut(&self, w: &[Float], out: &mut [Float]) {
|
||||
raw_operation::add(&self[..], w, out)
|
||||
}
|
||||
|
||||
pub fn sub_mut(&self, w: &[Float], out: &mut [Float]) {
|
||||
raw_operation::add(&self[..], w, out)
|
||||
}
|
||||
|
||||
pub fn scale_mut(&self, k: Float, out: &mut [Float]) {
|
||||
raw_operation::scale(&self[..], k, out)
|
||||
}
|
||||
|
||||
pub fn inner_product(&self, w: &[Float]) -> Float {
|
||||
raw_operation::inner_product(&self[..], w)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct ColumnEfficientMatrix {
|
||||
pub input_dimension: usize,
|
||||
pub output_dimension: usize,
|
||||
pub components: Vec<Float>,
|
||||
}
|
||||
|
||||
impl Index<(usize, usize)> for ColumnEfficientMatrix {
|
||||
type Output = Float;
|
||||
|
||||
fn index(&self, (column, row): (usize, usize)) -> &Self::Output {
|
||||
&self.components[column * self.output_dimension + row]
|
||||
}
|
||||
}
|
||||
|
||||
impl IndexMut<(usize, usize)> for ColumnEfficientMatrix {
|
||||
fn index_mut(&mut self, (column, row): (usize, usize)) -> &mut Self::Output {
|
||||
&mut self.components[column * self.output_dimension + row]
|
||||
}
|
||||
}
|
||||
|
||||
impl ColumnEfficientMatrix {
|
||||
pub fn from_rows(rows: Vec<Vec<Float>>) -> Self {
|
||||
let output_dimension = rows.len();
|
||||
if output_dimension == 0 {
|
||||
Self {
|
||||
input_dimension: 0,
|
||||
output_dimension: 0,
|
||||
components: vec![],
|
||||
}
|
||||
} else {
|
||||
let input_dimension = rows[0].len();
|
||||
let mut components = Vec::with_capacity(input_dimension * output_dimension);
|
||||
for i in 0..input_dimension {
|
||||
for j in 0..output_dimension {
|
||||
components.push(rows[j][i])
|
||||
}
|
||||
}
|
||||
Self {
|
||||
input_dimension,
|
||||
output_dimension,
|
||||
components,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn zero(input_dimension: usize, output_dimension: usize) -> ColumnEfficientMatrix {
|
||||
Self {
|
||||
input_dimension,
|
||||
output_dimension,
|
||||
components: vec![0.0; input_dimension * output_dimension],
|
||||
}
|
||||
}
|
||||
|
||||
pub fn random_with_normal_distribution(
|
||||
input_dimension: usize,
|
||||
output_dimension: usize,
|
||||
distribution: Normal<Float>,
|
||||
) -> ColumnEfficientMatrix {
|
||||
let mut components = Vec::with_capacity(input_dimension * output_dimension);
|
||||
for _ in 0..input_dimension {
|
||||
for _ in 0..output_dimension {
|
||||
components.push(distribution.sample(&mut rand::thread_rng()))
|
||||
}
|
||||
}
|
||||
|
||||
Self {
|
||||
input_dimension,
|
||||
output_dimension,
|
||||
components,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn zero_mut(&mut self) {
|
||||
for a in &mut self.components {
|
||||
*a = 0.0
|
||||
}
|
||||
}
|
||||
|
||||
pub fn add_scaled_mut(&self, k: Float, b: &Self, c: &mut Self) {
|
||||
for ((a, b), c) in self
|
||||
.components
|
||||
.iter()
|
||||
.zip(&b.components)
|
||||
.zip(&mut c.components)
|
||||
{
|
||||
*c = *a + *b * k
|
||||
}
|
||||
}
|
||||
|
||||
pub fn add_to_self_scaled_mut(&mut self, k: Float, b: &Self) {
|
||||
for (a, b) in self.components.iter_mut().zip(&b.components) {
|
||||
*a += *b * k
|
||||
}
|
||||
}
|
||||
|
||||
pub fn apply_mut(&self, v: &[Float], out: &mut [Float]) {
|
||||
for j in 0..self.output_dimension {
|
||||
let mut result: Float = 0.0;
|
||||
for i in 0..self.input_dimension {
|
||||
result += self[(i, j)] * v[i]
|
||||
}
|
||||
out[j] = result;
|
||||
}
|
||||
}
|
||||
|
||||
// Apply the transpose
|
||||
pub fn coapply_mut(&self, w: &[Float], out: &mut [Float]) {
|
||||
for i in 0..self.input_dimension {
|
||||
let start_index = i * self.output_dimension;
|
||||
out[i] = raw_operation::inner_product(
|
||||
&self.components[start_index..start_index + self.output_dimension],
|
||||
w,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn drop_first_column_coapply_mut(&self, w: &[Float], out: &mut [Float]) {
|
||||
for i in 1..self.input_dimension {
|
||||
let start_index = i * self.output_dimension;
|
||||
out[i - 1] = raw_operation::inner_product(
|
||||
&self.components[start_index..start_index + self.output_dimension],
|
||||
w,
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct DiagonalMatrix<'a> {
|
||||
pub diagonal: &'a [Float],
|
||||
}
|
||||
|
||||
impl<'a> DiagonalMatrix<'a> {
|
||||
pub fn new(diagonal: &'a [Float]) -> DiagonalMatrix<'a> {
|
||||
Self { diagonal }
|
||||
}
|
||||
|
||||
pub fn apply_mut(&self, v: &[Float], out: &mut [Float]) {
|
||||
for (i, (d, x)) in self.diagonal.iter().zip(v.iter()).enumerate() {
|
||||
out[i] = d * x;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Given vector `w : W` and a covector `f : V -> Float`,
|
||||
// the linear map `w tensor f : V -> W` computes `v : V ~> f(v) * w`
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct VectorTensorCovectorMatrix<'a> {
|
||||
pub output_vector: &'a [Float], // w : W
|
||||
pub input_covector: &'a [Float], // f : V -> Float
|
||||
}
|
||||
|
||||
impl<'a> VectorTensorCovectorMatrix<'a> {
|
||||
pub fn new(
|
||||
output_vector: &'a [Float],
|
||||
input_covector: &'a [Float],
|
||||
) -> VectorTensorCovectorMatrix<'a> {
|
||||
Self {
|
||||
output_vector,
|
||||
input_covector,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn apply_mut(&self, v: &[Float], out: &mut [Float]) {
|
||||
let scalar = raw_operation::inner_product(self.input_covector, v);
|
||||
raw_operation::scale(self.output_vector, scalar, out)
|
||||
}
|
||||
|
||||
pub fn add_to_mut(&self, matrix: &mut ColumnEfficientMatrix) {
|
||||
// TODO: Surely this can be optimized by iterating over the columns of the matrix directly
|
||||
for (i, x) in self.input_covector.iter().enumerate() {
|
||||
for (j, y) in self.output_vector.iter().enumerate() {
|
||||
matrix[(i, j)] += x * y
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
76
src/main.rs
Normal file
76
src/main.rs
Normal file
|
|
@ -0,0 +1,76 @@
|
|||
mod env;
|
||||
mod float;
|
||||
mod linear_algebra;
|
||||
mod neural_network;
|
||||
mod preprocessing;
|
||||
mod transforms;
|
||||
|
||||
use preprocessing::{dataset_from_file, export_distributions_to, export_to};
|
||||
use std::io;
|
||||
|
||||
use crate::neural_network::{NNPoint, NeuralNetworkInTraining, NeuralNetworkParameters};
|
||||
|
||||
fn main() -> Result<(), io::Error> {
|
||||
let size_of_full_training_dataset = 60000;
|
||||
let size_of_training_dataset = 50000;
|
||||
let size_of_validation_dataset = 10000;
|
||||
let size_of_testing_dataset = 10000;
|
||||
|
||||
println!("Importing datasets...");
|
||||
let full_training_dataset: Vec<NNPoint> =
|
||||
dataset_from_file(env::TRAIN_CSVS, size_of_full_training_dataset)?;
|
||||
let mut training_dataset: Vec<NNPoint> = {
|
||||
let mut training_dataset = Vec::with_capacity(size_of_training_dataset);
|
||||
training_dataset.extend_from_slice(&full_training_dataset[0..size_of_training_dataset]);
|
||||
training_dataset
|
||||
};
|
||||
let validation_dataset: Vec<NNPoint> = {
|
||||
let mut validation_dataset = Vec::with_capacity(size_of_validation_dataset);
|
||||
validation_dataset.extend_from_slice(
|
||||
&full_training_dataset
|
||||
[size_of_training_dataset..size_of_training_dataset + size_of_validation_dataset],
|
||||
);
|
||||
validation_dataset
|
||||
};
|
||||
let test_dataset: Vec<NNPoint> = dataset_from_file(env::TEST_CSVS, size_of_testing_dataset)?;
|
||||
|
||||
// let mut nn = NeuralNetworkInTraining::new(vec![env::NUMBER_OF_PIXELS_PER_IMAGE, 70, 10]);
|
||||
// let mut nn = NeuralNetworkInTraining::new(vec![env::NUMBER_OF_PIXELS_PER_IMAGE, 150, 10]);
|
||||
// let mut nn = NeuralNetworkInTraining::new(vec![env::NUMBER_OF_PIXELS_PER_IMAGE, 60, 40, 10]);
|
||||
// let mut nn = NeuralNetworkInTraining::new(vec![env::NUMBER_OF_PIXELS_PER_IMAGE, 150, 10]);
|
||||
// let mut nn = NeuralNetworkInTraining::new(vec![env::NUMBER_OF_PIXELS_PER_IMAGE, 60, 40, 10]); // batch=20, rate=2 is pretty good. Seems to reliably reach 89%
|
||||
// let mut nn = NeuralNetworkInTraining::new(vec![env::NUMBER_OF_PIXELS_PER_IMAGE, 80, 60, 40, 10]); // batch=20, rate=2, pretty good like 88 % then suddenly drops to 10%
|
||||
// let mut nn = NeuralNetworkInTraining::new(vec![env::NUMBER_OF_PIXELS_PER_IMAGE, 150, 150, 10]); // this gets me over 90%, nice
|
||||
let mut nn = NeuralNetworkInTraining::new(vec![env::NUMBER_OF_PIXELS_PER_IMAGE, 150, 150, 10]);
|
||||
|
||||
println!("Begin training");
|
||||
|
||||
let params = NeuralNetworkParameters {
|
||||
epochs: 30,
|
||||
batch_size: 30,
|
||||
learning_rate: 2.00,
|
||||
};
|
||||
nn.train(params, &mut training_dataset, Some(&validation_dataset));
|
||||
// nn.train(params, &mut training_dataset, None);
|
||||
|
||||
nn.show_accuracy_on(&full_training_dataset, "training");
|
||||
nn.show_accuracy_on(&test_dataset, "test");
|
||||
|
||||
let predictions_on_full_training_set = nn.test(&full_training_dataset);
|
||||
let predictions_on_test_set = nn.test(&test_dataset);
|
||||
println!("Exporting to {}", env::TRAIN_PREDICTIONS_CSV);
|
||||
export_to(
|
||||
&predictions_on_full_training_set,
|
||||
env::TRAIN_PREDICTIONS_CSV,
|
||||
)?;
|
||||
println!("Exporting to {}", env::TEST_PREDICTIONS_CSV);
|
||||
export_to(&predictions_on_test_set, env::TEST_PREDICTIONS_CSV)?;
|
||||
|
||||
// TODO: Comment this out
|
||||
// let distributions_on_full_training_set = nn.test_distributions(&full_training_dataset);
|
||||
// let distributions_on_test_set = nn.test_distributions(&test_dataset);
|
||||
// export_distributions_to(&distributions_on_full_training_set, env::TRAIN_DISTRIBUTIONS_CSV)?;
|
||||
// export_distributions_to(&distributions_on_test_set, env::TEST_DISTRIBUTIONS_CSV)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
336
src/neural_network.rs
Normal file
336
src/neural_network.rs
Normal file
|
|
@ -0,0 +1,336 @@
|
|||
use crate::float::Float;
|
||||
use crate::linear_algebra::Vector;
|
||||
|
||||
use crate::transforms;
|
||||
use crate::transforms::{ReluTransform, SoftmaxTransform};
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct NNPoint {
|
||||
label: usize,
|
||||
normalized_image: Vec<Float>,
|
||||
}
|
||||
|
||||
impl NNPoint {
|
||||
pub fn new(label: u8, normalized_image: Vec<Float>) -> Self {
|
||||
Self {
|
||||
label: label as usize,
|
||||
normalized_image,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct NeuralNetworkInTraining {
|
||||
neurons_per_layer: Vec<usize>,
|
||||
|
||||
inputs: Vec<Vector>, // Each input will start with 1
|
||||
output: Vector,
|
||||
input_gradients: Vec<Vector>,
|
||||
|
||||
transforms: Vec<ReluTransform>,
|
||||
output_transform: SoftmaxTransform,
|
||||
}
|
||||
|
||||
#[derive(Debug, Copy, Clone)]
|
||||
pub struct NeuralNetworkParameters {
|
||||
pub epochs: usize,
|
||||
pub batch_size: usize,
|
||||
pub learning_rate: Float,
|
||||
}
|
||||
|
||||
impl NeuralNetworkParameters {
|
||||
fn show(&self) -> String {
|
||||
format!(
|
||||
"epoch = {}, batch = {}, rate = {}",
|
||||
self.epochs, self.batch_size, self.learning_rate
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
impl NeuralNetworkInTraining {
|
||||
pub fn new(neurons_per_layer: Vec<usize>) -> Self {
|
||||
// e.g. neurons_per_layer = [699, 79, 49, 19, 14, 10]
|
||||
|
||||
// By a layer here we mean a collection of neurons that's between two neighbouring Transforms or the initial input or final input neurons.
|
||||
// Note that there are N + 1 layers where N is the number of Transforms used.
|
||||
if neurons_per_layer.len() < 2 {
|
||||
todo!()
|
||||
}
|
||||
|
||||
let neurons_per_layer_except_last: Vec<usize> = neurons_per_layer
|
||||
.clone()
|
||||
.into_iter()
|
||||
.rev()
|
||||
.skip(1)
|
||||
.rev()
|
||||
.collect();
|
||||
let neurons_per_layer_except_first: Vec<usize> =
|
||||
neurons_per_layer.clone().into_iter().skip(1).collect();
|
||||
|
||||
// Convention: the first component should always be 1.0 - this allows the first
|
||||
// column of the weight matrix to be interpreted as bias.
|
||||
let inputs: Vec<Vector> = neurons_per_layer_except_last
|
||||
.iter()
|
||||
.map(|neuron_count| {
|
||||
let mut v = Vector::zero(neuron_count + 1);
|
||||
v[0] = 1.0;
|
||||
v
|
||||
})
|
||||
.collect();
|
||||
|
||||
let input_gradients: Vec<Vector> = neurons_per_layer_except_first
|
||||
.iter()
|
||||
.map(|neuron_count| Vector::zero(*neuron_count))
|
||||
.collect();
|
||||
|
||||
let transforms: Vec<ReluTransform> = neurons_per_layer_except_last
|
||||
.iter()
|
||||
.zip(neurons_per_layer_except_last.iter().skip(1))
|
||||
.map(|(input_neuron_count, output_neuron_count)| {
|
||||
ReluTransform::new(*input_neuron_count + 1, *output_neuron_count)
|
||||
})
|
||||
.collect();
|
||||
|
||||
let neurons_in_last_layer = *neurons_per_layer.last().unwrap();
|
||||
let neurons_in_next_to_last_layer = *neurons_per_layer_except_last.last().unwrap();
|
||||
|
||||
let output = Vector::zero(neurons_in_last_layer);
|
||||
let output_transform =
|
||||
SoftmaxTransform::new(neurons_in_next_to_last_layer + 1, neurons_in_last_layer);
|
||||
|
||||
Self {
|
||||
neurons_per_layer,
|
||||
|
||||
inputs,
|
||||
output,
|
||||
input_gradients,
|
||||
|
||||
transforms,
|
||||
output_transform,
|
||||
}
|
||||
}
|
||||
|
||||
// You need to initialize inputs[0] before use.
|
||||
fn output_mut(&mut self) {
|
||||
// The following iterates over pairs of neighbouring inputs where we
|
||||
// have a mutable reference to both of them.
|
||||
// With Rust borrow-checking rules this can't be done directly,
|
||||
// so I had to resort to `split_at_mut(k)`
|
||||
for i in 0..self.inputs.len() - 1 {
|
||||
let (left_inputs, right_inputs) = self.inputs.split_at_mut(i + 1);
|
||||
|
||||
let transform = &mut self.transforms[i];
|
||||
let input = &left_inputs[i][..];
|
||||
let output = &mut right_inputs[0][1..];
|
||||
transform.output_mut(input, output);
|
||||
}
|
||||
self.output_transform.output_mut(
|
||||
&self.inputs[self.inputs.len() - 1][..],
|
||||
&mut self.output[..],
|
||||
);
|
||||
}
|
||||
|
||||
// Initialize input_gradients[-1] with error gradient before use.
|
||||
fn update_weights_mut(&mut self) {
|
||||
let last_index = self.input_gradients.len() - 1;
|
||||
|
||||
// TODO: Last layer is no different. This should be part of the same loop
|
||||
{
|
||||
let transform = &mut self.output_transform;
|
||||
|
||||
transform.potential_gradient_mut(&self.input_gradients[last_index][..]);
|
||||
transform
|
||||
.gradient_with_respect_to_input_mut(&mut self.input_gradients[last_index - 1][..]);
|
||||
transform.add_gradient_with_respect_to_weights_mut(&self.inputs[last_index][..]);
|
||||
}
|
||||
|
||||
for i in (0..self.input_gradients.len() - 2).rev() {
|
||||
let (left_input_gradient, right_input_gradient) =
|
||||
self.input_gradients.split_at_mut(i + 1);
|
||||
|
||||
let transform = &mut self.transforms[i + 1];
|
||||
let left_grad = &mut left_input_gradient[i][..];
|
||||
let right_grad = &right_input_gradient[0][..];
|
||||
let input = &self.inputs[i + 1][..];
|
||||
|
||||
transform.potential_gradient_mut(right_grad);
|
||||
transform.gradient_with_respect_to_input_mut(left_grad);
|
||||
transform.add_gradient_with_respect_to_weights_mut(input);
|
||||
}
|
||||
|
||||
{
|
||||
let transform = &mut self.transforms[0];
|
||||
transform.potential_gradient_mut(&self.input_gradients[0][..]);
|
||||
transform.add_gradient_with_respect_to_weights_mut(&self.inputs[0][..]);
|
||||
// Note that we are not computing gradient with respect to input, since this is the
|
||||
// first layer and we don't care about changes to input, only to weights.
|
||||
}
|
||||
}
|
||||
|
||||
fn forward_and_backwards_mut(&mut self, point: &NNPoint) {
|
||||
self.inputs[0][1..].copy_from_slice(&point.normalized_image[..]);
|
||||
self.output_mut();
|
||||
|
||||
let last_index = self.input_gradients.len() - 1;
|
||||
// WARNING: Use proper error function
|
||||
// transforms::gradient_error_mut(&self.output[..], desired_output, &mut self.input_gradients[last_index][..]);
|
||||
transforms::cross_entropy_derivative_simple(
|
||||
&self.output[..],
|
||||
point.label,
|
||||
&mut self.input_gradients[last_index][..],
|
||||
);
|
||||
self.update_weights_mut()
|
||||
}
|
||||
|
||||
fn iterate_over_batch_mut(&mut self, learning_rate: Float, batch: &[NNPoint]) {
|
||||
// iterates over the batch, while updating gradient of weights.
|
||||
for point in batch {
|
||||
self.forward_and_backwards_mut(point);
|
||||
}
|
||||
|
||||
// Update the current weights by the opposite of the epsilon / batch_size *weight_gradients
|
||||
let batch_size = batch.len() as Float;
|
||||
let epsilon = -learning_rate / batch_size;
|
||||
|
||||
for transform in &mut self.transforms {
|
||||
transform
|
||||
.weight
|
||||
.add_to_self_scaled_mut(epsilon, &transform.weight_gradient);
|
||||
// Resets the weight gradient to zero so it can be used in next batch.
|
||||
transform.weight_gradient.zero_mut();
|
||||
}
|
||||
}
|
||||
|
||||
fn iterate_over_epoch(
|
||||
&mut self,
|
||||
training_set: &mut [NNPoint],
|
||||
batch_size: usize,
|
||||
learning_rate: Float,
|
||||
) {
|
||||
use rand::seq::SliceRandom;
|
||||
use rand::thread_rng;
|
||||
training_set.shuffle(&mut thread_rng()); // Shuffling is linear in the size of the slice
|
||||
|
||||
for batch in training_set.chunks(batch_size) {
|
||||
self.iterate_over_batch_mut(learning_rate, batch);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn train(
|
||||
&mut self,
|
||||
parameters: NeuralNetworkParameters,
|
||||
training_set: &mut [NNPoint],
|
||||
testing_set: Option<&[NNPoint]>,
|
||||
) {
|
||||
fn test(
|
||||
nn: &mut NeuralNetworkInTraining,
|
||||
parameters: NeuralNetworkParameters,
|
||||
testing_set: Option<&[NNPoint]>,
|
||||
accuracy_per_epoch: &mut Vec<f32>,
|
||||
) {
|
||||
if let Some(testing_set) = testing_set {
|
||||
let accuracy = nn.accuracy(testing_set);
|
||||
accuracy_per_epoch.push(accuracy);
|
||||
println!();
|
||||
println!("{}", parameters.show());
|
||||
println!("{:?}", nn.neurons_per_layer);
|
||||
println!("{:?}", accuracy_per_epoch);
|
||||
}
|
||||
}
|
||||
|
||||
fn next_learning_rate(initial_learning_rate: Float, epoch: usize) -> Float {
|
||||
// initial_learning_rate / (1.0 + epoch as Float / 30.0)
|
||||
// initial_learning_rate * (0.1 as Float).powf(epoch as Float / 50.0)
|
||||
initial_learning_rate * (0.1 as Float).powf(epoch as Float / 20.0)
|
||||
}
|
||||
|
||||
use std::time::Instant;
|
||||
let now = Instant::now();
|
||||
|
||||
let number_of_epochs = parameters.epochs;
|
||||
let batch_size = parameters.batch_size;
|
||||
let mut learning_rate = parameters.learning_rate;
|
||||
let mut accuracy_per_epoch = Vec::with_capacity(number_of_epochs + 1);
|
||||
|
||||
test(self, parameters, testing_set, &mut accuracy_per_epoch);
|
||||
for epoch in 0..number_of_epochs {
|
||||
println!("Epoch {}/{}", epoch + 1, number_of_epochs);
|
||||
println!("Current learning rate = {}", learning_rate);
|
||||
self.iterate_over_epoch(training_set, batch_size, learning_rate);
|
||||
test(self, parameters, testing_set, &mut accuracy_per_epoch);
|
||||
|
||||
learning_rate = next_learning_rate(parameters.learning_rate, epoch);
|
||||
|
||||
let elapsed = now.elapsed();
|
||||
let total_seconds = elapsed.as_secs();
|
||||
let minutes = total_seconds / 60;
|
||||
let seconds = total_seconds % 60;
|
||||
println!("Duration of training: {} min {} sec", minutes, seconds);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn show_accuracy_on(&mut self, testing_set: &[NNPoint], dataset_name: &str) {
|
||||
println!(
|
||||
"{} dataset accuracy: {:?}",
|
||||
dataset_name,
|
||||
self.accuracy(testing_set)
|
||||
);
|
||||
}
|
||||
|
||||
pub fn output_label(&self) -> usize {
|
||||
let mut state: Option<(usize, Float)> = None;
|
||||
for (i, y) in self.output.iter().enumerate() {
|
||||
match state {
|
||||
Some((_, max_so_far)) => {
|
||||
if *y > max_so_far {
|
||||
state = Some((i, *y))
|
||||
}
|
||||
}
|
||||
None => state = Some((i, *y)),
|
||||
}
|
||||
}
|
||||
match state {
|
||||
Some((label, _)) => label,
|
||||
None => {
|
||||
todo!()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn accuracy(&mut self, testing_set: &[NNPoint]) -> f32 {
|
||||
let mut num_of_correct_classifications = 0;
|
||||
for point in testing_set {
|
||||
self.inputs[0][1..].copy_from_slice(&point.normalized_image[..]);
|
||||
self.output_mut(); // This doesn't change the weights
|
||||
|
||||
let neural_network_produced_label = self.output_label();
|
||||
if point.label == neural_network_produced_label {
|
||||
num_of_correct_classifications += 1
|
||||
}
|
||||
// println!("max-label: {}, desired-label: {}, prob-distr: {:?}", neural_network_produced_label, point.label, self.output);
|
||||
}
|
||||
num_of_correct_classifications as f32 / testing_set.len() as f32
|
||||
}
|
||||
|
||||
pub fn test(&mut self, dataset: &[NNPoint]) -> Vec<usize> {
|
||||
dataset
|
||||
.iter()
|
||||
.map(|point| {
|
||||
self.inputs[0][1..].copy_from_slice(&point.normalized_image[..]);
|
||||
self.output_mut();
|
||||
self.output_label()
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
pub fn test_distributions(&mut self, dataset: &[NNPoint]) -> Vec<Vec<Float>> {
|
||||
dataset
|
||||
.iter()
|
||||
.map(|point| {
|
||||
self.inputs[0][1..].copy_from_slice(&point.normalized_image[..]);
|
||||
self.output_mut();
|
||||
self.output.clone().to_vec()
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
}
|
||||
121
src/preprocessing.rs
Normal file
121
src/preprocessing.rs
Normal file
|
|
@ -0,0 +1,121 @@
|
|||
use crate::env;
|
||||
use crate::float::Float;
|
||||
use crate::neural_network::NNPoint;
|
||||
use std::fs::File;
|
||||
use std::io;
|
||||
use std::io::{BufRead, BufReader, BufWriter, Write};
|
||||
|
||||
#[derive(Debug)]
|
||||
enum ParsingError {
|
||||
CouldNotParseLabel,
|
||||
CouldNotParseVector,
|
||||
ImageHasWrongSize,
|
||||
}
|
||||
|
||||
fn parse_dataset(
|
||||
labels_buffer: impl BufRead,
|
||||
vectors_buffer: impl BufRead,
|
||||
number_of_points: usize,
|
||||
) -> Result<Result<Vec<NNPoint>, ParsingError>, io::Error> {
|
||||
let mut output = vec![];
|
||||
for (label_result, vector_result) in labels_buffer
|
||||
.lines()
|
||||
.zip(vectors_buffer.lines())
|
||||
.take(number_of_points)
|
||||
{
|
||||
let label_str = label_result?;
|
||||
let image_str = vector_result?;
|
||||
|
||||
let label: u8 = match label_str.parse() {
|
||||
Ok(label) => label,
|
||||
Err(_) => return Ok(Err(ParsingError::CouldNotParseLabel)),
|
||||
};
|
||||
|
||||
let mut image: Vec<u8> = vec![];
|
||||
for str in image_str.split(',') {
|
||||
match str.parse() {
|
||||
Ok(pixel_value) => {
|
||||
image.push(pixel_value);
|
||||
}
|
||||
Err(_) => return Ok(Err(ParsingError::CouldNotParseVector)),
|
||||
}
|
||||
}
|
||||
|
||||
if image.len() != env::NUMBER_OF_PIXELS_PER_IMAGE {
|
||||
return Ok(Err(ParsingError::ImageHasWrongSize));
|
||||
}
|
||||
|
||||
output.push(NNPoint::new(label, normalize_input(&image)));
|
||||
}
|
||||
Ok(Ok(output))
|
||||
}
|
||||
|
||||
/// Mean pixel intensity of `vec`.
/// NOTE(review): returns NaN for an empty slice (0/0) — callers always pass
/// full images, so this is not guarded here.
fn average(vec: &[u8]) -> Float {
    // The sum fits in a u32: it is bounded by NUMBER_OF_PIXELS_PER_IMAGE * 255 (~200 k).
    let sum: u32 = vec.iter().map(|&x| x as u32).sum();
    (sum as Float) / (vec.len() as Float)
}
|
||||
|
||||
/// Sum of squares of `vec`.
///
/// Assumes `vec` has already been mean-centered; note this is the
/// *unnormalized* variance — it is deliberately not divided by `vec.len()`,
/// matching how `normalize_input` uses its square root as the scale factor.
fn variance(vec: &[Float]) -> Float {
    vec.iter().map(|x| x * x).sum()
}
|
||||
|
||||
fn normalize_input(vec: &[u8]) -> Vec<Float> {
|
||||
let average = average(vec);
|
||||
let mut result: Vec<Float> = vec.iter().map(|x| (*x as Float) - average).collect();
|
||||
let stddev = variance(&result).sqrt();
|
||||
for i in &mut result {
|
||||
*i /= stddev;
|
||||
}
|
||||
result
|
||||
}
|
||||
|
||||
pub fn dataset_from_file(
|
||||
(labels_file_path, vectors_file_path): (&str, &str),
|
||||
number_of_points: usize,
|
||||
) -> Result<Vec<NNPoint>, io::Error> {
|
||||
let (labels_file, vectors_file): (File, File) = (
|
||||
File::open(labels_file_path)?,
|
||||
File::open(vectors_file_path)?,
|
||||
);
|
||||
|
||||
match parse_dataset(
|
||||
BufReader::new(labels_file),
|
||||
BufReader::new(vectors_file),
|
||||
number_of_points,
|
||||
)? {
|
||||
Ok(points) => Ok(points),
|
||||
Err(parsing_error) => {
|
||||
println!("ERROR: {:?}", parsing_error);
|
||||
todo!()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Writes one predicted label per line to `file_path`.
///
/// Flushes the writer explicitly before returning: `BufWriter`'s `Drop`
/// flushes too, but it silently swallows any write error, so without the
/// explicit flush a failed write could go unreported.
pub fn export_to(outputs: &[usize], file_path: &str) -> Result<(), io::Error> {
    let mut file = BufWriter::new(File::create(file_path)?);
    for x in outputs {
        writeln!(file, "{}", x)?
    }
    file.flush()
}
|
||||
|
||||
pub fn export_distributions_to(outputs: &[Vec<Float>], file_path: &str) -> Result<(), io::Error> {
|
||||
let mut file = BufWriter::new(File::create(file_path)?);
|
||||
for ps in outputs {
|
||||
let mut s = "".to_string();
|
||||
for p in ps {
|
||||
let p_str = format!("{}, ", p);
|
||||
s += &p_str
|
||||
}
|
||||
writeln!(file, "[{}]", s)?
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
315
src/transforms.rs
Normal file
315
src/transforms.rs
Normal file
|
|
@ -0,0 +1,315 @@
|
|||
use crate::float::Float;
|
||||
use crate::linear_algebra::{ColumnEfficientMatrix, Vector};
|
||||
use rand_distr::Normal;
|
||||
|
||||
/// Half the squared Euclidean distance between `xs` and `ys`:
/// 0.5 * Σ (x_i - y_i)². Iteration stops at the shorter slice.
pub fn l2_error(xs: &[Float], ys: &[Float]) -> Float {
    let sum_of_squares: Float = xs
        .iter()
        .zip(ys)
        .map(|(x, y)| {
            let d = x - y;
            d * d
        })
        .sum();
    0.5 * sum_of_squares
}
|
||||
|
||||
/// Gradient of `l2_error` with respect to `xs`, written into `out`:
/// out_i = x_i - y_i.
/// `out` is indexed (not zipped), so it must be at least as long as the
/// shorter of `xs`/`ys` — same panic behavior as before on a short `out`.
pub fn gradient_l2_error_mut(xs: &[Float], ys: &[Float], out: &mut [Float]) {
    for (i, diff) in xs.iter().zip(ys).map(|(x, y)| x - y).enumerate() {
        out[i] = diff;
    }
}
|
||||
|
||||
/// Logistic sigmoid: 1 / (1 + e^(-x)).
fn sigmoid(x: Float) -> Float {
    let denominator = 1.0 + (-x).exp();
    1.0 / denominator
}
|
||||
|
||||
/// Activation applied to a sigmoid-layer potential. Kept as a named wrapper
/// so the choice of activation function is explicit at call sites.
fn activation_of_sigmoid_potential(potential: Float) -> Float {
    sigmoid(potential)
}
|
||||
|
||||
fn vectorized_sigmoid_activation_of_potential(potentials: &[Float], out: &mut [Float]) {
|
||||
for (potential, y) in potentials.iter().zip(out) {
|
||||
*y = activation_of_sigmoid_potential(*potential)
|
||||
}
|
||||
}
|
||||
|
||||
fn derivative_of_sigmoid_activation_of_potential(potential: Float) -> Float {
|
||||
let s = sigmoid(potential);
|
||||
s * (1.0 - s)
|
||||
}
|
||||
|
||||
/// Backpropagates `output_gradient` through the element-wise sigmoid.
///
/// Fills `derivatives_state` with σ'(potential_i), then — since the Jacobian
/// of an element-wise map is diagonal — applies it as a `DiagonalMatrix` to
/// `output_gradient`, writing the result into `input_gradient`.
/// `derivatives_state` is caller-provided scratch space so no allocation
/// happens per call.
fn vectorized_gradient_of_sigmoid_activation_of_potential_mut(
    potentials: &[Float],
    derivatives_state: &mut [Float],
    output_gradient: &[Float],
    input_gradient: &mut [Float],
) {
    use crate::linear_algebra::DiagonalMatrix;
    for (potential, state) in potentials.iter().zip(derivatives_state.iter_mut()) {
        *state = derivative_of_sigmoid_activation_of_potential(*potential)
    }
    DiagonalMatrix::new(derivatives_state).apply_mut(output_gradient, input_gradient);
}
|
||||
|
||||
// =====cross-entropy=====
|
||||
// takes in two probability distributions
|
||||
fn cross_entropy(p: &[Float], q: &[Float]) -> Float {
|
||||
let mut result = 0.0;
|
||||
for (x, y) in p.iter().zip(q) {
|
||||
result += y * x.ln()
|
||||
}
|
||||
-result
|
||||
}
|
||||
|
||||
// Second probability distribution is deterministic,
// i.e. it deterministically outputs the same value.
/// Cross-entropy of `p` against the one-hot distribution concentrated at
/// index `q`: -ln(p_q). Panics if `q` is out of bounds for `p`.
fn cross_entropy_simple(p: &[Float], q: usize) -> Float {
    -p[q].ln()
}
|
||||
|
||||
/// Gradient of `cross_entropy` with respect to `p`, written into `out`:
/// out_i = -q_i / p_i. Iteration stops at the shortest of the three slices.
fn cross_entropy_derivative_mut(p: &[Float], q: &[Float], out: &mut [Float]) {
    for (slot, (pi, qi)) in out.iter_mut().zip(p.iter().zip(q)) {
        *slot = -qi / pi;
    }
}
|
||||
|
||||
// Second probability distribution is deterministic,
|
||||
// i.e. it deterministically outputs the same value.
|
||||
pub fn cross_entropy_derivative_simple(p: &[Float], q: usize, out: &mut [Float]) {
|
||||
// TODO: Do we really need to reset everything to besides the q-th index?
|
||||
for a in out.iter_mut() {
|
||||
*a = 0.0;
|
||||
}
|
||||
out[q] = -1.0 / p[q];
|
||||
}
|
||||
|
||||
fn softmax_mut(input: &[Float], out: &mut [Float]) {
|
||||
let mut s = 0.0;
|
||||
for (x, y) in input.iter().zip(out.iter_mut()) {
|
||||
let e = x.exp();
|
||||
*y = e;
|
||||
s += e
|
||||
}
|
||||
|
||||
for y in out {
|
||||
*y /= s
|
||||
}
|
||||
}
|
||||
|
||||
/// Backpropagates `gradient_output` through the softmax.
///
/// Uses the softmax Jacobian d out_i / d in_j = s_i * (δ_ij - s_j),
/// evaluated from the cached forward output `softmax_output`.
/// Runs in O(n²) over the distribution size, which is small here
/// (one entry per class).
fn softmax_gradient_mut(
    softmax_output: &[Float],
    gradient_output: &[Float],
    gradient_input: &mut [Float],
) {
    for (j, dx) in gradient_input.iter_mut().enumerate() {
        *dx = 0.0;
        for (i, dy) in gradient_output.iter().enumerate() {
            // Note that the gradient matrix is symmetric, so don't worry about the order of
            // indices
            *dx += softmax_output[i] * (if i == j { 1.0 } else { 0.0 } - softmax_output[j]) * dy
        }
    }
}
|
||||
|
||||
// relu
|
||||
fn relu_mut(input: &[Float], out: &mut [Float]) {
|
||||
for (x, y) in input.iter().zip(out.iter_mut()) {
|
||||
*y = x.max(0.0)
|
||||
}
|
||||
}
|
||||
|
||||
/// Backpropagates through ReLU: passes `gradient_output` through where the
/// forward input was strictly positive, zero elsewhere (subgradient 0 is
/// used at x == 0). Iteration stops at the shortest slice.
fn relu_gradient_mut(input: &[Float], gradient_output: &[Float], gradient_input: &mut [Float]) {
    for ((dx, dy), x) in gradient_input.iter_mut().zip(gradient_output).zip(input) {
        *dx = if *x > 0.0 { *dy } else { 0.0 };
    }
}
|
||||
|
||||
// =====sigmoid=====
/// A fully-connected layer with sigmoid activation.
///
/// The first column of `weight` is the bias column (inputs carry a constant
/// in slot 0; the backward pass drops this column — see
/// `gradient_with_respect_to_input_mut`).
#[derive(Debug)]
pub struct SigmoidTransform {
    pub weight: ColumnEfficientMatrix,
    // Scratch: W[input], cached by `output_mut` for the backward pass.
    potential_vector: Vector,
    // Scratch: σ'(potential), filled during `potential_gradient_mut`.
    derivatives_state: Vector,
    // Gradient of the loss w.r.t. the potential, set by `potential_gradient_mut`.
    potential_gradient: Vector,

    // Accumulated gradient of the loss w.r.t. `weight`.
    pub weight_gradient: ColumnEfficientMatrix,
}
|
||||
|
||||
impl SigmoidTransform {
    /// Creates a layer with weights drawn from N(0, 1/input_dimension)
    /// (LeCun-style initialization) and zeroed scratch buffers.
    pub fn new(input_dimension: usize, output_dimension: usize) -> Self {
        let mean = 0.0;
        let std_dev = 1.0 / (input_dimension as Float).sqrt();
        let normal_distr = Normal::new(mean, std_dev).unwrap();

        Self {
            weight: ColumnEfficientMatrix::random_with_normal_distribution(
                input_dimension,
                output_dimension,
                normal_distr,
            ),
            potential_vector: Vector::zero(output_dimension),
            derivatives_state: Vector::zero(output_dimension), // TODO: Can I get rid of this?
            potential_gradient: Vector::zero(output_dimension),

            weight_gradient: ColumnEfficientMatrix::zero(input_dimension, output_dimension),
        }
    }

    /// Forward pass: output = σ(W[input]).
    /// Caches the potential vector for the backward pass.
    pub fn output_mut(&mut self, input: &[Float], output: &mut [Float]) {
        self.weight.apply_mut(input, &mut self.potential_vector[..]); // potential = W[input]
        vectorized_sigmoid_activation_of_potential(&self.potential_vector[..], output);
        // y = f(potential)
    }

    // Note below that (1) and (2) are independent, but they both depend on (0).

    // (0)
    /// Computes the gradient w.r.t. the potential from the gradient w.r.t.
    /// the layer output. Must be called after `output_mut` and before (1)/(2).
    pub fn potential_gradient_mut(&mut self, output_gradient: &[Float]) {
        // updates the potential gradient
        vectorized_gradient_of_sigmoid_activation_of_potential_mut(
            &self.potential_vector[..],
            &mut self.derivatives_state[..],
            output_gradient,
            &mut self.potential_gradient[..],
        ); // potential_gradient = grad[f](potential)[output_gradient]
    }

    // Note that it makes sense to have the two gradients split, since for the input layer we will
    // not need to compute the input gradient, only the weight gradient is important.
    // WARNING: You need to call `potential_gradient_mut` before using the below function
    // (1)
    /// Gradient of the loss w.r.t. this layer's input (for the previous layer).
    pub fn gradient_with_respect_to_input_mut(&self, input_gradient: &mut [Float]) {
        // updates the input gradient
        //
        // Note that the first column of `self.weight` is the bias,
        // and the previous layer doesn't care about its gradient.
        // So we just return the gradient below the first component of the input
        // by dropping the bias column.
        self.weight
            .drop_first_column_coapply_mut(&self.potential_gradient[..], input_gradient);
        // transpose[T without the first column][potential_gradient]
    }

    // WARNING: You need to call `potential_gradient_mut` before using the below function,
    // which ensures `potential_gradient` is up to date.
    // (2)
    /// Accumulates (adds, does not overwrite) the loss gradient w.r.t. the
    /// weights into `weight_gradient`.
    pub fn add_gradient_with_respect_to_weights_mut(&mut self, input: &[Float]) {
        use crate::linear_algebra::VectorTensorCovectorMatrix;

        let matrix = VectorTensorCovectorMatrix::new(&self.potential_gradient[..], input); // grad[f](potential)[output_grad] **tensor** input
        matrix.add_to_mut(&mut self.weight_gradient);
    }
}
|
||||
|
||||
// =====softmax=====
/// A fully-connected layer with softmax activation — the network's output
/// layer, producing a probability distribution over classes.
#[derive(Debug)]
pub struct SoftmaxTransform {
    pub weight: ColumnEfficientMatrix,
    // Scratch: W[input], cached by `output_mut` for the backward pass.
    potential_vector: Vector,
    softmax_output: Vector, // Used for computation of the softmax gradient
    // Gradient of the loss w.r.t. the potential, set by `potential_gradient_mut`.
    potential_gradient: Vector,

    // Accumulated gradient of the loss w.r.t. `weight`.
    pub weight_gradient: ColumnEfficientMatrix,
}
|
||||
|
||||
impl SoftmaxTransform {
    /// Creates a layer with weights drawn from N(0, 1/input_dimension)
    /// (LeCun-style initialization) and zeroed scratch buffers.
    pub fn new(input_dimension: usize, output_dimension: usize) -> Self {
        let mean = 0.0;
        let std_dev = 1.0 / (input_dimension as Float).sqrt();
        let normal_distr = Normal::new(mean, std_dev).unwrap();

        Self {
            weight: ColumnEfficientMatrix::random_with_normal_distribution(
                input_dimension,
                output_dimension,
                normal_distr,
            ),
            potential_vector: Vector::zero(output_dimension),
            softmax_output: Vector::zero(output_dimension),
            potential_gradient: Vector::zero(output_dimension),

            weight_gradient: ColumnEfficientMatrix::zero(input_dimension, output_dimension),
        }
    }

    /// Forward pass: output = softmax(W[input]).
    /// Caches the softmax output, which the backward pass needs to build the
    /// softmax Jacobian.
    pub fn output_mut(&mut self, input: &[Float], output: &mut [Float]) {
        self.weight.apply_mut(input, &mut self.potential_vector[..]); // potential = W[input]
        softmax_mut(&self.potential_vector[..], output); // y = f(potential)
        self.softmax_output.copy_from_slice(output)
    }

    /// Computes the gradient w.r.t. the potential from the gradient w.r.t.
    /// the layer output. Must be called after `output_mut` and before the
    /// two gradient functions below.
    pub fn potential_gradient_mut(&mut self, output_gradient: &[Float]) {
        softmax_gradient_mut(
            &self.softmax_output[..],
            output_gradient,
            &mut self.potential_gradient[..],
        ); // potential_gradient = grad[softmax](potential)[output_gradient]
    }

    /// Gradient of the loss w.r.t. this layer's input (for the previous
    /// layer). Drops the bias column of `weight`, since the previous layer
    /// has no use for the bias slot's gradient.
    /// WARNING: call `potential_gradient_mut` first.
    pub fn gradient_with_respect_to_input_mut(&self, input_gradient: &mut [Float]) {
        self.weight
            .drop_first_column_coapply_mut(&self.potential_gradient[..], input_gradient);
        // transpose[T without the first column][potential_gradient]
    }

    /// Accumulates (adds, does not overwrite) the loss gradient w.r.t. the
    /// weights into `weight_gradient`.
    /// WARNING: call `potential_gradient_mut` first.
    pub fn add_gradient_with_respect_to_weights_mut(&mut self, input: &[Float]) {
        use crate::linear_algebra::VectorTensorCovectorMatrix;

        let matrix = VectorTensorCovectorMatrix::new(&self.potential_gradient[..], input); // grad[f](potential)[output_grad] **tensor** input
        matrix.add_to_mut(&mut self.weight_gradient);
    }
}
|
||||
|
||||
/// A fully-connected layer with ReLU activation.
/// Unlike `SigmoidTransform`, no separate derivatives buffer is needed:
/// the ReLU gradient is computed directly from the cached potentials.
#[derive(Debug)]
pub struct ReluTransform {
    pub weight: ColumnEfficientMatrix,
    // Scratch: W[input], cached by `output_mut` for the backward pass.
    potential_vector: Vector,
    // Gradient of the loss w.r.t. the potential, set by `potential_gradient_mut`.
    potential_gradient: Vector,

    // Accumulated gradient of the loss w.r.t. `weight`.
    pub weight_gradient: ColumnEfficientMatrix,
}
|
||||
|
||||
impl ReluTransform {
    /// Creates a layer with weights drawn from N(0, 1/input_dimension)
    /// (LeCun-style initialization) and zeroed scratch buffers.
    pub fn new(input_dimension: usize, output_dimension: usize) -> Self {
        let mean = 0.0;
        let std_dev = 1.0 / (input_dimension as Float).sqrt();
        let normal_distr = Normal::new(mean, std_dev).unwrap();

        Self {
            weight: ColumnEfficientMatrix::random_with_normal_distribution(
                input_dimension,
                output_dimension,
                normal_distr,
            ),
            potential_vector: Vector::zero(output_dimension),
            potential_gradient: Vector::zero(output_dimension),

            weight_gradient: ColumnEfficientMatrix::zero(input_dimension, output_dimension),
        }
    }

    /// Forward pass: output = relu(W[input]). Caches the potential vector
    /// for the backward pass.
    pub fn output_mut(&mut self, input: &[Float], output: &mut [Float]) {
        self.weight.apply_mut(input, &mut self.potential_vector[..]); // potential = W[input]
        relu_mut(&self.potential_vector[..], output); // y = f(potential)
    }

    /// Computes the gradient w.r.t. the potential from the gradient w.r.t.
    /// the layer output. Must be called after `output_mut` and before the
    /// two gradient functions below.
    pub fn potential_gradient_mut(&mut self, output_gradient: &[Float]) {
        relu_gradient_mut(
            &self.potential_vector[..],
            output_gradient,
            &mut self.potential_gradient[..],
        ); // potential_gradient = grad[relu](potential)[output_gradient]
    }

    /// Gradient of the loss w.r.t. this layer's input (for the previous
    /// layer). Drops the bias column of `weight`, since the previous layer
    /// has no use for the bias slot's gradient.
    /// WARNING: call `potential_gradient_mut` first.
    pub fn gradient_with_respect_to_input_mut(&self, input_gradient: &mut [Float]) {
        self.weight
            .drop_first_column_coapply_mut(&self.potential_gradient[..], input_gradient);
        // transpose[T without the first column][potential_gradient]
    }

    /// Accumulates (adds, does not overwrite) the loss gradient w.r.t. the
    /// weights into `weight_gradient`.
    /// WARNING: call `potential_gradient_mut` first.
    pub fn add_gradient_with_respect_to_weights_mut(&mut self, input: &[Float]) {
        use crate::linear_algebra::VectorTensorCovectorMatrix;

        let matrix = VectorTensorCovectorMatrix::new(&self.potential_gradient[..], input); // grad[f](potential)[output_grad] **tensor** input
        matrix.add_to_mut(&mut self.weight_gradient);
    }
}
|
||||
23
tmp_repl.txt
Normal file
23
tmp_repl.txt
Normal file
|
|
@ -0,0 +1,23 @@
|
|||
|
||||
cargo init --edition 2018
|
||||
|
||||
cargo add rand
|
||||
|
||||
cargo add rand_distr
|
||||
|
||||
cargo run --release
|
||||
|
||||
module add rust
|
||||
|
||||
cargo build
|
||||
|
||||
|
||||
cargo fmt
|
||||
|
||||
# linter
|
||||
cargo clippy -- -D warnings
|
||||
|
||||
|
||||
python3 evaluator/evaluate.py test_predictions.csv data/fashion_mnist_test_labels.csv
|
||||
|
||||
python3 evaluator/evaluate.py train_predictions.csv data/fashion_mnist_train_labels.csv
|
||||
Loading…
Add table
Add a link
Reference in a new issue