Compare commits


No commits in common. "fc7f02d62ce60b6a17116e682ae4c5f928276b06" and "c09dc9f7ec2fa3876fc97621b6b956af5f080507" have entirely different histories.

8 changed files with 38 additions and 60236 deletions

.gitignore vendored

@@ -1,7 +1,6 @@
# ---> Python
# Byte-compiled / optimized / DLL files
-*__pycache__/
-*.jukit/
+__pycache__/
*.py[cod]
*$py.class

README.md

@@ -1,3 +1,3 @@
# neural-networks-from-scratch
-Implementing neural networks with only numpy (Well, cupy which is numpy for GPUs)
+Implementing neural networks with only numpy

File diff suppressed because it is too large

src/layer.py Normal file

@@ -0,0 +1,36 @@
import numpy as np
from abc import ABC, abstractmethod


class Layer(ABC):
    @abstractmethod
    def forward(self, x: np.ndarray) -> np.ndarray:
        """
        x: the layer inputs.
        Implementations cache any intermediate values they need
        for the backward pass during this call, so call forward
        right before computing gradients.
        """

    @property
    @abstractmethod
    def parameters(self) -> tuple[np.ndarray, ...]:
        """
        Returns the layer's parameters.
        The ordering is left to each subclass's convenience.
        """

    @parameters.setter
    @abstractmethod
    def parameters(self, parameters: tuple[np.ndarray, ...]) -> None:
        """
        Writes new values to the parameters property.
        """

    @abstractmethod
    def d_output_wrt_parameters(self, inputs: np.ndarray) -> tuple[np.ndarray, ...]:
        pass

    @abstractmethod
    def d_output_wrt_inputs(self) -> np.ndarray:
        pass

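To make the intended use of this new interface concrete, here is a minimal sketch of a subclass. It is not part of either commit: the class name, the import path, and the choice of returning empty tuples for a parameter-free layer are all assumptions about how the abstract API above is meant to be implemented.

import numpy as np
from layer import Layer  # hypothetical import; assumes src/layer.py is importable as "layer"

class ReLUExample(Layer):
    __cached_inputs: np.ndarray

    def forward(self, x: np.ndarray) -> np.ndarray:
        # Cache the inputs so the derivative can be computed later.
        self.__cached_inputs = x
        return np.maximum(0, x)

    @property
    def parameters(self) -> tuple[np.ndarray, ...]:
        # No trainable parameters.
        return ()

    @parameters.setter
    def parameters(self, parameters: tuple[np.ndarray, ...]) -> None:
        # Nothing to write for a parameter-free layer.
        pass

    def d_output_wrt_parameters(self, inputs: np.ndarray) -> tuple[np.ndarray, ...]:
        # No parameters, hence no parameter gradients.
        return ()

    def d_output_wrt_inputs(self) -> np.ndarray:
        # Elementwise ReLU derivative: 1 where the cached input was positive, else 0.
        return (self.__cached_inputs > 0).astype(float)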
layers.py

@@ -1,116 +0,0 @@
import cupy as cp
from abc import ABC, abstractmethod


class Layer(ABC):
    @abstractmethod
    def forward(self, inputs: cp.ndarray) -> cp.ndarray:
        pass

    @property
    @abstractmethod
    def parameters(self) -> list[cp.ndarray] | None:
        pass

    @abstractmethod
    def get_d_loss_wrt_parameters(self, d_loss_wrt_outputs: cp.ndarray) -> cp.ndarray | None:
        pass

    @abstractmethod
    def get_d_loss_wrt_inputs(self, d_loss_wrt_outputs: cp.ndarray) -> cp.ndarray:
        pass

    @abstractmethod
    def backward(self, d_loss_wrt_outputs: cp.ndarray, lr: float = .001) -> cp.ndarray:
        pass

    def __call__(self, inputs: cp.ndarray) -> cp.ndarray:
        return self.forward(inputs)


class Dense(Layer):
    __weights: cp.ndarray
    __biases: cp.ndarray
    __cached_inputs: cp.ndarray
    __cached_outputs: cp.ndarray

    def __init__(self, input_size: cp.ndarray, output_size: cp.ndarray) -> None:
        self.__weights = (cp.random.randn(input_size, output_size)) * cp.sqrt(1 / (output_size + input_size))
        self.__biases = cp.zeros(output_size)

    def forward(self, inputs: cp.ndarray) -> cp.ndarray:
        self.__cached_inputs = inputs
        self.__cached_outputs = cp.dot(inputs, self.__weights) + self.__biases
        return self.__cached_outputs

    def get_d_loss_wrt_inputs(self, d_loss_wrt_outputs: cp.ndarray) -> cp.ndarray:
        return cp.dot(d_loss_wrt_outputs, self.__weights.T)

    @property
    def parameters(self) -> list[cp.ndarray]:
        return [self.__weights, self.__biases]

    def get_d_loss_wrt_parameters(self, d_loss_wrt_outputs: cp.ndarray) -> cp.ndarray:
        d_loss_wrt_weights = cp.dot(self.__cached_inputs.T, d_loss_wrt_outputs)
        d_loss_wrt_biases = cp.sum(d_loss_wrt_outputs)
        return [d_loss_wrt_weights, d_loss_wrt_biases]

    def backward(self, d_loss_wrt_outputs: cp.ndarray, lr: float = 0.001) -> cp.ndarray:
        d_loss_wrt_outputs /= d_loss_wrt_outputs.shape[1]
        d_loss_wrt_inputs = self.get_d_loss_wrt_inputs(d_loss_wrt_outputs)
        d_loss_wrt_weights, d_loss_wrt_biases = self.get_d_loss_wrt_parameters(d_loss_wrt_outputs)
        self.__biases -= lr * d_loss_wrt_biases
        self.__weights -= lr * d_loss_wrt_weights
        return d_loss_wrt_inputs


class ReLU(Layer):
    __cached_inputs: cp.ndarray

    def forward(self, inputs: cp.ndarray) -> cp.ndarray:
        self.__cached_inputs = inputs
        return cp.maximum(0, inputs)

    @property
    def parameters(self) -> list[cp.ndarray] | None:
        pass

    def get_d_loss_wrt_parameters(self, d_loss_wrt_outputs: cp.ndarray) -> cp.ndarray | None:
        pass

    def get_d_loss_wrt_inputs(self, d_loss_wrt_outputs: cp.ndarray) -> cp.ndarray:
        return (self.__cached_inputs > 0) * d_loss_wrt_outputs

    def backward(self, d_loss_wrt_outputs: cp.ndarray, lr: float = 0.01) -> cp.ndarray:
        return self.get_d_loss_wrt_inputs(d_loss_wrt_outputs)


class Softmax(Layer):
    __cached_outputs: cp.ndarray

    def forward(self, inputs: cp.ndarray) -> cp.ndarray:
        self.__cached_outputs = cp.exp(inputs) / (cp.sum(cp.exp(inputs), axis=1).reshape(inputs.shape[0], 1) + 0.001)
        return self.__cached_outputs

    @property
    def parameters(self) -> list[cp.ndarray] | None:
        pass

    def get_d_loss_wrt_parameters(self, d_loss_wrt_outputs: cp.ndarray) -> cp.ndarray | None:
        pass

    def get_d_loss_wrt_inputs(self, d_loss_wrt_outputs: cp.ndarray) -> cp.ndarray:
        return d_loss_wrt_outputs

    def backward(self, d_loss_wrt_outputs: cp.ndarray, lr: float = 0.01) -> cp.ndarray:
        #time.sleep(1)
        #out = self.__cached_outputs
        #jacobian = out.reshape(out.shape[0], 1, out.shape[1]).repeat(2, axis=1)
        #eye = cp.eye(out.shape[1]).reshape((1, out.shape[1], out.shape[1])).repeat(jacobian.shape[0], axis=0)
        #jacobian *= eye
        #jacobian -= cp.dot(out.T, out)
        #d_loss_wrt_outputs =\
        #    cp.matmul(jacobian, d_loss_wrt_outputs.reshape(out.shape[0], out.shape[1], 1)).reshape((out.shape[0], out.shape[1]))
        #return d_loss_wrt_outputs
        return self.get_d_loss_wrt_inputs(d_loss_wrt_outputs)

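A side note on the removed Softmax: it exponentiates the raw inputs and pads the denominator with 0.001. The usual numerically stable formulation subtracts each row's maximum before exponentiating, which avoids overflow and makes the fudge constant unnecessary. A small numpy sketch of that variant, illustrative only and not code from either commit:

import numpy as np

def stable_softmax(inputs: np.ndarray) -> np.ndarray:
    # Shifting by the row maximum does not change the result mathematically,
    # but it keeps np.exp from overflowing on large logits.
    shifted = inputs - np.max(inputs, axis=1, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=1, keepdims=True)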
loss.py

@@ -1,13 +0,0 @@
import cupy as cp
from abc import ABC, abstractmethod


class Loss(ABC):
    @abstractmethod
    def d_loss_wrt_inputs(self, outputs: cp.ndarray, targets: cp.ndarray) -> cp.ndarray:
        pass


class CategoricalCrossEntropy(Loss):
    def d_loss_wrt_inputs(self, outputs: cp.ndarray, targets: cp.ndarray) -> cp.ndarray:
        return outputs - targets

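The removed loss class only defines the gradient, and outputs - targets is the gradient of cross-entropy with respect to the pre-softmax inputs, i.e. it treats softmax and cross-entropy as fused (which is why Softmax.backward above passes gradients through unchanged). For reference, a hedged numpy sketch of the loss value itself, assuming outputs are softmax probabilities and targets are one-hot rows:

import numpy as np

def categorical_cross_entropy(outputs: np.ndarray, targets: np.ndarray) -> float:
    # Mean negative log-likelihood of the target class; the epsilon guards against log(0).
    eps = 1e-12
    per_sample = -np.sum(targets * np.log(outputs + eps), axis=1)
    return float(np.mean(per_sample))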

@@ -1 +0,0 @@
import cupy as cp
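This removed module contained only the cupy import. Since the README change above drops the remark that cupy is "numpy for GPUs", a brief illustration of why the two are near drop-in replacements may help. The snippet below is a plain numpy example; the cupy variant noted in the comment is an assumption that a CUDA GPU and the cupy package are available:

import numpy as np   # with cupy installed, "import cupy as np" runs the same code on the GPU

x = np.random.randn(4, 3)
w = np.random.randn(3, 2)
out = np.dot(x, w)   # identical API call in numpy and cupy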


@@ -1,103 +0,0 @@
import time
from layers import Dense, ReLU, Softmax
from loss import CategoricalCrossEntropy
import cupy as cp
import pandas as pd
from tqdm import tqdm

#layers = [
#    Dense(2, 128),
#    ReLU(),
#    Dense(128, 128),
#    ReLU(),
#    Dense(128, 2),
#    Softmax()
#]
##layers = [Dense(2, 2)]
#
#
#def forward(x):
#    for layer in layers:
#        x = layer.forward(x)
#    return x
#
#
#SEQUENCE_LENGTH = 10000
#x1 = cp.random.random_integers(0, 1, SEQUENCE_LENGTH)
#x2 = cp.random.random_integers(0, 1, SEQUENCE_LENGTH)
#x = cp.array([x1, x2]).T
#y = cp.logical_xor(x1, x2)
#yt = cp.array([y == 0, y == 1]).T * 1.0
#
#out = forward(x)
#print(cp.sum(cp.argmax(out, axis=1) == y) * 100 / len(y))
#
#loss_function = CategoricalCrossEntropy()
#
#for i in range(100):
#    #print(f"Epoch{i + 1}")
#    output = forward(x)
#    d_loss = loss_function.d_loss_wrt_inputs(output, yt)
#    for layer in layers[::-1]:
#        d_loss = layer.backward(d_loss)
#
#out = forward(x)
#print(cp.sum(cp.argmax(out, axis=1) == y) * 100 / len(y))

training_data_frame = pd.read_csv("./mnist_train.csv")
training_data_frame.head()

training_data = cp.array(training_data_frame)
y_train = training_data[:, 0]
x_train: cp.ndarray = training_data[:, 1:]
x_train = x_train.astype(float)
x_train /= cp.argmax(x_train)

del training_data
del training_data_frame

y_train_one_hot = cp.zeros(
    (y_train.size, y_train.max().item() + 1)
)
y_train_one_hot[cp.arange(y_train.size), y_train] = 1

layers = [
    Dense(784, 128),
    ReLU(),
    Dense(128, 128),
    ReLU(),
    Dense(128, 10),
    Softmax()
]


def forward(x):
    for layer in layers:
        x = layer.forward(x)
    return x


out = forward(x_train)
print(cp.sum(cp.argmax(out, axis=1) == y_train) * 100 / len(y_train))

loss_function = CategoricalCrossEntropy()

for i in range(99999):
    output = forward(x_train)
    time.sleep(1)
    print(f"Epoch {i}:")
    print(cp.sum(cp.argmax(output, axis=1) == y_train) * 100 / len(y_train))
    d_loss = loss_function.d_loss_wrt_inputs(output, y_train_one_hot)
    for layer in layers[::-1]:
        d_loss = layer.backward(d_loss, lr=0.0001)

out = forward(x_train)
print(cp.sum(cp.argmax(out, axis=1) == y_train) * 100 / len(y_train))
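Two idioms from the removed training script deserve a compact illustration: the fancy-indexing one-hot encoding of the labels and the argmax-based accuracy check. The numpy sketch below is illustrative only and uses toy data rather than MNIST:

import numpy as np

y = np.array([2, 0, 1, 2])                      # toy labels standing in for the MNIST label column

one_hot = np.zeros((y.size, y.max() + 1))       # one row per sample, one column per class
one_hot[np.arange(y.size), y] = 1               # fancy indexing sets the target class to 1

predictions = np.array([[0.1, 0.2, 0.7],
                        [0.8, 0.1, 0.1],
                        [0.2, 0.5, 0.3],
                        [0.3, 0.3, 0.4]])
accuracy = np.sum(np.argmax(predictions, axis=1) == y) * 100 / len(y)
print(accuracy)                                 # 100.0 on this toy example

One detail worth flagging: the script normalizes with x_train /= cp.argmax(x_train), which divides by the index of the largest pixel rather than its value; dividing by x_train.max() (or the known pixel maximum 255.0) is presumably what was intended.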