Compare commits
10 Commits
c09dc9f7ec...fc7f02d62c
| Author | SHA1 | Date |
|---|---|---|
| | fc7f02d62c | |
| | a5de0ca7da | |
| | 9b4f3073f3 | |
| | 98b3b4e18a | |
| | 3fe69b3869 | |
| | b6cd28db7c | |
| | db5761cf4e | |
| | 40601130e0 | |
| | 06c981253c | |
| | 2188edacab | |
3 .gitignore (vendored)
@@ -1,6 +1,7 @@
 # ---> Python
 # Byte-compiled / optimized / DLL files
-__pycache__/
+*__pycache__/
+*.jukit/
 *.py[cod]
 *$py.class
 
README.md
@@ -1,3 +1,3 @@
 # neural-networks-from-scratch
 
-Implementing neural networks with only numpy
+Implementing neural networks with only numpy (Well, cupy which is numpy for GPUs)
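Editor's note: the README change above reflects the numpy-to-cupy switch in the files that follow. As a rough sketch (not part of these commits, names assumed), cupy mirrors most of the numpy array API, so array code like the layers below can often run on either backend by swapping the import:

```python
# Editor's illustration: pick cupy when a GPU build is available,
# otherwise fall back to numpy, which exposes the same interface here.
try:
    import cupy as xp
except ImportError:
    import numpy as xp

x = xp.random.randn(4, 3)
w = xp.random.randn(3, 2)
print(xp.dot(x, w).shape)  # (4, 2) on either backend
```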
60000 mnist_train.csv (new file)
File diff suppressed because it is too large.
36 src/layer.py (deleted)
@@ -1,36 +0,0 @@
import numpy as np
from abc import ABC, abstractmethod


class Layer(ABC):
    @abstractmethod
    def forward(self, x: np.ndarray) -> np.ndarray:
        """
        x = inputs
        if should_cache = True,
        additional caching will be done.
        Set this to true and then call forward right before calling backward
        """

    @property
    @abstractmethod
    def parameters(self) -> tuple[np.ndarray, ...]:
        """
        Returns the different parameters.
        The order is defined as per the sub class's convinience
        """

    @parameters.setter
    @abstractmethod
    def parameters(self, parameters: tuple[np.ndarray, ...]) -> None:
        """
        Write to parameters property
        """

    @abstractmethod
    def d_output_wrt_parameters(self, inputs: np.ndarray) -> tuple[np.ndarray, ...]:
        pass

    @abstractmethod
    def d_output_wrt_inputs(self) -> np.ndarray:
        pass
116 src/layers.py (new file)
@@ -0,0 +1,116 @@
import cupy as cp
from abc import ABC, abstractmethod


class Layer(ABC):
    @abstractmethod
    def forward(self, inputs: cp.ndarray) -> cp.ndarray:
        pass

    @property
    @abstractmethod
    def parameters(self) -> list[cp.ndarray] | None:
        pass

    @abstractmethod
    def get_d_loss_wrt_parameters(self, d_loss_wrt_outputs: cp.ndarray) -> cp.ndarray | None:
        pass

    @abstractmethod
    def get_d_loss_wrt_inputs(self, d_loss_wrt_outputs: cp.ndarray) -> cp.ndarray:
        pass

    @abstractmethod
    def backward(self, d_loss_wrt_outputs: cp.ndarray, lr: float = .001) -> cp.ndarray:
        pass

    def __call__(self, inputs: cp.ndarray) -> cp.ndarray:
        return self.forward(inputs)


class Dense(Layer):
    __weights: cp.ndarray
    __biases: cp.ndarray

    __cached_inputs: cp.ndarray
    __cached_outputs: cp.ndarray

    def __init__(self, input_size: cp.ndarray, output_size: cp.ndarray) -> None:
        self.__weights = (cp.random.randn(input_size, output_size)) * cp.sqrt(1 / (output_size + input_size))
        self.__biases = cp.zeros(output_size)

    def forward(self, inputs: cp.ndarray) -> cp.ndarray:
        self.__cached_inputs = inputs
        self.__cached_outputs = cp.dot(inputs, self.__weights) + self.__biases
        return self.__cached_outputs

    def get_d_loss_wrt_inputs(self, d_loss_wrt_outputs: cp.ndarray) -> cp.ndarray:
        return cp.dot(d_loss_wrt_outputs, self.__weights.T)

    @property
    def parameters(self) -> list[cp.ndarray]:
        return [self.__weights, self.__biases]

    def get_d_loss_wrt_parameters(self, d_loss_wrt_outputs: cp.ndarray) -> cp.ndarray:
        d_loss_wrt_weights = cp.dot(self.__cached_inputs.T, d_loss_wrt_outputs)
        d_loss_wrt_biases = cp.sum(d_loss_wrt_outputs)
        return [d_loss_wrt_weights, d_loss_wrt_biases]

    def backward(self, d_loss_wrt_outputs: cp.ndarray, lr: float = 0.001) -> cp.ndarray:
        d_loss_wrt_outputs /= d_loss_wrt_outputs.shape[1]
        d_loss_wrt_inputs = self.get_d_loss_wrt_inputs(d_loss_wrt_outputs)
        d_loss_wrt_weights, d_loss_wrt_biases = self.get_d_loss_wrt_parameters(d_loss_wrt_outputs)
        self.__biases -= lr * d_loss_wrt_biases
        self.__weights -= lr * d_loss_wrt_weights
        return d_loss_wrt_inputs


class ReLU(Layer):
    __cached_inputs: cp.ndarray

    def forward(self, inputs: cp.ndarray) -> cp.ndarray:
        self.__cached_inputs = inputs
        return cp.maximum(0, inputs)

    @property
    def parameters(self) -> list[cp.ndarray] | None:
        pass

    def get_d_loss_wrt_parameters(self, d_loss_wrt_outputs: cp.ndarray) -> cp.ndarray | None:
        pass

    def get_d_loss_wrt_inputs(self, d_loss_wrt_outputs: cp.ndarray) -> cp.ndarray:
        return (self.__cached_inputs > 0) * d_loss_wrt_outputs

    def backward(self, d_loss_wrt_outputs: cp.ndarray, lr: float = 0.01) -> cp.ndarray:
        return self.get_d_loss_wrt_inputs(d_loss_wrt_outputs)


class Softmax(Layer):
    __cached_outputs: cp.ndarray

    def forward(self, inputs: cp.ndarray) -> cp.ndarray:
        self.__cached_outputs = cp.exp(inputs) / (cp.sum(cp.exp(inputs), axis=1).reshape(inputs.shape[0], 1) + 0.001)
        return self.__cached_outputs

    @property
    def parameters(self) -> list[cp.ndarray] | None:
        pass

    def get_d_loss_wrt_parameters(self, d_loss_wrt_outputs: cp.ndarray) -> cp.ndarray | None:
        pass

    def get_d_loss_wrt_inputs(self, d_loss_wrt_outputs: cp.ndarray) -> cp.ndarray:
        return d_loss_wrt_outputs

    def backward(self, d_loss_wrt_outputs: cp.ndarray, lr: float = 0.01) -> cp.ndarray:
        #time.sleep(1)
        #out = self.__cached_outputs
        #jacobian = out.reshape(out.shape[0], 1, out.shape[1]).repeat(2, axis=1)
        #eye = cp.eye(out.shape[1]).reshape((1, out.shape[1], out.shape[1])).repeat(jacobian.shape[0], axis=0)
        #jacobian *= eye
        #jacobian -= cp.dot(out.T, out)
        #d_loss_wrt_outputs =\
        #    cp.matmul(jacobian, d_loss_wrt_outputs.reshape(out.shape[0], out.shape[1], 1)).reshape((out.shape[0], out.shape[1]))
        #return d_loss_wrt_outputs
        return self.get_d_loss_wrt_inputs(d_loss_wrt_outputs)
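Editor's note: Softmax.backward above leaves its Jacobian-based path commented out and simply passes the upstream gradient through. That is only valid because CategoricalCrossEntropy in src/loss.py already returns the combined softmax-plus-cross-entropy gradient (outputs - targets). As a hedged sketch (not part of the commit), a standalone softmax backward can be written as a batched Jacobian-vector product: for softmax outputs s and upstream gradient g, (J g)_i = s_i * (g_i - sum_j s_j * g_j) per row.

```python
# Editor's sketch, not part of the commit: standalone softmax backward.
import cupy as cp

def softmax_backward(cached_outputs: cp.ndarray, d_loss_wrt_outputs: cp.ndarray) -> cp.ndarray:
    s = cached_outputs          # softmax outputs, shape (batch, classes)
    g = d_loss_wrt_outputs      # upstream gradient, shape (batch, classes)
    dot = cp.sum(s * g, axis=1, keepdims=True)  # sum_j s_j * g_j for each row
    return s * (g - dot)        # Jacobian-vector product without building the full Jacobian
```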
13 src/loss.py (new file)
@@ -0,0 +1,13 @@
import cupy as cp
from abc import ABC, abstractmethod


class Loss(ABC):
    @abstractmethod
    def d_loss_wrt_inputs(self, outputs: cp.ndarray, targets: cp.ndarray) -> cp.ndarray:
        pass


class CategoricalCrossEntropy(Loss):
    def d_loss_wrt_inputs(self, outputs: cp.ndarray, targets: cp.ndarray) -> cp.ndarray:
        return outputs - targets
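Editor's note: the loss module only implements the gradient; `outputs - targets` is the standard gradient of cross-entropy taken through a softmax output with one-hot targets, which is why Softmax.backward can pass it through unchanged. The loss value itself is never computed in these commits; a minimal sketch of what it could look like (an assumption, with a small epsilon to avoid log(0)):

```python
# Editor's sketch, not part of the commit: categorical cross-entropy value.
import cupy as cp

def categorical_cross_entropy(outputs: cp.ndarray, targets: cp.ndarray, eps: float = 1e-7) -> float:
    clipped = cp.clip(outputs, eps, 1.0 - eps)              # guard against log(0)
    per_sample = -cp.sum(targets * cp.log(clipped), axis=1)  # one-hot targets select the true class
    return float(cp.mean(per_sample))                        # mean over the batch
```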
1 src/optimizer.py (new file)
@@ -0,0 +1 @@
import cupy as cp
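Editor's note: src/optimizer.py is only an import stub in these commits; the parameter update currently lives inline in Dense.backward. Purely as a hypothetical illustration of how that update could later be factored out into this module, a minimal SGD class:

```python
# Hypothetical sketch, not part of the commit: plain SGD over a layer's parameters.
import cupy as cp

class SGD:
    def __init__(self, lr: float = 0.001) -> None:
        self.lr = lr

    def step(self, parameters: list[cp.ndarray], gradients: list[cp.ndarray]) -> None:
        # Update in place so the layers keep referencing the same arrays.
        for p, g in zip(parameters, gradients):
            p -= self.lr * g
```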
103 src/test.py (new file)
@@ -0,0 +1,103 @@
import time
from layers import Dense, ReLU, Softmax
from loss import CategoricalCrossEntropy
import cupy as cp
import pandas as pd
from tqdm import tqdm


#layers = [
# Dense(2, 128),
# ReLU(),
# Dense(128, 128),
# ReLU(),
# Dense(128, 2),
# Softmax()
#]
##layers = [Dense(2, 2)]
#
#
#def forward(x):
# for layer in layers:
# x = layer.forward(x)
# return x
#
#
#SEQUENCE_LENGTH = 10000
#x1 = cp.random.random_integers(0, 1, SEQUENCE_LENGTH)
#x2 = cp.random.random_integers(0, 1, SEQUENCE_LENGTH)
#x = cp.array([x1, x2]).T
#y = cp.logical_xor(x1, x2)
#yt = cp.array([y == 0, y == 1]).T * 1.0
#
#out = forward(x)
#print(cp.sum(cp.argmax(out, axis=1) == y) * 100 / len(y))
#
#loss_function = CategoricalCrossEntropy()
#
#for i in range(100):
# #print(f"Epoch{i + 1}")
# output = forward(x)
# d_loss = loss_function.d_loss_wrt_inputs(output, yt)
# for layer in layers[::-1]:
# d_loss = layer.backward(d_loss)
#
#out = forward(x)
#print(cp.sum(cp.argmax(out, axis=1) == y) * 100 / len(y))







training_data_frame = pd.read_csv("./mnist_train.csv")
training_data_frame.head()

training_data = cp.array(training_data_frame)
y_train = training_data[:, 0]
x_train: cp.ndarray = training_data[:, 1:]
x_train = x_train.astype(float)
x_train /= cp.argmax(x_train)
del training_data
del training_data_frame


y_train_one_hot = cp.zeros(
    (y_train.size, y_train.max().item() + 1)
)
y_train_one_hot[cp.arange(y_train.size), y_train] = 1


layers = [
    Dense(784, 128),
    ReLU(),
    Dense(128, 128),
    ReLU(),
    Dense(128, 10),
    Softmax()
]


def forward(x):
    for layer in layers:
        x = layer.forward(x)
    return x


out = forward(x_train)
print(cp.sum(cp.argmax(out, axis=1) == y_train) * 100 / len(y_train))

loss_function = CategoricalCrossEntropy()

for i in range(99999):
    output = forward(x_train)
    time.sleep(1)
    print(f"Epoch {i}:")
    print(cp.sum(cp.argmax(output, axis=1) == y_train) * 100 / len(y_train))
    d_loss = loss_function.d_loss_wrt_inputs(output, y_train_one_hot)
    for layer in layers[::-1]:
        d_loss = layer.backward(d_loss, lr=0.0001)

out = forward(x_train)
print(cp.sum(cp.argmax(out, axis=1) == y_train) * 100 / len(y_train))
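Editor's note on the training script: cp.argmax returns the index of the largest element, not its value, so `x_train /= cp.argmax(x_train)` divides the pixels by an index; the conventional MNIST scaling divides by the maximum pixel value. (A similar question applies to Dense.backward, which appears to average by shape[1], the number of outputs, rather than the batch size shape[0].) A hedged sketch of the usual scaling, with stand-in data since the CSV diff is suppressed above:

```python
# Editor's sketch, an assumption about intent: scale pixels into [0, 1].
import cupy as cp

x_train = cp.random.randint(0, 256, size=(4, 784)).astype(float)  # stand-in for the CSV rows
x_train /= x_train.max()     # or x_train /= 255.0 for 8-bit pixel data
print(float(x_train.max()))  # 1.0
```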