Implementation of a Neural Network "from scratch" with NumPy¶

In [1]:
import math
import numpy as np
from typing import Tuple, List
In [2]:
class LayerInitializer:
    """
    Functions for layer weight initialization.
    """

    # He normal initialization
    @staticmethod
    def he_normal(size: Tuple[int], fan_in: int) -> np.array:
        """
        HE NORMAL INITIALIZATION
        Draws samples from a normal distribution centered at 0 with
        stddev = sqrt(2 / fan_in) where fan_in is the number of input units
        of the weight matrix.
        Parameters:
            - size: Tuple[int] (rows, columns)
                    shape of the initialized weight matrix
            - fan_in: int
                    number of input units of the weight matrix
        Returns:
            - np.array (rows, columns)
                    He normal initialized weight matrix
        Ref:
            https://arxiv.org/abs/1502.01852
        """
        return np.random.normal(0, math.sqrt(2 / fan_in), size = size)

    # Glorot / Xavier normal initialization
    @staticmethod
    def glorot_normal(size: Tuple[int], fan_in: int, fan_out: int) -> np.array:
        """
        GLOROT / XAVIER NORMAL INITIALIZATION
        Draws samples from a normal distribution centered at 0 with
        stddev = sqrt(2 / (fan_in + fan_out)) where fan_in is the number of
        input units and fan_out the number of output units of the weight
        matrix.
        Parameters:
            - size: Tuple[int] (rows, columns)
                    shape of the initialized weight matrix
            - fan_in: int
                    number of input units of the weight matrix
            - fan_out: int
                    number of output units of the weight matrix
        Returns:
            - np.array (rows, columns)
                    Glorot normal initialized weight matrix
        Ref:
            http://proceedings.mlr.press/v9/glorot10a.html
        """
        return np.random.normal(0, math.sqrt(2 / (fan_in + fan_out)), size = size)

    # Bias initialization
    @staticmethod
    def bias(size: Tuple[int]) -> np.array:
        """
        BIAS INITIALIZATION
        Initializes the bias vector / matrix with zeros.
        Parameters:
            - size: Tuple[int] (rows, columns)
                    shape of the initialized bias vector / matrix
        Returns:
            - np.array (rows, columns)
                    Zero initialized bias vector / matrix
        Ref:
            https://cs231n.github.io/neural-networks-2/
        """
        return np.zeros(shape = size)
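
A quick, optional sanity check of the initializers (a minimal sketch, assuming the LayerInitializer class from the cell above is in scope; not part of neuralnet.py): the empirical standard deviations should be close to sqrt(2 / fan_in) and sqrt(2 / (fan_in + fan_out)), respectively, and the bias should be all zeros.

# hypothetical sanity check, not part of neuralnet.py
W_he = LayerInitializer.he_normal((128, 64), fan_in = 64)
W_glorot = LayerInitializer.glorot_normal((128, 64), fan_in = 64, fan_out = 128)
b = LayerInitializer.bias((128, 1))
print(W_he.shape, W_he.std(), math.sqrt(2 / 64))         # empirical vs. theoretical stddev
print(W_glorot.shape, W_glorot.std(), math.sqrt(2 / 192)) # empirical vs. theoretical stddev
print(b.shape, b.sum())                                   # zero initialized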
In [3]:
class ActivationFunctions:
    """
    Layer activation functions.
    """

    # Rectified Linear Units
    @staticmethod
    def relu(x: np.array, derivative: bool = False) -> np.array:
        """
        RECTIFIED LINEAR UNITS
        ReLU activation function.
        Parameters:
            - x: np.array
                    input matrix to apply activation function to
            - derivative: bool
                    if set to 'True' returns the derivative instead
                    DEFAULT: False
        Returns:
            - np.array (same shape as x)
                    activated x / derivative of x
        Ref:
            https://en.wikipedia.org/wiki/Rectifier_(neural_networks)
        """
        if not derivative:
            return np.maximum(x, 0)
        else:
            return np.where(x > 0, 1, 0)

    # Sigmoid activation function
    @staticmethod
    def sigmoid(x: np.array, derivative: bool = False) -> np.array:
        """
        SIGMOID / LOGISTIC FUNCTION
        Sigmoid activation function.
        Parameters:
            - x: np.array
                    input matrix to apply activation function to
            - derivative: bool
                    if set to 'True' returns the derivative instead
                    DEFAULT: False
        Returns:
            - np.array (same shape as x)
                    activated x / derivative of x
        Refs:
            https://en.wikipedia.org/wiki/Sigmoid_function
            https://en.wikipedia.org/wiki/Activation_function
        """
        def f_sigmoid(x: np.array) -> np.array:
            return 1 / (1 + np.exp(-x))

        if not derivative:
            return f_sigmoid(x)
        else:
            return f_sigmoid(x) * (1 - f_sigmoid(x))

    # Softmax activation function
    @staticmethod
    def softmax(x: np.array, derivative: bool = False) -> np.array:
        """
        SOFTMAX FUNCTION
        Stable softmax activation function.
        Parameters:
            - x: np.array
                    input matrix to apply activation function to
            - derivative: bool
                    if set to 'True' raises a NotImplementedError since the
                    softmax derivative is not implemented
                    DEFAULT: False
        Returns:
            - np.array (same shape as x)
                    activated x
        Refs:
            https://en.wikipedia.org/wiki/Softmax_function
            https://eli.thegreenplace.net/2016/the-softmax-function-and-its-derivative/
        """
        if not derivative:
            n = np.exp(x - np.max(x, axis = 0, keepdims = True)) # stable softmax (shift each column / sample by its max)
            d = np.sum(n, axis = 0)
            return n / d
        else:
            raise NotImplementedError("Softmax derivative not implemented!")
            # https://stackoverflow.com/questions/54976533/derivative-of-softmax-function-in-python
            # xr = x.reshape((-1, 1))
            # return np.diagflat(x) - np.dot(xr, xr.T)
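
ReLU and sigmoid act element-wise, while softmax normalizes each column, i.e. inputs are expected with one sample per column. A minimal sketch to illustrate (assuming the ActivationFunctions class from the cell above is in scope; not part of neuralnet.py):

# hypothetical sanity check, not part of neuralnet.py
x = np.array([[ 1.0, -2.0],
              [ 0.5,  0.0],
              [-1.0,  3.0]])                            # 3 units, 2 samples
print(ActivationFunctions.relu(x))                      # negative entries clipped to 0
print(ActivationFunctions.relu(x, derivative = True))   # 1 where x > 0, else 0
print(ActivationFunctions.sigmoid(x))                   # values squashed into (0, 1)
print(ActivationFunctions.softmax(x).sum(axis = 0))     # every column sums to 1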
In [4]:
class LossFunctions:
    """
    Loss functions for neural net fitting.
    """

    # binary cross entropy loss
    @staticmethod
    def binary_cross_entropy(y_true: np.array, y_predicted: np.array) -> np.array:
        """
        BINARY CROSS ENTROPY LOSS
        Cross entropy loss for binary-class classification.
        L[BCE] = - p(i) * log(q(i)) - (1 - p(i)) * log(1 - q(i))
        where
            - p(i) is the true label
            - q(i) is the predicted sigmoid probability
        Parameters:
            - y_true: np.array (1, sample_size)
                    true label vector
            - y_predicted: np.array (1, sample_size)
                    the sigmoid probability
        Returns:
            - np.array (sample_size,)
                    loss for every given sample
        Ref:
            https://en.wikipedia.org/wiki/Cross_entropy
        """
        losses = []
        for i in range(y_true.shape[1]):
            ## stable BCE
            losses.append(float(-1 * (y_true[:, i] * np.log(y_predicted[:, i] + 1e-7) + (1 - y_true[:, i]) * np.log(1 - y_predicted[:, i] + 1e-7))))
            ## unstable BCE
            # losses.append(float(-1 * (y_true[:, i] * np.log(y_predicted[:, i]) + (1 - y_true[:, i]) * np.log(1 - y_predicted[:, i]))))
        return np.array(losses)

    # categorical cross entropy loss
    @staticmethod
    def categorical_cross_entropy(y_true: np.array, y_predicted: np.array) -> np.array:
        """
        CATEGORICAL CROSS ENTROPY LOSS
        Cross entropy loss for binary- and multi-class classification.
        L[CCE] = - sum[from i = 1 to n]( p(i) * log(q(i)) )
        where
            - p(i) is the true label
            - q(i) is the predicted softmax probability
            - n is the number of classes
        Parameters:
            - y_true: np.array (n_classes, sample_size)
                    one-hot encoded true label vector
            - y_predicted: np.array (n_classes, sample_size)
                    the softmax probabilities
        Returns:
            - np.array (sample_size,)
                    loss for every given sample
        Ref:
            https://en.wikipedia.org/wiki/Cross_entropy
        """
        losses = []
        for i in range(y_true.shape[1]):
            ## stable CCE
            # losses.append(float(-1 * np.sum(y_true[:, i] * np.log(y_predicted[:, i] + 1e-7))))
            ## unstable CCE
            losses.append(float(-1 * np.sum(y_true[:, i] * np.log(y_predicted[:, i]))))

        return np.array(losses)
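
Both loss functions expect inputs of shape (classes, samples), i.e. one column per sample, and return one loss value per sample. A minimal sketch with hypothetical toy labels (assuming the LossFunctions class from the cell above is in scope; not part of neuralnet.py):

# hypothetical sanity check, not part of neuralnet.py
y_true_bin = np.array([[1.0, 0.0, 1.0]])                # (1, 3): three samples
y_pred_bin = np.array([[0.9, 0.2, 0.6]])                # sigmoid outputs
print(LossFunctions.binary_cross_entropy(y_true_bin, y_pred_bin))       # approx. [0.105 0.223 0.511]

y_true_cat = np.array([[1.0, 0.0],
                       [0.0, 1.0],
                       [0.0, 0.0]])                     # (3, 2): one-hot labels, two samples
y_pred_cat = np.array([[0.7, 0.2],
                       [0.2, 0.5],
                       [0.1, 0.3]])                     # softmax outputs
print(LossFunctions.categorical_cross_entropy(y_true_cat, y_pred_cat))  # approx. [-log(0.7) -log(0.5)]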
In [5]:
class NeuralNetwork:
    """
    Implementation of a classic feed-forward neural network that is trained via
    backpropagation. Adopts a Keras-like interface for convenient usage (see
    https://michabirklbauer.github.io/neuralnet for examples).
    """

    # constructor
    def __init__(self, input_size: int):
        """
        CONSTRUCTOR
        Initializes the neural network model.
        Parameters:
            - input_size: int
                    nr. of features in the training data
        Returns:
            - None
        Example usage:
            NN = NeuralNetwork(data.shape[1])
        """
        self.input_size = input_size
        self.architecture = []
        self.layers = []

    # adding layers
    def add_layer(self, units: int, activation: str = "relu", initialization: str = None) -> None:
        """
        LAYER MANAGEMENT
        Construct the neural network architecture by adding different layers.
        Parameters:
            - units: int
                    nr. of units in the layer
            - activation: str, one of ("relu", "sigmoid", "softmax")
                    activation function of the layer
                    DEFAULT: "relu"
            - initialization: str, one of ("he", "glorot")
                    weight initialization to use
                    DEFAULT: None, "relu" layers are 'he normal' initialized,
                                   all other layers are 'glorot normal'
                                   initialized
        Returns:
            - None
        Example usage:
            NN = NeuralNetwork(data.shape[1])
            NN.add_layer(16, "relu", "glorot")
            NN.add_layer(8)
            NN.add_layer(1, "sigmoid")
        """
        if initialization is None:
            if activation == "relu":
                layer_init = "he"
            else:
                layer_init = "glorot"
        else:
            layer_init = initialization

        self.architecture.append({"units": units, "activation": activation, "init": layer_init})

    # compiling model
    def compile(self, loss: str = "categorical crossentropy") -> None:
        """
        MODEL INITIALIZATION
        Initializes all parameters of the neural network architecture and
        prepares the model for training.
        Parameters:
            - loss: str, one of ("binary crossentropy", "categorical crossentropy")
                    the loss function that should be used for training
                    DEFAULT: "categorical crossentropy"
        Returns:
            - None
        Example usage:
            NN = NeuralNetwork(data.shape[1])
            NN.add_layer(16, "relu", "glorot")
            NN.add_layer(8)
            NN.add_layer(1, "sigmoid")
            NN.compile("binary crossentropy")
        """
        self.loss = loss

        # initialize all layer weights and biases
        for i in range(len(self.architecture)):
            units = self.architecture[i]["units"]
            activation = self.architecture[i]["activation"]
            init = self.architecture[i]["init"]

            units_previous_layer = self.input_size
            if i > 0:
                units_previous_layer = self.architecture[i - 1]["units"]

            if init == "he":
                W = LayerInitializer.he_normal((units, units_previous_layer), fan_in = units_previous_layer)
                b = LayerInitializer.bias((units, 1))
            elif init == "glorot":
                # fan_in: size of the previous layer, fan_out: size of this layer
                W = LayerInitializer.glorot_normal((units, units_previous_layer), fan_in = units_previous_layer, fan_out = units)
                b = LayerInitializer.bias((units, 1))
            else:
                raise NotImplementedError("Layer initialization '" + init + "' not implemented!")

            self.layers.append({"W": W, "b": b, "activation": activation})

    # forward propagation
    def __forward_propagation(self, data: np.array) -> None:
        """
        FORWARD PROPAGATION (INTERNAL)
        Internal function calculating the forward pass of A(Wx + b).
            - The result of 'Wx + b' (L) is stored in self.layers[layer]["L"]
            - The result of 'Activation(L)' (A) is stored in self.layers[layer]["A"]
        Parameters:
            - data: np.array
                    input data for the forward pass
        Returns:
            - None, "L" and "A" are set in the layer dictionary, to retrieve the
                    last layer output call 'self.layers[-1]["A"]'
        """

        for i in range(len(self.layers)):

            if i == 0:
                A = data
            else:
                A = self.layers[i - 1]["A"]

            # Wx + b where x is the input data for the first layer and otherwise
            # the output (A) of the previous layer
            self.layers[i]["L"] = self.layers[i]["W"].dot(A) + self.layers[i]["b"]
            if self.layers[i]["activation"] == "relu":
                self.layers[i]["A"] = ActivationFunctions.relu(self.layers[i]["L"])
            elif self.layers[i]["activation"] == "sigmoid":
                self.layers[i]["A"] = ActivationFunctions.sigmoid(self.layers[i]["L"])
            elif self.layers[i]["activation"] == "softmax":
                self.layers[i]["A"] = ActivationFunctions.softmax(self.layers[i]["L"])
            else:
                raise NotImplementedError("Activation function '" + self.layers[i]["activation"] + "' not implemented!")

    # back propagation
    def __back_propagation(self, data: np.array, target: np.array, learning_rate: float = 0.1) -> float:
        """
        BACK PROPAGATION (INTERNAL)
        Internal function for learning layer weights and biases using gradient
        descent and back propagation.
        Parameters:
            - data: np.array
                    input data
            - target: np.array
                    class labels of the input data
            - learning_rate: float
                    learning rate / how far to step in the direction of the
                    negative gradient
                    DEFAULT: 0.1
        Returns:
            - float
                    loss of the current forward pass
        """
        # forward pass
        self.__forward_propagation(data)

        output = self.layers[-1]["A"]
        batch_size = data.shape[1]
        loss = 0

        # calculate loss of the current forward pass
        if self.loss == "categorical crossentropy":
            losses = LossFunctions.categorical_cross_entropy(y_true = target, y_predicted = output)
            # reduction by sum over batch size
            loss = float(np.sum(losses) / batch_size)
        elif self.loss == "binary crossentropy":
            losses = LossFunctions.binary_cross_entropy(y_true = target, y_predicted = output)
            # reduction by sum over batch size
            loss = float(np.sum(losses) / batch_size)
        else:
            raise NotImplementedError("Loss function '" + self.loss + "' not implemented!")

        # calculate and back-propagate the derivative of the loss w.r.t. the
        # output activation function
        # this implementation supports CCE + Softmax and BCE + Sigmoid in the
        # output layer
        if self.loss == "categorical crossentropy" and self.layers[-1]["activation"] == "softmax":
            # for categorical cross entropy loss the derivative of softmax simplifies to
            # P(i) - Y(i)
            # where P(i) is the softmax output and Y(i) is the true label
            # https://www.ics.uci.edu/~pjsadows/notes.pdf
            # https://math.stackexchange.com/questions/945871/derivative-of-softmax-loss-function
            previous_layer_activation = data.T if len(self.layers) == 1 else self.layers[len(self.layers) - 2]["A"].T
            dL = self.layers[-1]["A"] - target
            dW = dL.dot(previous_layer_activation) / batch_size
            db = np.reshape(np.sum(dL, axis = 1), (-1, 1)) / batch_size

            # parameter tracking
            previous_dL = np.copy(dL)
            previous_W = np.copy(self.layers[-1]["W"])

            # update
            self.layers[-1]["W"] -= learning_rate * dW
            self.layers[-1]["b"] -= learning_rate * db
        elif self.loss == "binary crossentropy" and self.layers[-1]["activation"] == "sigmoid":
            # for binary cross entropy loss the derivative of the loss function is
            # L' = -1 * (Y(i) / P(i) - (1 - Y(i)) / (1 - P(i)))
            # where P(i) is the sigmoid output and Y(i) is the true label
            # and we multiply that with the derivative of the sigmoid function [1]
            # https://math.stackexchange.com/questions/2503428/derivative-of-binary-cross-entropy-why-are-my-signs-not-right
            previous_layer_activation = data.T if len(self.layers) == 1 else self.layers[len(self.layers) - 2]["A"].T
            # [1]
            # A = np.clip(self.layers[-1]["A"], 1e-7, 1 - 1e-7)
            # derivative_loss = -1 * np.divide(target, A) + np.divide(1 - target, 1 - A)
            # dL = derivative_loss * ActivationFunctions.sigmoid(self.layers[-1]["L"], derivative = True)
            # alternatively we can directly simplify the derivative of the binary cross entropy loss
            # with sigmoid activation function to
            # P(i) - Y(i)
            # where P(i) is the sigmoid output and Y(i) is the true label
            # done in [2]
            # https://math.stackexchange.com/questions/4227931/what-is-the-derivative-of-binary-cross-entropy-loss-w-r-t-to-input-of-sigmoid-fu
            # [2]
            dL = (self.layers[-1]["A"] - target) / batch_size
            dW = dL.dot(previous_layer_activation) / batch_size
            db = np.reshape(np.sum(dL, axis = 1), (-1, 1)) / batch_size

            # parameter tracking
            previous_dL = np.copy(dL)
            previous_W = np.copy(self.layers[-1]["W"])

            # update
            self.layers[-1]["W"] -= learning_rate * dW
            self.layers[-1]["b"] -= learning_rate * db
        else:
            raise NotImplementedError("The combination of '" + self.loss + " loss' and '" + self.layers[-1]["activation"] + " activation' is not implemented!")

        # back propagation through the remaining hidden layers
        for i in reversed(range(len(self.layers) - 1)):

            if i == 0:
                if self.layers[i]["activation"] == "relu":
                    dL = previous_W.T.dot(previous_dL) * ActivationFunctions.relu(self.layers[i]["L"], derivative = True)
                    dW = dL.dot(data.T) / batch_size
                    db = np.reshape(np.sum(dL, axis = 1), (-1, 1)) / batch_size
                elif self.layers[i]["activation"] == "sigmoid":
                    dL = previous_W.T.dot(previous_dL) * ActivationFunctions.sigmoid(self.layers[i]["L"], derivative = True)
                    dW = dL.dot(data.T) / batch_size
                    db = np.reshape(np.sum(dL, axis = 1), (-1, 1)) / batch_size
                else:
                    raise NotImplementedError("Activation function '" + self.layers[i]["activation"] + "' not implemented for hidden layers!")

                # parameter tracking
                previous_dL = np.copy(dL)
                previous_W = np.copy(self.layers[i]["W"])

                #update
                self.layers[i]["W"] -= learning_rate * dW
                self.layers[i]["b"] -= learning_rate * db
            else:
                if self.layers[i]["activation"] == "relu":
                    dL = previous_W.T.dot(previous_dL) * ActivationFunctions.relu(self.layers[i]["L"], derivative = True)
                    dW = dL.dot(self.layers[i - 1]["A"].T) / batch_size
                    db = np.reshape(np.sum(dL, axis = 1), (-1, 1)) / batch_size
                elif self.layers[i]["activation"] == "sigmoid":
                    dL = previous_W.T.dot(previous_dL) * ActivationFunctions.sigmoid(self.layers[i]["L"], derivative = True)
                    dW = dL.dot(self.layers[i - 1]["A"].T) / batch_size
                    db = np.reshape(np.sum(dL, axis = 1), (-1, 1)) / batch_size
                else:
                    raise NotImplementedError("Activation function '" + self.layers[i]["activation"] + "' not implemented for hidden layers!")

                # parameter tracking
                previous_dL = np.copy(dL)
                previous_W = np.copy(self.layers[i]["W"])

                #update
                self.layers[i]["W"] -= learning_rate * dW
                self.layers[i]["b"] -= learning_rate * db

        return loss

    # neural network architecture summary
    def summary(self) -> None:
        """
        MODEL SUMMARY
        Print a summary of the neural network architecture.
        Parameters:
            - None
        Returns:
            - None, prints a summary of the neural network architecture to
                    stdout
        Example usage:
            NN.summary()
        """
        print("---- Model Summary ----")
        for i, layer in enumerate(self.layers):
            print("Layer " + str(i + 1) + ": " + layer["activation"])
            if "L" in layer:
                print("W: " + str(layer["W"].shape) + " " +
                      "b: " + str(layer["b"].shape) + " " +
                      "L: " + str(layer["L"].shape) + " " +
                      "A: " + str(layer["A"].shape))
            else:
                print("W: " + str(layer["W"].shape) + " " +
                      "b: " + str(layer["b"].shape))
            print("Trainable parameters: " + str(
                layer["W"].shape[0] * layer["W"].shape[1] +
                layer["b"].shape[0] * layer["b"].shape[1]))

    # train neural network on data
    def fit(self, X: np.array, y: np.array, epochs: int = 100, batch_size: int = 32, learning_rate: float = 0.1, verbose: int = 1) -> List[float]:
        """
        TRAIN MODEL
        Train the neural network.
        Parameters:
            - X: np.array (samples, features)
                    input data to train on
            - y: np.array (samples, n_labels) or (samples,)
                    labels of the input data (one-hot encoded for multi-class
                    classification)
            - epochs: int
                    how many iterations to train
                    DEFAULT: 100
            - batch_size: int
                    how many samples to use per backward pass
                    DEFAULT: 32
            - learning_rate: float
                    learning rate / how far to step in the direction of the
                    negative gradient
                    DEFAULT: 0.1
            - verbose: int, one of (0, 1) / bool
                    print information for every epoch
                    DEFAULT: 1 (True)
        Returns:
            - List[float]
                    loss history over all epochs
        Example usage:
            NN.fit(data_train, labels_train)
        """
        # reshaping inputs
        if y.ndim == 1:
            y = np.reshape(y, (-1, 1))

        data = X.T
        target = y.T
        sample_size = data.shape[1]

        history = []

        # train network
        for i in range(epochs):
            if verbose:
                print("Training epoch " + str(i + 1) + "...")
            # generate random batches of size batch_size
            idx = np.random.choice(sample_size, sample_size, replace = False)
            batches = np.array_split(idx, math.ceil(sample_size / batch_size))
            batch_losses = []
            for batch in batches:
                current_data = data[:, batch]
                current_target = target[:, batch]
                batch_loss = self.__back_propagation(current_data, current_target, learning_rate = learning_rate)
                batch_losses.append(batch_loss)
            history.append(np.mean(batch_losses))
            if verbose:
                print("Current loss: ", np.mean(batch_losses))
                print("Epoch " + str(i + 1) + " done!")

        print("Training finished after epoch " + str(epochs) + " with a loss of " + str(history[-1]) + ".")

        return history

    # predict data with fitted neural network
    def predict(self, X: np.array) -> np.array:
        """
        GENERATE PREDICTIONS
        Predict labels for the given input data.
        Parameters:
            - X: np.array (samples, features) or (features,)
                    input data to predict
        Returns:
            - np.array
                    predictions
        Example usage:
            NN.predict(data_test)
        """
        if X.ndim == 1:
            X = np.reshape(X, (1, -1))

        self.__forward_propagation(X.T)

        return self.layers[-1]["A"].T
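
Before moving on to real datasets, a quick end-to-end check on synthetic data can verify that the pieces fit together, i.e. that the training loss decreases over the epochs. This is a minimal sketch with hypothetical toy data and illustrative hyperparameters, not part of neuralnet.py:

# hypothetical end-to-end check, not part of neuralnet.py
rng = np.random.default_rng(0)
X_toy = rng.normal(size = (200, 2))
y_toy = (X_toy[:, 0] * X_toy[:, 1] > 0).astype(int)    # XOR-like rule on the two features

toy_net = NeuralNetwork(input_size = 2)
toy_net.add_layer(8, "relu")
toy_net.add_layer(1, "sigmoid")
toy_net.compile(loss = "binary crossentropy")
toy_hist = toy_net.fit(X_toy, y_toy, epochs = 200, batch_size = 16, learning_rate = 0.5, verbose = 0)
print("first / last epoch loss:", toy_hist[0], toy_hist[-1])
print("training accuracy:", np.mean(np.round(toy_net.predict(X_toy)).flatten() == y_toy))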

Example Usage of neuralnet.py / class NeuralNetwork¶

Multi-Class Classification¶

Dataset: MNIST¶

In [ ]:
!wget https://raw.githubusercontent.com/michabirklbauer/neuralnet/master/data.zip
In [6]:
from zipfile import ZipFile

with ZipFile("data.zip") as f:
    f.extractall()
In [7]:
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
In [8]:
data = pd.read_csv("multiclass_train.csv")
train, test = train_test_split(data, test_size = 0.3)
train_data = train.loc[:, train.columns != "label"].to_numpy() / 255
train_target = train["label"].to_numpy()
test_data = test.loc[:, test.columns != "label"].to_numpy() / 255
test_target = test["label"].to_numpy()
In [9]:
train
Out[9]:
label pixel0 pixel1 pixel2 pixel3 pixel4 pixel5 pixel6 pixel7 pixel8 ... pixel774 pixel775 pixel776 pixel777 pixel778 pixel779 pixel780 pixel781 pixel782 pixel783
25164 7 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
11904 9 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
37833 1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
6101 5 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
25019 3 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
21390 7 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
7601 3 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
224 1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
37582 4 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
12926 2 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

29400 rows × 785 columns

In [10]:
one_hot = OneHotEncoder(sparse = False, categories = "auto")
train_target = one_hot.fit_transform(train_target.reshape(-1, 1))
test_target = one_hot.transform(test_target.reshape(-1, 1))
In [11]:
NN = NeuralNetwork(input_size = train_data.shape[1])
NN.add_layer(32, "relu")
NN.add_layer(16, "relu")
NN.add_layer(10, "softmax")
NN.compile(loss = "categorical crossentropy")
NN.summary()
---- Model Summary ----
Layer 1: relu
W: (32, 784) b: (32, 1)
Trainable parameters: 25120
Layer 2: relu
W: (16, 32) b: (16, 1)
Trainable parameters: 528
Layer 3: softmax
W: (10, 16) b: (10, 1)
Trainable parameters: 170
In [12]:
hist = NN.fit(train_data, train_target, epochs = 30, batch_size = 16, learning_rate = 0.05)
Training epoch 1...
Current loss:  0.43747370824596604
Epoch 1 done!
Training epoch 2...
Current loss:  0.21528007156258966
Epoch 2 done!
Training epoch 3...
Current loss:  0.16742503623911392
Epoch 3 done!
Training epoch 4...
Current loss:  0.13877936553368508
Epoch 4 done!
Training epoch 5...
Current loss:  0.12099309045421619
Epoch 5 done!
Training epoch 6...
Current loss:  0.1072971880634624
Epoch 6 done!
Training epoch 7...
Current loss:  0.09396355017990504
Epoch 7 done!
Training epoch 8...
Current loss:  0.08720308198194518
Epoch 8 done!
Training epoch 9...
Current loss:  0.07927159779935378
Epoch 9 done!
Training epoch 10...
Current loss:  0.07284107143112058
Epoch 10 done!
Training epoch 11...
Current loss:  0.06600162705624461
Epoch 11 done!
Training epoch 12...
Current loss:  0.06342602649693302
Epoch 12 done!
Training epoch 13...
Current loss:  0.05783998850656874
Epoch 13 done!
Training epoch 14...
Current loss:  0.05052314129523882
Epoch 14 done!
Training epoch 15...
Current loss:  0.04563600268741524
Epoch 15 done!
Training epoch 16...
Current loss:  0.04470639462592896
Epoch 16 done!
Training epoch 17...
Current loss:  0.043506537043299306
Epoch 17 done!
Training epoch 18...
Current loss:  0.03815045738567615
Epoch 18 done!
Training epoch 19...
Current loss:  0.038454017529732515
Epoch 19 done!
Training epoch 20...
Current loss:  0.034033571538281876
Epoch 20 done!
Training epoch 21...
Current loss:  0.03033063122611392
Epoch 21 done!
Training epoch 22...
Current loss:  0.02789381646483783
Epoch 22 done!
Training epoch 23...
Current loss:  0.02688368926764838
Epoch 23 done!
Training epoch 24...
Current loss:  0.02944480698302673
Epoch 24 done!
Training epoch 25...
Current loss:  0.02519994251217897
Epoch 25 done!
Training epoch 26...
Current loss:  0.02679484096626338
Epoch 26 done!
Training epoch 27...
Current loss:  0.01805071452172742
Epoch 27 done!
Training epoch 28...
Current loss:  0.021675299545706767
Epoch 28 done!
Training epoch 29...
Current loss:  0.027434799817775905
Epoch 29 done!
Training epoch 30...
Current loss:  0.024449728356841036
Epoch 30 done!
Training finished after epoch 30 with a loss of 0.024449728356841036.
In [13]:
def plot_history(hist):
    plt.plot(hist)
    plt.title("Model Loss")
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.show()
    
plot_history(hist);
In [14]:
train_predictions = np.argmax(NN.predict(train_data), axis = 1)
print("Training accuracy: ", accuracy_score(train["label"].to_numpy(), train_predictions))
test_predictions = np.argmax(NN.predict(test_data), axis = 1)
print("Test accuracy: ", accuracy_score(test["label"].to_numpy(), test_predictions))
Training accuracy:  0.9894897959183674
Test accuracy:  0.9495238095238095
In [15]:
def predict_image(index):
    current_image = test_data[index, :]
    prediction = np.argmax(NN.predict(current_image), axis = 1)
    label = test["label"].to_numpy()[index]
    print("Prediction: ", prediction)
    print("Label: ", label)
    
    current_image = current_image.reshape((28, 28)) * 255
    plt.gray()
    plt.imshow(current_image, interpolation = "nearest")
    plt.show()
In [16]:
predict_image(1)
Prediction:  [5]
Label:  5
In [17]:
predict_image(2)
Prediction:  [4]
Label:  4
In [18]:
predict_image(3)
Prediction:  [2]
Label:  2

Binary-Class Classification¶

Dataset: Breast Cancer Wisconsin (Diagnostic) Data Set¶

In [19]:
data = pd.read_csv("binaryclass_train.csv", header = None)
data["label"] = data[1].apply(lambda x: 1 if x == "M" else 0)
train, test = train_test_split(data, test_size = 0.3)
train_data = train.loc[:, ~train.columns.isin([0, 1, "label"])].to_numpy()
train_target = train["label"].to_numpy()
test_data = test.loc[:, ~test.columns.isin([0, 1, "label"])].to_numpy()
test_target = test["label"].to_numpy()
In [20]:
train
Out[20]:
0 1 2 3 4 5 6 7 8 9 ... 23 24 25 26 27 28 29 30 31 label
361 901041 B 13.30 21.57 85.24 546.1 0.08582 0.06373 0.03344 0.02424 ... 29.20 92.94 621.2 0.1140 0.16670 0.12120 0.05614 0.2637 0.06658 0
186 874217 M 18.31 18.58 118.60 1041.0 0.08588 0.08468 0.08169 0.05814 ... 26.36 139.20 1410.0 0.1234 0.24450 0.35380 0.15710 0.3206 0.06938 1
199 877500 M 14.45 20.22 94.49 642.7 0.09872 0.12060 0.11800 0.05980 ... 30.12 117.90 1044.0 0.1552 0.40560 0.49670 0.18380 0.4753 0.10130 1
389 90312 M 19.55 23.21 128.90 1174.0 0.10100 0.13180 0.18560 0.10210 ... 30.44 142.00 1313.0 0.1251 0.24140 0.38290 0.18250 0.2576 0.07602 1
388 903011 B 11.27 15.50 73.38 392.0 0.08365 0.11140 0.10070 0.02757 ... 18.93 79.73 450.0 0.1102 0.28090 0.30210 0.08272 0.2157 0.10430 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
430 907914 M 14.90 22.53 102.10 685.0 0.09947 0.22250 0.27330 0.09711 ... 27.57 125.40 832.7 0.1419 0.70900 0.90190 0.24750 0.2866 0.11550 1
371 9012568 B 15.19 13.21 97.65 711.8 0.07963 0.06934 0.03393 0.02657 ... 15.73 104.50 819.1 0.1126 0.17370 0.13620 0.08178 0.2487 0.06766 0
465 9113239 B 13.24 20.13 86.87 542.9 0.08284 0.12230 0.10100 0.02833 ... 25.50 115.00 733.5 0.1201 0.56460 0.65560 0.13570 0.2845 0.12490 0
60 858970 B 10.17 14.88 64.55 311.9 0.11340 0.08061 0.01084 0.01290 ... 17.45 69.86 368.6 0.1275 0.09866 0.02168 0.02579 0.3557 0.08020 0
426 907409 B 10.48 14.98 67.49 333.6 0.09816 0.10130 0.06335 0.02218 ... 21.57 81.41 440.4 0.1327 0.29960 0.29390 0.09310 0.3020 0.09646 0

398 rows × 33 columns

In [21]:
NN = NeuralNetwork(input_size = train_data.shape[1])
NN.add_layer(16, "relu")
NN.add_layer(16, "relu")
NN.add_layer(1, "sigmoid")
NN.compile(loss = "binary crossentropy")
NN.summary()
---- Model Summary ----
Layer 1: relu
W: (16, 30) b: (16, 1)
Trainable parameters: 496
Layer 2: relu
W: (16, 16) b: (16, 1)
Trainable parameters: 272
Layer 3: sigmoid
W: (1, 16) b: (1, 1)
Trainable parameters: 17
In [22]:
hist = NN.fit(train_data, train_target, epochs = 1000, batch_size = 32, learning_rate = 0.01, verbose = 0)
Training finished after epoch 1000 with a loss of 0.166389194481977.
In [23]:
plot_history(hist);
In [24]:
train_predictions = np.round(NN.predict(train_data))
print("Training accuracy: ", accuracy_score(train["label"].to_numpy(), train_predictions))
test_predictions = np.round(NN.predict(test_data))
print("Test accuracy: ", accuracy_score(test["label"].to_numpy(), test_predictions))
Training accuracy:  0.8869346733668342
Test accuracy:  0.8888888888888888