#!/bin/python3

# Numpy arrays are significantly faster than python lists,
# they also ease the matrix operations we will need {multiplication}
import numpy as np
# Pickle is used for object serialization,
# it will allow us to save our already trained models to disk
import pickle
# We will need to deep copy NN inputs to append a bias
# without ruining the sample
from copy import deepcopy


def get_xor():
    r = [
        {'in': [0, 0], 'out': [0]},
        {'in': [1, 0], 'out': [1]},
        {'in': [0, 1], 'out': [1]},
        {'in': [1, 1], 'out': [0]},
    ]
    return r


# Dataset:
# https://www.kaggle.com/datasets/swaroopmeher/boston-weather-2013-2023
def get_pressure_data():
    import csv
    r = []
    with open('boston_weather_data.csv', mode='r') as file:
        reader = csv.reader(file)
        for row in reader:
            r.append(row[7])
    return r


# Used for transforming a series to I/O batches of the form:
#   { in: [a], out: a }
# `.in` is always consulted by our network,
# `.out` is used for training only; prediction works without it
def batch_data(l : list):
    # 'ino' as in 'arduINO', modelled after their map()
    # (the name `map()` is already in use by a python builtin used for constructing iterators)
    def ino_map(x, old_min, old_max, new_min, new_max):
        return ((x - old_min) / (old_max - old_min)) * (new_max - new_min) + new_min

    def normalize(x):
        return ino_map(x / (1000 * 2), 0.48, 0.53, 0.0, 1)

    def denormalize(x):
        return ino_map(x, 0.0, 1, 0.48, 0.53) * (1000 * 2)

    # number of elements used for constructing a single batch
    BATCH_SIZE = 8

    d = []
    for i in [l[i:i + BATCH_SIZE] for i in range(1, len(l), BATCH_SIZE)]:
        try:
            h = dict()
            h['in'] = [normalize(float(f)) for f in i[:BATCH_SIZE-1]]
            h['out'] = normalize(float(i[BATCH_SIZE-1]))
            d.append(h)
        except Exception as e:
            pass
            #print(e)

    return {'data': d, 'normalize': normalize, 'denormalize': denormalize}


# Activation function, which could be swapped out arbitrarily
def sigmoid_activation(x):
    return 1/(1+np.exp(-x))

# Derivative of sigmoid_activation(x)
# NOTE: the real deactivation would be `sigmoid_activation(x) * (1 - sigmoid_activation(x))`,
#       however, we will only be passing in values which are already activated!
def sigmoid_deactivation(a):
    return a * (1 - a)
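
# A quick, optional sanity check of the NOTE above (a hypothetical helper that
# is not used by the network itself): the derivative computed from the raw
# input must match sigmoid_deactivation() applied to the already-activated value.
def _check_sigmoid_derivative(x=0.5):
    a = sigmoid_activation(x)
    from_raw = sigmoid_activation(x) * (1 - sigmoid_activation(x))
    from_activated = sigmoid_deactivation(a)
    assert np.isclose(from_raw, from_activated)
    return from_activated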


class NN:
    learning_rate = 0
    # Architecture:
    # {
    #   ARCHITECTURE := [3, 4, 4, 1]
    #
    #   O X O X O \
    #   O X O X O - O
    #   O X O X O /
    #    \ O X O /
    #
    #   // This will result in the following weight matrix shapes
    #   // (before the bias nodes are added):
    #   //   (3, 4), (4, 4), (4, 1)
    #   // NOTE: notice how one layer is "missing",
    #   //       as we dont need weights for the input layer
    # }
    architecture = None
    # [Layer]
    layers = None

    # Each layer will be stored as its own object.
    # The reasons for this are that:
    #   Storing each neuron individually would keep us from using
    #   numpy matrix operations on layers.
    #   While speed is not a top priority here, it would lead to much more
    #   verbose and confusing code.
    #   And, due to Python's primitive/reference type differentiation,
    #   we would also have trouble with using for loops.
    #   Storing layers without a container class would result in a bunch of
    #   parallel arrays. Now, this could be beneficial for performance
    #   in a low level environment when interfacing with the GPU directly,
    #   but we are writing Python...
    #   Anyways, it should go without saying that parallel arrays are ugly.
    class Layer:
        def __init__(self, input_size : int, neuron_count : int, normalizer : int):
            self.weights = np.random.rand(input_size, neuron_count)
            # Normalizing the weights (based on the neuron count) is
            # something I've seen in an example.
            # Being a hyper-parameter, it's hard to say whether it actually
            # helps or not. What I can tell is that on simple problems {xor}
            # it is counterproductive.
            self.weights = self.weights / np.sqrt(normalizer)

            self.activation = sigmoid_activation
            self.deactivation = sigmoid_deactivation

            # Most sources save the activations, which is fundamentally the same
            # as what we are doing, since the activations of this layer are
            # the inputs of the next. However, due to the input layer being "virtual",
            # we are left with a choice either way: we either store the activations
            # and shoehorn in the initial input, or store the inputs and shoehorn
            # in the last activation. With such a layer based program architecture
            # I find the latter more elegant, as the last activation is also the
            # overall output of the network, and having a `prediction` variable
            # does not hurt code quality: https://commadot.com/wtf-per-minute/
            self.inputs = None

            self.deltas = None # buffer of change values as calculated by backpropagation

        def __str__(self): # for printing our network architecture
            r = f" \033[35m{self.weights.shape}\033[0m\n{self.weights}\n"
            return r

        def calculate_deltas(self, error : np.ndarray):
            # Notice how this is not a dot product,
            # we actually want element-wise multiplication,
            # ie. weighting by the error
            return error * self.deactivation(self.inputs)

        def predict(self):
            return self.activation(
                np.dot(self.inputs, self.weights)
            )

    def __init__(self, architecture : list, learning_rate : float):
        self.architecture = architecture
        self.learning_rate = learning_rate

        self.layers = []
        # The neuron count of the previous layer tells us
        # the input size, this gets combined with the current one
        # NOTE: `+ 1` is always the addition of a bias input node
        for i in np.arange(0, len(architecture) - 2):
            l = self.Layer(architecture[i] + 1, architecture[i+1] + 1, architecture[i])
            self.layers.append(l)
        # The last layer does not get a bias
        l = self.Layer(architecture[-2] + 1, architecture[-1], architecture[-2])
        self.layers.append(l)

    def __str__(self): # also for printing
        r = f"\033[1;34mNeural Network @ {id(self)}:\033[0m"
        for i, l in enumerate(self.layers):
            r += f"\n\033[34m--- Layer {i}:\033[0m\n"
            r += str(l)
        r += "\n\033[1;34m###\033[0m"
        return r

    # Boring serialization stuff
    def save(self):
        from datetime import datetime
        with open(str(datetime.now()).replace(' ', '=') + ".pkl", 'wb') as f:
            pickle.dump(self, f)

    @staticmethod
    def load(id_ : str): # Constructor from file
        with open(id_ + ".pkl", 'rb') as f:
            o = pickle.load(f)
        return o

    # This internal function is used both by `train()` and `predict()`
    def predict_(self, data : np.ndarray):
        self.layers[0].inputs = np.atleast_2d(data)
        for previous_layer, current_layer in [(self.layers[li-1], self.layers[li]) for li in range(1, len(self.layers))]:
            # NOTE: `.predict()` feeds from `.inputs`
            #self.layers[li].inputs = self.layers[li-1].predict()
            current_layer.inputs = previous_layer.predict()
        return self.layers[-1].predict()

    def train(self, data, epochs):
        def train_(data : dict):
            # We do so called 'online learning', where the weights are adjusted after
            # each input (in contrast to calculating the loss for the entire dataset)
            prediction = self.predict_(data['in'])

            # For every layer we will work out the error in reverse order.
            # NOTE: some sources do `prediction - data['out']`, which is equivalent,
            #       as long as we dont confuse the operands
            delta_output_sum = -(data['out'] - prediction)
            self.layers[-1].deltas = delta_output_sum * self.layers[-1].deactivation(prediction)

            # We wish to iterate in reverse order, so we create a nifty "alias"
            rl = self.layers[::-1]
            # For every subsequent layer we utilize the deltas of the previous one
            for previous_layer, current_layer in [(rl[li], rl[li+1]) for li in range(len(rl)-1)]:
                error = np.dot(previous_layer.deltas, previous_layer.weights.T)
                current_layer.deltas = previous_layer.calculate_deltas(error)

            # We update the weights
            for l in self.layers:
                l.weights -= self.learning_rate * np.dot(l.inputs.T, l.deltas)

        # Unlike the error calculations we use to update the weights,
        # this function operates on the whole dataset and serves
        # as an overview of learning
        def loss_function(data : list, targets : list):
            targets = np.atleast_2d(targets)
            predictions = np.empty_like(targets)
            for i, d in enumerate(data):
                predictions[0][i] = self.predict_(d)
            return .5 * np.sum((predictions - targets) ** 2)

        # Appending input layer bias
        data_buffer = deepcopy(data)
        for d in data_buffer:
            d['in'] = np.append(d['in'], 1)

        for epoch in range(epochs):
            for d in data_buffer:
                train_(d)

            if epoch % 100 == 0:
                #loss = loss_function(d['in'], d['out'])
                loss = loss_function([d['in'] for d in data_buffer], [d['out'] for d in data_buffer])
                print(f"[INFO] epoch={epoch}, loss={loss}")

    def predict(self, data : list):
        p = np.append(data, 1)
        return self.predict_(p)
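
# A small illustration of the shape bookkeeping described in the class comment
# above (a hypothetical helper, not used anywhere else): instantiating the
# example [3, 4, 4, 1] architecture yields [(4, 5), (5, 5), (5, 1)], the extra
# rows/columns being the bias node appended to every non-output layer.
def _demo_weight_shapes():
    n = NN([3, 4, 4, 1], .5)
    return [l.weights.shape for l in n.layers]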

# -----

def xor():
    data = get_xor()

    # Simpler architectures have an easier time learning easier problems,
    # requiring fewer epochs.
    # Conceptualize it as the model "overthinking" the problem
    #n = NN([2, 2, 4, 2, 3, 1], .5)
    n = NN([2, 2, 1], .5)

    print(n)
    n.train(data, epochs=2000)
    print(n)

    for d in data:
        out = round(n.predict(d['in'])[0][0], 3)
        print(f"In: {d['in']}\nOut: {out}\nActual: {d['out'][0]}\n")

def pressure():
    samples = batch_data(get_pressure_data())
    PARTITIONING_INDEX = int(len(samples['data']) * .75)
    training_data = samples['data'][:PARTITIONING_INDEX]
    checking_data = samples['data'][PARTITIONING_INDEX:]

    # This architecture seems to be doing slightly better than
    # something more bloated such as `NN([7, 8, 8, 1], .5)`,
    # and significantly better than something longer,
    # such as `NN([7, 4, 4, 4, 1], .5)`
    # (at least under reasonable training times)
    n = NN([7, 4, 4, 1], .5)
    #n = NN.load('2024-06-05=10:56:28.122959')

    print(n)
    n.train(training_data, epochs=10000)
    print(n)

    n.save()

    # This makes our denormalization valid as a matrix operation
    vd = np.vectorize(samples['denormalize'])

    for d in checking_data:
        out = n.predict(d['in'])
        out_actual = vd(round(out[0][0], 3))
        in_actual = vd(d['in'])
        actual_actual = vd(d['out'])
        print(f"In: {in_actual}\nOut: {out_actual}\nActual: {actual_actual}\n")

if __name__ == '__main__':
    #xor()
    pressure()
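
# A minimal sketch of resuming from a previously pickled model instead of
# retraining (this assumes a saved file such as the '2024-06-05=10:56:28.122959.pkl'
# referenced in the commented-out NN.load() call above actually exists on disk):
#
#   samples = batch_data(get_pressure_data())
#   n = NN.load('2024-06-05=10:56:28.122959')
#   vd = np.vectorize(samples['denormalize'])
#   d = samples['data'][-1]
#   print(f"prediction: {vd(n.predict(d['in'])[0][0])}, actual: {vd(d['out'])}")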