From 4b6bf0f208974d1ed5fa6aa5aa4d087daaf274a2 Mon Sep 17 00:00:00 2001
From: anon
Date: Mon, 7 Oct 2024 13:00:15 +0200
Subject: [PATCH] split the model into model.py; add weighted loss, model saving/loading, and full-file prediction

---
 config.py        |  1 +
 converter.l      | 39 ++++++++++++++++++++++-------------
 data.py          | 17 ++++++++++++----
 formatter.py     | 52 ++++++++++++-----------------------------------
 model.py         | 53 ++++++++++++++++++++++++++++++++++++++++++++++++
 models/.gitkeep  |  0
 tard_wrangler.py | 17 +++++++++++++---
 7 files changed, 119 insertions(+), 60 deletions(-)
 create mode 100644 model.py
 create mode 100644 models/.gitkeep

diff --git a/config.py b/config.py
index 83c6c42..777bc2d 100644
--- a/config.py
+++ b/config.py
@@ -3,3 +3,4 @@
 MAX_SHIMS = LINE_WIDTH - 1
 SOURCE_LINE_BATCH_SIZE = 3
 COMPILE_INPUT_DIRECTORY = "data/linux/"
+MODEL_DIRECTORY = "models/"
diff --git a/converter.l b/converter.l
index 6adbfa6..b06bae6 100644
--- a/converter.l
+++ b/converter.l
@@ -18,14 +18,17 @@
 	FILE * build_file;
 
 	char schemantic[MAX_SHIMS];
-	int schim = 0;
+	int schim;
 
-	#define STEP_SCHEMANTIC fread(schemantic, MAX_SHIMS, sizeof(char), build_file)
+	#define STEP_SCHEMANTIC do { \
+		schim = 0; \
+		size_t re = fread(schemantic, sizeof(char), MAX_SHIMS, build_file); \
+		if (re != MAX_SHIMS) { fprintf(stderr, "short read: %zu\n", re); exit(2); } \
+	} while (0)
 
 	#define ECHOS(s) fwrite(s, strlen(s), 1, yyout)
-
-	#define EOL '\n'
 %}
+	// note: != is not matched by {special}
 comment_marker (\/\*)|(\*\/)
 identifier \$?[A-Za-z0-9_]+
 modify [+-]{2}
@@ -35,8 +38,16 @@ shift (<<)|(>>)
 word {identifier}
 special {comment_marker}|{assignment}|{shift}|{modify}
 
-%x NORMALIZE ACCUMULATE BUILD
 %x IN_STRING
+
+	// Strip all whitespace except what is syntactically required
+%x NORMALIZE
+	// Count the non-required whitespace and write Python arrays
+%x ACCUMULATE
+	// Reconstruct the normalized file from binary whitespace-count arrays
+%x BUILD
+
+%option yylineno
 %option noyywrap nodefault
 %%
 	BEGIN mystate;
@@ -61,10 +72,7 @@ special {comment_marker}|{assignment}|{shift}|{modify}
 			ECHO;
 			was_word = false;
 		}
-\n {
-		ECHO;
-		return EOL;
-	}
+\n { ECHO; }
 }
 
 {
@@ -99,15 +107,17 @@
 }
 
 {
-[ ]|\t { ; }
-{word}|. {
-		ECHO;
+[ ] { ECHO; }
+{word}|{special}|. {
 		for (char i = 0; i < schemantic[schim]; i++) {
 			ECHOS(" ");
 		}
+		ECHO;
+		++schim;
 	}
-\n {
+\n { // XXX the final newline still triggers STEP_SCHEMANTIC, so EOF reports a short read
+		ECHO;
 		STEP_SCHEMANTIC;
 	}
 }
@@ -147,6 +157,7 @@ signed main(const int argc, const char * const * const argv) {
 	if (!strcmp(argv[1], "build")) {
 		mystate = BUILD;
 		build_file = fopen("build_file", "rb");
+		if (!build_file) { perror("build_file"); exit(1); }
 		STEP_SCHEMANTIC;
 	} else {
 		return 1;
@@ -154,7 +165,7 @@ signed main(const int argc, const char * const * const argv) {
 
 	yyin = fopen(argv[2], "r");
 
-	while(yylex() == EOL) { ; }
+	yylex();
 
 	return 0;
 }
diff --git a/data.py b/data.py
index 7b76b8e..eb475c4 100644
--- a/data.py
+++ b/data.py
@@ -1,11 +1,14 @@
 from glob import glob
 import numpy as np
 import pickle
+import sys
 from sys import argv
 
 from config import *
 import tard_wrangler
 
+MAX_DATA_LIMIT = sys.maxsize
+
 def get_source(path : str) -> [str]:
     '''returns source file in $SOURCE_LINE_BATCH_SIZE line batches'''
     r = []
@@ -54,11 +57,17 @@ def whitespace_to_np_array(spaces : []) -> np.array:
 def compile_data():
     r = {'in': [], 'out': [], 'src': []}
     for n, path in enumerate(glob(COMPILE_INPUT_DIRECTORY + "/*.c")):
-        if n > 47: break # XXX
-        acc_path = path + ".acc"
+        if n > MAX_DATA_LIMIT: break # XXX effectively no limit
+        acc_path  = path + ".acc"
+        norm_path = path + ".norm"
         r['src'].append(path)
-        r['in'] += get_source(path)
-        r['out'] += read_acc(acc_path)
+        source_batches = get_source(norm_path)
+        accumulation   = read_acc(acc_path)
+        assert len(source_batches) == len(accumulation), (
+            f"line-batch/accumulation length mismatch in {path}; strings were likely mangled"
+        )
+        r['in']  += source_batches
+        r['out'] += accumulation
     r['in'] = source_to_np_array(r['in'])
     r['out'] = whitespace_to_np_array(r['out'])
     return r
diff --git a/formatter.py b/formatter.py
index 9a31e1c..41d458d 100644
--- a/formatter.py
+++ b/formatter.py
@@ -1,46 +1,20 @@
+from datetime import datetime
+from sys import argv
 import numpy as np
-import os
-os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
-
-import tensorflow
-from tensorflow import keras
-from keras import layers
 
 from config import *
+import model
 import data
 import tard_wrangler
 
-dataset = data.get_data()
+if len(argv) > 1:
+    mymodel = model.load_model(argv[1])
+else:
+    dataset = data.get_data()
+    mymodel = model.make_model(dataset)
+    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
+    mymodel.save(MODEL_DIRECTORY + f"model_-_{timestamp}.keras")
 
-# XXX: add more conv layers
-model = keras.Sequential([
-    keras.Input(shape=(3, LINE_WIDTH, 1)),
-    layers.Conv2D(
-        filters=16,
-        kernel_size=(3,5),
-        strides=(1,1),
-        activation='relu',
-        padding='valid',
-    ),
-    layers.Flatten(),
-    layers.Dense(64, activation='relu'),
-    layers.Dense(64, activation='relu'),
-    layers.Dense(MAX_SHIMS) #activation='softmax'
-])
-
-model.compile(
-    optimizer='adam',
-    loss='mse',
-    metrics=['mae']
-)
-
-model.fit(dataset['in'], dataset['out'],
-    verbose=2,
-    batch_size=10,
-    epochs=50,
-    shuffle=True,
-)
-
-prediction = model.predict(dataset['in'])[0]
-prediction = prediction.astype(np.uint8).tobytes()
-tard_wrangler.build("data/xop.c.norm", prediction)
+predictions = tard_wrangler.full_predict("data/xop.c.norm", mymodel)
+tard_wrangler.build("data/xop.c.norm", predictions)
+tard_wrangler.cat_build()
diff --git a/model.py b/model.py
new file mode 100644
index 0000000..87fef4c
--- /dev/null
+++ b/model.py
@@ -0,0 +1,53 @@
+import numpy as np
+import pickle
+import os
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
+
+import tensorflow as tf
+from tensorflow import keras
+from keras import layers
+
+from config import *
+
+@tf.function
+def custom_weighted_loss(y_true, y_pred):
+    # penalize errors in early shim slots more heavily than late ones
+    weights = tf.linspace(1.0, 0.1, tf.shape(y_pred)[-1])
+    return tf.reduce_mean(tf.square((y_true - y_pred) * weights))
+
+def make_model(dataset : np.array) -> keras.Model:
+    # XXX: add more conv layers
+    model = keras.Sequential([
+        keras.Input(shape=(3, LINE_WIDTH, 1)),
+        layers.Conv2D(
+            filters=16,
+            kernel_size=(3,5),
+            strides=(1,1),
+            activation='relu',
+            padding='valid',
+        ),
+        layers.Flatten(),
+        layers.Dense(64, activation='relu'),
+        layers.Dense(64, activation='relu'),
+        layers.Dense(MAX_SHIMS) #activation='softmax'
+    ])
+
+    model.compile(
+        optimizer='adam',
+        #loss='mse',
+        loss=custom_weighted_loss,
+        metrics=['mae']
+    )
+
+    model.fit(dataset['in'], dataset['out'],
+        verbose=2,
+        batch_size=10,
+        epochs=50,
+        shuffle=True,
+    )
+
+    return model
+
+def load_model(path : str) -> keras.Model:
+    return keras.models.load_model(path,
+        compile=False # the custom loss is not needed for inference
+    )
diff --git a/models/.gitkeep b/models/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/tard_wrangler.py b/tard_wrangler.py
index 02f7912..c00d6f5 100644
--- a/tard_wrangler.py
+++ b/tard_wrangler.py
@@ -2,6 +2,7 @@ import subprocess
 import numpy as np
 
 from config import *
+import data
 
 def accumulate(path : str, output : str) -> None:
     process = subprocess.Popen(
@@ -9,10 +10,20 @@ def accumulate(path : str, output : str) -> None:
         shell=True,
     )
 
-def build(path : str, prediction : np.array):
-    with open("build_file", "wb") as file:
-        file.write(prediction)
+def full_predict(path : str, model) -> []:
+    r = []
+    myinput = data.source_to_np_array(data.get_source(path))
+    for i in myinput:
+        r += model.predict(np.expand_dims(i, axis=0)).astype(np.uint8).tobytes() # += over bytes appends ints
+    return r
+
+def build(path : str, predictions : []) -> None:
+    predictions = bytes(predictions) # list of ints in 0..255 -> raw bytes
+    with open("build_file", "wb") as f: f.write(predictions)
     process = subprocess.Popen(
         "converter.out build " + path + " > out.c",
         shell=True,
     )
+
+def cat_build() -> None:
+    with open("out.c") as f: print(f.read())
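
Note on the loss: custom_weighted_loss scales the per-column error linearly from 1.0 (first shim slot) down to 0.1 (last), so the model is punished hardest for mispredicting whitespace early in the line. A minimal standalone sketch of the same computation, using hypothetical toy tensors (assumes only that TensorFlow is installed):

    import tensorflow as tf

    # toy batch: 3 shim slots per line; the real model uses MAX_SHIMS slots
    y_true = tf.constant([[4.0, 0.0, 2.0]])
    y_pred = tf.constant([[3.0, 0.0, 1.0]])

    # same formula as custom_weighted_loss in model.py
    weights = tf.linspace(1.0, 0.1, tf.shape(y_pred)[-1])  # -> [1.0, 0.55, 0.1]
    loss = tf.reduce_mean(tf.square((y_true - y_pred) * weights))
    print(float(loss))  # ((1*1.0)^2 + 0 + (1*0.1)^2) / 3, roughly 0.3367

Usage after this patch: "python formatter.py" trains on the compiled dataset and saves a timestamped .keras file under models/, while "python formatter.py models/<file>.keras" loads that model instead of retraining. Either way the script then predicts whitespace for data/xop.c.norm, rebuilds it through converter.out, and prints the resulting out.c.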