it does something now

anon
2024-10-07 13:00:15 +02:00
parent 27338a3481
commit 4b6bf0f208
7 changed files with 119 additions and 60 deletions

View File

@@ -3,3 +3,4 @@ MAX_SHIMS = LINE_WIDTH - 1
SOURCE_LINE_BATCH_SIZE = 3
COMPILE_INPUT_DIRECTORY = "data/linux/"
MODEL_DIRECTORY = "models/"

View File

@@ -18,14 +18,17 @@
FILE * build_file;
char schemantic[MAX_SHIMS];
int schim = 0;
int schim;
#define STEP_SCHEMANTIC fread(schemantic, MAX_SHIMS, sizeof(char), build_file)
#define STEP_SCHEMANTIC do { \
    schim = 0; \
    int re = fread(schemantic, sizeof(char), MAX_SHIMS, build_file); \
    if (re != sizeof(char)*MAX_SHIMS) { printf("- %d\n", re); exit(2); } \
} while (0)
#define ECHOS(s) fwrite(s, strlen(s), 1, yyout)
#define EOL '\n'
%}
// XXX: the != operator is missing from the definitions below
comment_marker (\/\*)|(\*\/)
identifier \$?[A-Za-z0-9_]+
modify [+-]{2}
@@ -35,8 +38,16 @@ shift (<<)|(>>)
word {identifier}
special {comment_marker}|{assignment}|{shift}|{modify}
%x NORMALIZE ACCUMULATE BUILD
%x IN_STRING
// Keep all but the required whitespaces
%x NORMALIZE
// Count the non-required whitespaces and write python arrays
%x ACCUMULATE
// Reconstruct normalized file based on binary whitespace count arrays
%x BUILD
%option yylineno
%option noyywrap nodefault
%%
BEGIN mystate;
@@ -61,10 +72,7 @@ special {comment_marker}|{assignment}|{shift}|{modify}
        ECHO;
        was_word = false;
    }
    \n {
        ECHO;
        return EOL;
    }
    \n { ECHO; }
}
<ACCUMULATE>{
@@ -99,15 +107,17 @@ special {comment_marker}|{assignment}|{shift}|{modify}
}
<BUILD>{
    [ ]|\t { ; }
    {word}|. {
        ECHO;
    [ ] { ECHO; }
    {word}|{special}|. {
        for (char i = 0; i < schemantic[schim]; i++) {
            ECHOS(" ");
        }
        ECHO;
        ++schim;
    }
    \n { // XXX: on the final newline we still call STEP_SCHEMANTIC, which hits EOF and exits with an error
        ECHO;
        STEP_SCHEMANTIC;
    }
}
@@ -147,6 +157,7 @@ signed main(const int argc, const char * const * const argv) {
    if (!strcmp(argv[1], "build")) {
        mystate = BUILD;
        build_file = fopen("build_file", "rb");
        if (!build_file) { exit(1); }
        STEP_SCHEMANTIC;
    } else {
        return 1;
@@ -154,7 +165,7 @@ signed main(const int argc, const char * const * const argv) {
    yyin = fopen(argv[2], "r");
    while(yylex() == EOL) { ; }
    yylex();
    return 0;
}
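For reference, BUILD mode consumes the schematic one source line at a time: every newline triggers STEP_SCHEMANTIC, which pulls MAX_SHIMS bytes from build_file, one unsigned byte per token slot giving the number of shim spaces to emit before that token. A minimal Python sketch of a reader for that format (LINE_WIDTH's value is not part of this diff, so 80 is an assumption):

# Sketch: dump a build_file schematic. config.py defines
# MAX_SHIMS = LINE_WIDTH - 1; LINE_WIDTH = 80 is assumed here.
MAX_SHIMS = 80 - 1

with open("build_file", "rb") as f:
    line_no = 0
    while chunk := f.read(MAX_SHIMS):
        if len(chunk) != MAX_SHIMS:  # mirrors the exit(2) in STEP_SCHEMANTIC
            raise SystemExit(f"short read at line {line_no}: {len(chunk)} bytes")
        print(line_no, list(chunk))  # one shim count per token slot
        line_no += 1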

data.py
View File

@@ -1,11 +1,14 @@
from glob import glob
import numpy as np
import pickle
import sys
from sys import argv
from config import *
import tard_wrangler
MAX_DATA_LIMIT = sys.maxsize
def get_source(path : str) -> [str]:
    '''returns source file in $SOURCE_LINE_BATCH_SIZE line batches'''
    r = []
@@ -54,11 +57,17 @@ def whitespace_to_np_array(spaces : []) -> np.array:
def compile_data():
    r = {'in': [], 'out': [], 'src': []}
    for n, path in enumerate(glob(COMPILE_INPUT_DIRECTORY + "/*.c")):
        if n > 47: break # XXX
        acc_path = path + ".acc"
        if n > MAX_DATA_LIMIT: break # XXX
        acc_path = path + ".acc"
        norm_path = path + ".norm"
        r['src'].append(path)
        r['in'] += get_source(path)
        r['out'] += read_acc(acc_path)
        source_batches = get_source(norm_path)
        accumulation = read_acc(acc_path)
        assert len(source_batches) == len(accumulation), (
            f"Batch/accumulation length mismatch in {path}; a string literal probably broke normalization."
        )
        r['in'] += source_batches
        r['out'] += accumulation
    r['in'] = source_to_np_array(r['in'])
    r['out'] = whitespace_to_np_array(r['out'])
    return r
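The new assert pins down the invariant compile_data() depends on: each .norm file must yield exactly as many SOURCE_LINE_BATCH_SIZE-line batches as its .acc file has rows, or inputs and labels would silently shift out of alignment. A sketch of the windowing this implies (get_source's real padding is not shown in this diff, so the sliding window below is an assumption):

# Sketch: one plausible way get_source() windows a file into
# SOURCE_LINE_BATCH_SIZE-line batches; the real padding logic is
# not part of this diff.
SOURCE_LINE_BATCH_SIZE = 3

def batch_lines(lines: list) -> list:
    return [lines[i:i + SOURCE_LINE_BATCH_SIZE]
            for i in range(len(lines) - SOURCE_LINE_BATCH_SIZE + 1)]

print(batch_lines(["a", "b", "c", "d"]))  # [['a', 'b', 'c'], ['b', 'c', 'd']]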

View File

@@ -1,46 +1,20 @@
from datetime import datetime
from sys import argv
import numpy as np
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow
from tensorflow import keras
from keras import layers
from config import *
import model
import data
import tard_wrangler
dataset = data.get_data()
if len(argv) > 1:
    mymodel = model.load_model(argv[1])
else:
    dataset = data.get_data()
    mymodel = model.make_model(dataset)
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
mymodel.save(MODEL_DIRECTORY + f"model_-_{timestamp}.keras")
# XXX: add more conv layers
model = keras.Sequential([
    keras.Input(shape=(3, LINE_WIDTH, 1)),
    layers.Conv2D(
        filters=16,
        kernel_size=(3,5),
        strides=(1,1),
        activation='relu',
        padding='valid',
    ),
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(MAX_SHIMS) #activation='softmax'
])
model.compile(
    optimizer='adam',
    loss='mse',
    metrics=['mae']
)
model.fit(dataset['in'], dataset['out'],
    verbose=2,
    batch_size=10,
    epochs=50,
    shuffle=True,
)
prediction = model.predict(dataset['in'])[0]
prediction = prediction.astype(np.uint8).tobytes()
tard_wrangler.build("data/xop.c.norm", prediction)
predictions = tard_wrangler.full_predict("data/xop.c.norm", mymodel)
tard_wrangler.build("data/xop.c.norm", predictions)
tard_wrangler.cat_build()
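Loading through model.load_model() works here because the script only calls predict(); compile=False skips deserializing custom_weighted_loss. If a loaded model ever needs further fit() calls, one alternative (a sketch, not what this commit does) is to register the custom loss at load time:

# Sketch: load with the custom loss registered so the compile state
# survives; the path below is only an example.
from tensorflow import keras
import model

mymodel = keras.models.load_model(
    "models/model_-_20241007-130000.keras",  # example path
    custom_objects={"custom_weighted_loss": model.custom_weighted_loss},
)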

model.py Normal file
View File

@@ -0,0 +1,53 @@
import numpy as np
import pickle
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
from tensorflow import keras
from keras import layers
from config import *
@tf.function
def custom_weighted_loss(y_true, y_pred):
    weights = tf.linspace(1.0, 0.1, tf.shape(y_pred)[-1])
    return tf.reduce_mean(tf.square((y_true - y_pred) * weights))
def make_model(dataset : np.array) -> keras.Model:
    # XXX: add more conv layers
    model = keras.Sequential([
        keras.Input(shape=(3, LINE_WIDTH, 1)),
        layers.Conv2D(
            filters=16,
            kernel_size=(3,5),
            strides=(1,1),
            activation='relu',
            padding='valid',
        ),
        layers.Flatten(),
        layers.Dense(64, activation='relu'),
        layers.Dense(64, activation='relu'),
        layers.Dense(MAX_SHIMS) #activation='softmax'
    ])
    model.compile(
        optimizer='adam',
        #loss='mse',
        loss=custom_weighted_loss,
        metrics=['mae']
    )
    model.fit(dataset['in'], dataset['out'],
        verbose=2,
        batch_size=10,
        epochs=50,
        shuffle=True,
    )
    return model
def load_model(path : str) -> keras.Model:
    return keras.models.load_model(path,
        compile=False
    )
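custom_weighted_loss is a position-weighted MSE: tf.linspace runs from 1.0 on the first output column down to 0.1 on the last, so a misplaced shim early in the line costs up to ten times more than one at the end. A worked example with three columns:

# Worked example of custom_weighted_loss with MAX_SHIMS = 3.
import tensorflow as tf

y_true = tf.constant([[2.0, 0.0, 1.0]])
y_pred = tf.constant([[1.0, 0.0, 3.0]])
weights = tf.linspace(1.0, 0.1, 3)  # [1.0, 0.55, 0.1]
loss = tf.reduce_mean(tf.square((y_true - y_pred) * weights))
print(loss.numpy())  # (1.0^2 + 0.0^2 + (-0.2)^2) / 3 ≈ 0.3467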

models/.gitkeep Normal file
View File

View File

@@ -2,6 +2,7 @@ import subprocess
import numpy as np
from config import *
import data
def accumulate(path : str, output : str) -> None:
    process = subprocess.Popen(
@@ -9,10 +10,20 @@ def accumulate(path : str, output : str) -> None:
        shell=True,
    )
def build(path : str, prediction : np.array):
    with open("build_file", "wb") as file:
        file.write(prediction)
def full_predict(path : str, model) -> []:
    r = []
    myinput = data.source_to_np_array(data.get_source(path))
    for i in myinput:
        r += model.predict(np.expand_dims(i, axis=0)).astype(np.uint8).tobytes()
    return r
def build(path : str, predictions : []) -> None:
    predictions = b''.join([i.to_bytes(1, byteorder='big', signed=False) for i in predictions])
    with open("build_file", "wb") as f: f.write(predictions)
    process = subprocess.Popen(
        "converter.out build " + path + " > out.c",
        shell=True,
    )
def cat_build():
    with open("out.c") as f: print(f.read())
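Taken together the new entry points give the full round trip from a normalized source file to a reconstructed one; note that build() starts converter.out via Popen without waiting on it, so an immediate cat_build() can race the converter. A usage sketch (the model path is an example, and converter.out must already be compiled):

# Sketch: predict -> build_file -> converter.out -> out.c round trip.
import model
import tard_wrangler

mymodel = model.load_model("models/model_-_20241007-130000.keras")  # example path
preds = tard_wrangler.full_predict("data/xop.c.norm", mymodel)  # flat list of uint8 counts
tard_wrangler.build("data/xop.c.norm", preds)  # writes build_file, runs converter.out
tard_wrangler.cat_build()  # prints the reconstructed out.c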