it does something now

anon committed 2024-10-07 13:00:15 +02:00
parent 27338a3481
commit 4b6bf0f208
7 changed files with 119 additions and 60 deletions

config.py

@@ -3,3 +3,4 @@ MAX_SHIMS = LINE_WIDTH - 1
 SOURCE_LINE_BATCH_SIZE = 3
 COMPILE_INPUT_DIRECTORY = "data/linux/"
+MODEL_DIRECTORY = "models/"
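For orientation, config.py after this commit presumably reads as below; LINE_WIDTH's value is not visible in the diff, so the number here is only a placeholder:

    LINE_WIDTH = 80                          # hypothetical value; only the name appears above
    MAX_SHIMS = LINE_WIDTH - 1               # from the hunk context
    SOURCE_LINE_BATCH_SIZE = 3
    COMPILE_INPUT_DIRECTORY = "data/linux/"
    MODEL_DIRECTORY = "models/"              # new: where trained models get saved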

View File

@@ -18,14 +18,17 @@
 FILE * build_file;
 char schemantic[MAX_SHIMS];
-int schim = 0;
-#define STEP_SCHEMANTIC fread(schemantic, MAX_SHIMS, sizeof(char), build_file)
+int schim;
+#define STEP_SCHEMANTIC do { \
+        schim = 0; \
+        int re = fread(schemantic, sizeof(char), MAX_SHIMS, build_file); \
+        if (re != sizeof(char)*MAX_SHIMS) { printf("- %d\n", re); exit(2); } \
+    } while (0)
 #define ECHOS(s) fwrite(s, strlen(s), 1, yyout)
-#define EOL '\n'
 %}
+// != is missing
 comment_marker (\/\*)|(\*\/)
 identifier \$?[A-Za-z0-9_]+
 modify [+-]{2}
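The rewritten STEP_SCHEMANTIC fixes two things at once: it resets the per-line cursor (schim = 0) and it checks the fread result instead of ignoring it; the do { ... } while (0) wrapper makes the macro behave like a single statement, and the re != sizeof(char)*MAX_SHIMS comparison is sound only because fread is called with item size 1 and sizeof(char) is 1 by definition. A rough Python paraphrase of one step (names illustrative, not from the repo):

    def step_schemantic(build_file, max_shims):
        record = build_file.read(max_shims)   # one record: MAX_SHIMS unsigned count bytes
        if len(record) != max_shims:          # short read: malformed or exhausted build_file
            raise SystemExit(2)
        return list(record), 0                # counts, plus the reset cursor (schim = 0)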
@@ -35,8 +38,16 @@ shift (<<)|(>>)
 word {identifier}
 special {comment_marker}|{assignment}|{shift}|{modify}
-%x NORMALIZE ACCUMULATE BUILD
 %x IN_STRING
+// Keep all but the required whitespaces
+%x NORMALIZE
+// Count the non-required whitespaces and write python arrays
+%x ACCUMULATE
+// Reconstruct normalized file based on binary whitespace count arrays
+%x BUILD
+%option yylineno
 %option noyywrap nodefault
 %%
     BEGIN mystate;
@@ -61,10 +72,7 @@ special {comment_marker}|{assignment}|{shift}|{modify}
         ECHO;
         was_word = false;
     }
-    \n {
-        ECHO;
-        return EOL;
-    }
+    \n { ECHO; }
 }

 <ACCUMULATE>{
@@ -99,15 +107,17 @@ special {comment_marker}|{assignment}|{shift}|{modify}
     }
 <BUILD>{
-    [ ]|\t { ; }
-    {word}|. {
-        ECHO;
+    [ ] { ECHO; }
+    {word}|{special}|. {
         for (char i = 0; i < schemantic[schim]; i++) {
             ECHOS(" ");
         }
+        ECHO;
         ++schim;
     }
-    \n {
+    \n { // XXX we find the last newline and still step, resulting in an error
         ECHO;
         STEP_SCHEMANTIC;
     }
 }
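The BUILD rules now insert the predicted spaces before each token rather than after it, give {special} tokens their own schemantic slot, and echo single spaces instead of discarding all blanks. One output line is reconstructed roughly like this (hypothetical Python helper, not from the repo):

    def rebuild_line(tokens, counts):
        # counts[i] = how many extra spaces to put in front of tokens[i]
        return "".join(" " * n + tok for tok, n in zip(tokens, counts))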
@@ -147,6 +157,7 @@ signed main(const int argc, const char * const * const argv) {
     if (!strcmp(argv[1], "build")) {
         mystate = BUILD;
         build_file = fopen("build_file", "rb");
+        if (!build_file) { exit(1); }
         STEP_SCHEMANTIC;
     } else {
         return 1;
@@ -154,7 +165,7 @@ signed main(const int argc, const char * const * const argv) {
     yyin = fopen(argv[2], "r");
-    while(yylex() == EOL) { ; }
+    yylex();
     return 0;
 }
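Taken together, the three start conditions form a round trip: NORMALIZE strips every non-required space, ACCUMULATE records how many were stripped at each slot, and BUILD re-inserts counts (at prediction time, the model's output) into a normalized file. The intended invariant, sketched with hypothetical Python wrappers around converter.out:

    def roundtrip(src):
        norm = normalize(src)     # keep only required whitespace
        acc = accumulate(src)     # per-slot counts of stripped spaces
        return build(norm, acc)   # should reproduce src byte for byte

The XXX comment above documents the known wart: the final newline still triggers STEP_SCHEMANTIC, which then hits end-of-file and exits with status 2.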

data.py (17 changed lines)

@@ -1,11 +1,14 @@
 from glob import glob
 import numpy as np
 import pickle
+import sys
 from sys import argv
 from config import *
 import tard_wrangler

+MAX_DATA_LIMIT = sys.maxsize
+
 def get_source(path : str) -> [str]:
     '''returns source file in $SOURCE_LINE_BATCH_SIZE line batches'''
     r = []
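With MAX_DATA_LIMIT set to sys.maxsize, the "if n > MAX_DATA_LIMIT: break" guard in compile_data() below never fires; the old hard-coded 47-file cap is effectively removed while the knob stays available, e.g.:

    MAX_DATA_LIMIT = 47   # temporarily restore the old cap when debugging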
@@ -54,11 +57,17 @@ def whitespace_to_np_array(spaces : []) -> np.array:
 def compile_data():
     r = {'in': [], 'out': [], 'src': []}
     for n, path in enumerate(glob(COMPILE_INPUT_DIRECTORY + "/*.c")):
-        if n > 47: break # XXX
+        if n > MAX_DATA_LIMIT: break # XXX
         acc_path = path + ".acc"
+        norm_path = path + ".norm"
         r['src'].append(path)
-        r['in'] += get_source(path)
-        r['out'] += read_acc(acc_path)
+        source_batches = get_source(norm_path)
+        accumulation = read_acc(acc_path)
+        assert len(source_batches) == len(accumulation), (
+            f"Some retard fucked up strings in {path}."
+        )
+        r['in'] += source_batches
+        r['out'] += accumulation
     r['in'] = source_to_np_array(r['in'])
     r['out'] = whitespace_to_np_array(r['out'])
     return r
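The new assert makes the training-data contract explicit: every .c file must yield exactly as many normalized source batches as whitespace-count records, because they become paired 'in'/'out' rows. In terms of the shapes assumed elsewhere in this diff:

    batches = get_source(path + ".norm")   # N batches of SOURCE_LINE_BATCH_SIZE lines
    records = read_acc(path + ".acc")      # N vectors of MAX_SHIMS space counts
    assert len(batches) == len(records)    # one count vector per batch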

View File

@@ -1,46 +1,20 @@
+from datetime import datetime
+from sys import argv
 import numpy as np
-import os
-os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
-import tensorflow
-from tensorflow import keras
-from keras import layers
 from config import *
+import model
 import data
 import tard_wrangler

-dataset = data.get_data()
-
-# XXX: add more conv layers
-model = keras.Sequential([
-    keras.Input(shape=(3, LINE_WIDTH, 1)),
-    layers.Conv2D(
-        filters=16,
-        kernel_size=(3,5),
-        strides=(1,1),
-        activation='relu',
-        padding='valid',
-    ),
-    layers.Flatten(),
-    layers.Dense(64, activation='relu'),
-    layers.Dense(64, activation='relu'),
-    layers.Dense(MAX_SHIMS) #activation='softmax'
-])
-model.compile(
-    optimizer='adam',
-    loss='mse',
-    metrics=['mae']
-)
-model.fit(dataset['in'], dataset['out'],
-    verbose=2,
-    batch_size=10,
-    epochs=50,
-    shuffle=True,
-)
-prediction = model.predict(dataset['in'])[0]
-prediction = prediction.astype(np.uint8).tobytes()
-tard_wrangler.build("data/xop.c.norm", prediction)
+if len(argv) > 1:
+    mymodel = model.load_model(argv[1])
+else:
+    dataset = data.get_data()
+    mymodel = model.make_model(dataset)
+    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
+    mymodel.save(MODEL_DIRECTORY + f"model_-_{timestamp}.keras")
+
+predictions = tard_wrangler.full_predict("data/xop.c.norm", mymodel)
+tard_wrangler.build("data/xop.c.norm", predictions)
+tard_wrangler.cat_build()
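The entry point now either loads a saved model (when a path is given on the command line) or trains one from scratch and saves it under MODEL_DIRECTORY with a timestamped name. Assuming the script is invoked as main.py (its filename is not shown in this view), usage looks like:

    # python3 main.py
    #     -> train, save as e.g. models/model_-_20241007-130015.keras, then predict
    # python3 main.py models/model_-_20241007-130015.keras
    #     -> skip training and reuse the saved model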

model.py (new file, 53 lines)

@@ -0,0 +1,53 @@
+import numpy as np
+import pickle
+import os
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
+import tensorflow as tf
+from tensorflow import keras
+from keras import layers
+from config import *
+
+@tf.function
+def custom_weighted_loss(y_true, y_pred):
+    weights = tf.linspace(1.0, 0.1, tf.shape(y_pred)[-1])
+    return tf.reduce_mean(tf.square((y_true - y_pred) * weights))
+
+def make_model(dataset : np.array) -> keras.Model:
+    # XXX: add more conv layers
+    model = keras.Sequential([
+        keras.Input(shape=(3, LINE_WIDTH, 1)),
+        layers.Conv2D(
+            filters=16,
+            kernel_size=(3,5),
+            strides=(1,1),
+            activation='relu',
+            padding='valid',
+        ),
+        layers.Flatten(),
+        layers.Dense(64, activation='relu'),
+        layers.Dense(64, activation='relu'),
+        layers.Dense(MAX_SHIMS) #activation='softmax'
+    ])
+    model.compile(
+        optimizer='adam',
+        #loss='mse',
+        loss=custom_weighted_loss,
+        metrics=['mae']
+    )
+    model.fit(dataset['in'], dataset['out'],
+        verbose=2,
+        batch_size=10,
+        epochs=50,
+        shuffle=True,
+    )
+    return model
+
+def load_model(path : str) -> keras.Model:
+    return keras.models.load_model(path,
+        compile=False
+    )
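A note on load_model: the model is compiled with a custom loss, and keras.models.load_model cannot resolve custom_weighted_loss by name, so compile=False skips restoring the training configuration; the loaded model can still predict. If training were to resume after loading, the usual alternative is registering the function (sketch):

    model = keras.models.load_model(path,
        custom_objects={"custom_weighted_loss": custom_weighted_loss})

The loss itself biases learning toward the leftmost shims: tf.linspace(1.0, 0.1, width) scales the error from full weight at slot 0 down to a tenth at the last slot before the mean square is taken.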

models/.gitkeep (new empty file)

tard_wrangler.py

@@ -2,6 +2,7 @@ import subprocess
 import numpy as np
 from config import *
+import data

 def accumulate(path : str, output : str) -> None:
     process = subprocess.Popen(
@@ -9,10 +10,20 @@ def accumulate(path : str, output : str) -> None:
         shell=True,
     )

-def build(path : str, prediction : np.array):
-    with open("build_file", "wb") as file:
-        file.write(prediction)
+def full_predict(path : str, model) -> []:
+    r = []
+    myinput = data.source_to_np_array(data.get_source(path))
+    for i in myinput:
+        r += model.predict(np.expand_dims(i, axis=0)).astype(np.uint8).tobytes()
+    return r
+
+def build(path : str, predictions : []) -> None:
+    predictions = b''.join([i.to_bytes(1, byteorder='big', signed=False) for i in predictions])
+    with open("build_file", "wb") as f: f.write(predictions)
     process = subprocess.Popen(
         "converter.out build " + path + " > out.c",
         shell=True,
     )
+
+def cat_build():
+    with open("out.c") as f: print(f.read())