From 4b6bf0f208974d1ed5fa6aa5aa4d087daaf274a2 Mon Sep 17 00:00:00 2001
From: anon
Date: Mon, 7 Oct 2024 13:00:15 +0200
Subject: [PATCH] split the model into model.py; add weighted loss, model saving/loading, and full-file prediction

---
 config.py        |  1 +
 converter.l      | 39 ++++++++++++++++++++++-------------
 data.py          | 17 ++++++++++++----
 formatter.py     | 52 ++++++++++++-----------------------------------
 model.py         | 53 ++++++++++++++++++++++++++++++++++++++++++++++++
 models/.gitkeep  |  0
 tard_wrangler.py | 17 +++++++++++++---
 7 files changed, 119 insertions(+), 60 deletions(-)
 create mode 100644 model.py
 create mode 100644 models/.gitkeep

diff --git a/config.py b/config.py
index 83c6c42..777bc2d 100644
--- a/config.py
+++ b/config.py
@@ -3,3 +3,4 @@
 MAX_SHIMS = LINE_WIDTH - 1
 SOURCE_LINE_BATCH_SIZE = 3
 COMPILE_INPUT_DIRECTORY = "data/linux/"
+MODEL_DIRECTORY = "models/"
diff --git a/converter.l b/converter.l
index 6adbfa6..b06bae6 100644
--- a/converter.l
+++ b/converter.l
@@ -18,14 +18,17 @@
 	FILE * build_file;
 
 	char schemantic[MAX_SHIMS];
-	int schim = 0;
+	int schim;
 
-	#define STEP_SCHEMANTIC fread(schemantic, MAX_SHIMS, sizeof(char), build_file)
+	#define STEP_SCHEMANTIC do { \
+		schim = 0; \
+		size_t re = fread(schemantic, sizeof(char), MAX_SHIMS, build_file); \
+		if (re != MAX_SHIMS) { fprintf(stderr, "short read: %zu\n", re); exit(2); } \
+	} while (0)
 
 	#define ECHOS(s) fwrite(s, strlen(s), 1, yyout)
-
-	#define EOL '\n'
 %}
+	// note: != is not matched by {special}
 comment_marker (\/\*)|(\*\/)
 identifier \$?[A-Za-z0-9_]+
 modify [+-]{2}
@@ -35,8 +38,16 @@ shift (<<)|(>>)
 word {identifier}
 special {comment_marker}|{assignment}|{shift}|{modify}
 
-%x NORMALIZE ACCUMULATE BUILD
 %x IN_STRING
+
+	// Strip all whitespace except what is syntactically required
+%x NORMALIZE
+	// Count the non-required whitespace and write Python arrays
+%x ACCUMULATE
+	// Reconstruct the normalized file from binary whitespace-count arrays
+%x BUILD
+
+%option yylineno
 %option noyywrap nodefault
 %%
 	BEGIN mystate;
@@ -61,10 +72,7 @@ special {comment_marker}|{assignment}|{shift}|{modify}
 			ECHO;
 			was_word = false;
 		}
-\n {
-		ECHO;
-		return EOL;
-	}
+\n { ECHO; }
 }
 
 {
@@ -99,15 +107,17 @@
 }
 
 {
-[ ]|\t { ; }
-{word}|. {
-		ECHO;
+[ ] { ECHO; }
+{word}|{special}|. {
 		for (char i = 0; i < schemantic[schim]; i++) {
 			ECHOS(" ");
 		}
+		ECHO;
+		++schim;
 	}
-\n {
+\n { // XXX the final newline still triggers STEP_SCHEMANTIC, so EOF reports a short read
+		ECHO;
 		STEP_SCHEMANTIC;
 	}
 }
@@ -147,6 +157,7 @@ signed main(const int argc, const char * const * const argv) {
 	if (!strcmp(argv[1], "build")) {
 		mystate = BUILD;
 		build_file = fopen("build_file", "rb");
+		if (!build_file) { perror("build_file"); exit(1); }
 		STEP_SCHEMANTIC;
 	} else {
 		return 1;
@@ -154,7 +165,7 @@ signed main(const int argc, const char * const * const argv) {
 
 	yyin = fopen(argv[2], "r");
 
-	while(yylex() == EOL) { ; }
+	yylex();
 
 	return 0;
 }
diff --git a/data.py b/data.py
index 7b76b8e..eb475c4 100644
--- a/data.py
+++ b/data.py
@@ -1,11 +1,14 @@
 from glob import glob
 import numpy as np
 import pickle
+import sys
 from sys import argv
 
 from config import *
 import tard_wrangler
 
+MAX_DATA_LIMIT = sys.maxsize
+
 def get_source(path : str) -> [str]:
     '''returns source file in $SOURCE_LINE_BATCH_SIZE line batches'''
     r = []
@@ -54,11 +57,17 @@ def whitespace_to_np_array(spaces : []) -> np.array:
 def compile_data():
     r = {'in': [], 'out': [], 'src': []}
     for n, path in enumerate(glob(COMPILE_INPUT_DIRECTORY + "/*.c")):
-        if n > 47: break # XXX
-        acc_path = path + ".acc"
+        if n > MAX_DATA_LIMIT: break # XXX effectively no limit
+        acc_path  = path + ".acc"
+        norm_path = path + ".norm"
         r['src'].append(path)
-        r['in'] += get_source(path)
-        r['out'] += read_acc(acc_path)
+        source_batches = get_source(norm_path)
+        accumulation   = read_acc(acc_path)
+        assert len(source_batches) == len(accumulation), (
+            f"line-batch/accumulation length mismatch in {path}; strings were likely mangled"
+        )
+        r['in']  += source_batches
+        r['out'] += accumulation
     r['in'] = source_to_np_array(r['in'])
     r['out'] = whitespace_to_np_array(r['out'])
     return r
diff --git a/formatter.py b/formatter.py
index 9a31e1c..41d458d 100644
--- a/formatter.py
+++ b/formatter.py
@@ -1,46 +1,20 @@
+from datetime import datetime
+from sys import argv
 import numpy as np
-import os
-os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
-
-import tensorflow
-from tensorflow import keras
-from keras import layers
 
 from config import *
+import model
 import data
 import tard_wrangler
 
-dataset = data.get_data()
+if len(argv) > 1:
+    mymodel = model.load_model(argv[1])
+else:
+    dataset = data.get_data()
+    mymodel = model.make_model(dataset)
+    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
+    mymodel.save(MODEL_DIRECTORY + f"model_-_{timestamp}.keras")
 
-# XXX: add more conv layers
-model = keras.Sequential([
-    keras.Input(shape=(3, LINE_WIDTH, 1)),
-    layers.Conv2D(
-        filters=16,
-        kernel_size=(3,5),
-        strides=(1,1),
-        activation='relu',
-        padding='valid',
-    ),
-    layers.Flatten(),
-    layers.Dense(64, activation='relu'),
-    layers.Dense(64, activation='relu'),
-    layers.Dense(MAX_SHIMS) #activation='softmax'
-])
-
-model.compile(
-    optimizer='adam',
-    loss='mse',
-    metrics=['mae']
-)
-
-model.fit(dataset['in'], dataset['out'],
-    verbose=2,
-    batch_size=10,
-    epochs=50,
-    shuffle=True,
-)
-
-prediction = model.predict(dataset['in'])[0]
-prediction = prediction.astype(np.uint8).tobytes()
-tard_wrangler.build("data/xop.c.norm", prediction)
+predictions = tard_wrangler.full_predict("data/xop.c.norm", mymodel)
+tard_wrangler.build("data/xop.c.norm", predictions)
+tard_wrangler.cat_build()
diff --git a/model.py b/model.py
new file mode 100644
index 0000000..87fef4c
--- /dev/null
+++ b/model.py
@@ -0,0 +1,53 @@
+import numpy as np
+import pickle
+import os
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
+
+import tensorflow as tf
+from tensorflow import keras
+from keras import layers
+
+from config import *
+
+@tf.function
+def custom_weighted_loss(y_true, y_pred):
+    # penalize errors in early shim slots more heavily than late ones
+    weights = tf.linspace(1.0, 0.1, tf.shape(y_pred)[-1])
+    return tf.reduce_mean(tf.square((y_true - y_pred) * weights))
+
+def make_model(dataset : np.array) -> keras.Model:
+    # XXX: add more conv layers
+    model = keras.Sequential([
+        keras.Input(shape=(3, LINE_WIDTH, 1)),
+        layers.Conv2D(
+            filters=16,
+            kernel_size=(3,5),
+            strides=(1,1),
+            activation='relu',
+            padding='valid',
+        ),
+        layers.Flatten(),
+        layers.Dense(64, activation='relu'),
+        layers.Dense(64, activation='relu'),
+        layers.Dense(MAX_SHIMS) #activation='softmax'
+    ])
+
+    model.compile(
+        optimizer='adam',
+        #loss='mse',
+        loss=custom_weighted_loss,
+        metrics=['mae']
+    )
+
+    model.fit(dataset['in'], dataset['out'],
+        verbose=2,
+        batch_size=10,
+        epochs=50,
+        shuffle=True,
+    )
+
+    return model
+
+def load_model(path : str) -> keras.Model:
+    return keras.models.load_model(path,
+        compile=False # the custom loss is not needed for inference
+    )
diff --git a/models/.gitkeep b/models/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/tard_wrangler.py b/tard_wrangler.py
index 02f7912..c00d6f5 100644
--- a/tard_wrangler.py
+++ b/tard_wrangler.py
@@ -2,6 +2,7 @@ import subprocess
 import numpy as np
 
 from config import *
+import data
 
 def accumulate(path : str, output : str) -> None:
     process = subprocess.Popen(
@@ -9,10 +10,20 @@ def accumulate(path : str, output : str) -> None:
         shell=True,
     )
 
-def build(path : str, prediction : np.array):
-    with open("build_file", "wb") as file:
-        file.write(prediction)
+def full_predict(path : str, model) -> []:
+    r = []
+    myinput = data.source_to_np_array(data.get_source(path))
+    for i in myinput:
+        r += model.predict(np.expand_dims(i, axis=0)).astype(np.uint8).tobytes() # += over bytes appends ints
+    return r
+
+def build(path : str, predictions : []) -> None:
+    predictions = bytes(predictions) # list of ints in 0..255 -> raw bytes
+    with open("build_file", "wb") as f: f.write(predictions)
     process = subprocess.Popen(
         "converter.out build " + path + " > out.c",
         shell=True,
     )
+
+def cat_build() -> None:
+    with open("out.c") as f: print(f.read())
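
Note on the loss: custom_weighted_loss scales the per-column error linearly from 1.0 (first shim slot) down to 0.1 (last), so the model is punished hardest for mispredicting whitespace early in the line. A minimal standalone sketch of the same computation, using hypothetical toy tensors (assumes only that TensorFlow is installed):

    import tensorflow as tf

    # toy batch: 3 shim slots per line; the real model uses MAX_SHIMS slots
    y_true = tf.constant([[4.0, 0.0, 2.0]])
    y_pred = tf.constant([[3.0, 0.0, 1.0]])

    # same formula as custom_weighted_loss in model.py
    weights = tf.linspace(1.0, 0.1, tf.shape(y_pred)[-1])  # -> [1.0, 0.55, 0.1]
    loss = tf.reduce_mean(tf.square((y_true - y_pred) * weights))
    print(float(loss))  # ((1*1.0)^2 + 0 + (1*0.1)^2) / 3, roughly 0.3367

Usage after this patch: "python formatter.py" trains on the compiled dataset and saves a timestamped .keras file under models/, while "python formatter.py models/<file>.keras" loads that model instead of retraining. Either way the script then predicts whitespace for data/xop.c.norm, rebuilds it through converter.out, and prints the resulting out.c.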