it does something now

anon committed 2024-10-07 13:00:15 +02:00
parent 27338a3481
commit 4b6bf0f208
7 changed files with 119 additions and 60 deletions

config.py

@@ -3,3 +3,4 @@ MAX_SHIMS = LINE_WIDTH - 1
 SOURCE_LINE_BATCH_SIZE = 3
 COMPILE_INPUT_DIRECTORY = "data/linux/"
+MODEL_DIRECTORY = "models/"
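For orientation, config.py after this commit presumably reads as below; LINE_WIDTH's value is not visible in the diff, so the number here is only a placeholder:

    LINE_WIDTH = 80                          # hypothetical value; only the name appears above
    MAX_SHIMS = LINE_WIDTH - 1               # from the hunk context
    SOURCE_LINE_BATCH_SIZE = 3
    COMPILE_INPUT_DIRECTORY = "data/linux/"
    MODEL_DIRECTORY = "models/"              # new: where trained models get saved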

View File

@@ -18,14 +18,17 @@
 FILE * build_file;
 char schemantic[MAX_SHIMS];
-int schim = 0;
-#define STEP_SCHEMANTIC fread(schemantic, MAX_SHIMS, sizeof(char), build_file)
+int schim;
+#define STEP_SCHEMANTIC do { \
+        schim = 0; \
+        int re = fread(schemantic, sizeof(char), MAX_SHIMS, build_file); \
+        if (re != sizeof(char)*MAX_SHIMS) { printf("- %d\n", re); exit(2); } \
+    } while (0)
 #define ECHOS(s) fwrite(s, strlen(s), 1, yyout)
-#define EOL '\n'
 %}
+// != is missing
 comment_marker (\/\*)|(\*\/)
 identifier \$?[A-Za-z0-9_]+
 modify [+-]{2}
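The rewritten STEP_SCHEMANTIC fixes two things at once: it resets the per-line cursor (schim = 0) and it checks the fread result instead of ignoring it; the do { ... } while (0) wrapper makes the macro behave like a single statement, and the re != sizeof(char)*MAX_SHIMS comparison is sound only because fread is called with item size 1 and sizeof(char) is 1 by definition. A rough Python paraphrase of one step (names illustrative, not from the repo):

    def step_schemantic(build_file, max_shims):
        record = build_file.read(max_shims)   # one record: MAX_SHIMS unsigned count bytes
        if len(record) != max_shims:          # short read: malformed or exhausted build_file
            raise SystemExit(2)
        return list(record), 0                # counts, plus the reset cursor (schim = 0)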
@@ -35,8 +38,16 @@ shift (<<)|(>>)
 word {identifier}
 special {comment_marker}|{assignment}|{shift}|{modify}
-%x NORMALIZE ACCUMULATE BUILD
 %x IN_STRING
+// Keep all but the required whitespaces
+%x NORMALIZE
+// Count the non-required whitespaces and write python arrays
+%x ACCUMULATE
+// Reconstruct normalized file based on binary whitespace count arrays
+%x BUILD
+%option yylineno
 %option noyywrap nodefault
 %%
     BEGIN mystate;
@@ -61,10 +72,7 @@ special {comment_marker}|{assignment}|{shift}|{modify}
         ECHO;
         was_word = false;
     }
-    \n {
-        ECHO;
-        return EOL;
-    }
+    \n { ECHO; }
 }

 <ACCUMULATE>{
@@ -99,15 +107,17 @@ special {comment_marker}|{assignment}|{shift}|{modify}
     }
 <BUILD>{
-    [ ]|\t { ; }
-    {word}|. {
-        ECHO;
+    [ ] { ECHO; }
+    {word}|{special}|. {
         for (char i = 0; i < schemantic[schim]; i++) {
             ECHOS(" ");
         }
+        ECHO;
         ++schim;
     }
-    \n {
+    \n { // XXX we find the last newline and still step, resulting in an error
         ECHO;
         STEP_SCHEMANTIC;
     }
 }
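The BUILD rules now insert the predicted spaces before each token rather than after it, give {special} tokens their own schemantic slot, and echo single spaces instead of discarding all blanks. One output line is reconstructed roughly like this (hypothetical Python helper, not from the repo):

    def rebuild_line(tokens, counts):
        # counts[i] = how many extra spaces to put in front of tokens[i]
        return "".join(" " * n + tok for tok, n in zip(tokens, counts))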
@@ -147,6 +157,7 @@ signed main(const int argc, const char * const * const argv) {
     if (!strcmp(argv[1], "build")) {
         mystate = BUILD;
         build_file = fopen("build_file", "rb");
+        if (!build_file) { exit(1); }
         STEP_SCHEMANTIC;
     } else {
         return 1;
@@ -154,7 +165,7 @@ signed main(const int argc, const char * const * const argv) {
     yyin = fopen(argv[2], "r");
-    while(yylex() == EOL) { ; }
+    yylex();
     return 0;
 }
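Taken together, the three start conditions form a round trip: NORMALIZE strips every non-required space, ACCUMULATE records how many were stripped at each slot, and BUILD re-inserts counts (at prediction time, the model's output) into a normalized file. The intended invariant, sketched with hypothetical Python wrappers around converter.out:

    def roundtrip(src):
        norm = normalize(src)     # keep only required whitespace
        acc = accumulate(src)     # per-slot counts of stripped spaces
        return build(norm, acc)   # should reproduce src byte for byte

The XXX comment above documents the known wart: the final newline still triggers STEP_SCHEMANTIC, which then hits end-of-file and exits with status 2.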

data.py (17 changed lines)

@@ -1,11 +1,14 @@
 from glob import glob
 import numpy as np
 import pickle
+import sys
 from sys import argv
 from config import *
 import tard_wrangler

+MAX_DATA_LIMIT = sys.maxsize
+
 def get_source(path : str) -> [str]:
     '''returns source file in $SOURCE_LINE_BATCH_SIZE line batches'''
     r = []
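With MAX_DATA_LIMIT set to sys.maxsize, the "if n > MAX_DATA_LIMIT: break" guard in compile_data() below never fires; the old hard-coded 47-file cap is effectively removed while the knob stays available, e.g.:

    MAX_DATA_LIMIT = 47   # temporarily restore the old cap when debugging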
@@ -54,11 +57,17 @@ def whitespace_to_np_array(spaces : []) -> np.array:
 def compile_data():
     r = {'in': [], 'out': [], 'src': []}
     for n, path in enumerate(glob(COMPILE_INPUT_DIRECTORY + "/*.c")):
-        if n > 47: break # XXX
+        if n > MAX_DATA_LIMIT: break # XXX
         acc_path = path + ".acc"
+        norm_path = path + ".norm"
         r['src'].append(path)
-        r['in'] += get_source(path)
-        r['out'] += read_acc(acc_path)
+        source_batches = get_source(norm_path)
+        accumulation = read_acc(acc_path)
+        assert len(source_batches) == len(accumulation), (
+            f"Some retard fucked up strings in {path}."
+        )
+        r['in'] += source_batches
+        r['out'] += accumulation
     r['in'] = source_to_np_array(r['in'])
     r['out'] = whitespace_to_np_array(r['out'])
     return r
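The new assert makes the training-data contract explicit: every .c file must yield exactly as many normalized source batches as whitespace-count records, because they become paired 'in'/'out' rows. In terms of the shapes assumed elsewhere in this diff:

    batches = get_source(path + ".norm")   # N batches of SOURCE_LINE_BATCH_SIZE lines
    records = read_acc(path + ".acc")      # N vectors of MAX_SHIMS space counts
    assert len(batches) == len(records)    # one count vector per batch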

View File

@@ -1,46 +1,20 @@
+from datetime import datetime
+from sys import argv
 import numpy as np
-import os
-os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
-import tensorflow
-from tensorflow import keras
-from keras import layers
 from config import *
+import model
 import data
 import tard_wrangler

-dataset = data.get_data()
-
-# XXX: add more conv layers
-model = keras.Sequential([
-    keras.Input(shape=(3, LINE_WIDTH, 1)),
-    layers.Conv2D(
-        filters=16,
-        kernel_size=(3,5),
-        strides=(1,1),
-        activation='relu',
-        padding='valid',
-    ),
-    layers.Flatten(),
-    layers.Dense(64, activation='relu'),
-    layers.Dense(64, activation='relu'),
-    layers.Dense(MAX_SHIMS) #activation='softmax'
-])
-model.compile(
-    optimizer='adam',
-    loss='mse',
-    metrics=['mae']
-)
-model.fit(dataset['in'], dataset['out'],
-    verbose=2,
-    batch_size=10,
-    epochs=50,
-    shuffle=True,
-)
-prediction = model.predict(dataset['in'])[0]
-prediction = prediction.astype(np.uint8).tobytes()
-tard_wrangler.build("data/xop.c.norm", prediction)
+if len(argv) > 1:
+    mymodel = model.load_model(argv[1])
+else:
+    dataset = data.get_data()
+    mymodel = model.make_model(dataset)
+    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
+    mymodel.save(MODEL_DIRECTORY + f"model_-_{timestamp}.keras")
+
+predictions = tard_wrangler.full_predict("data/xop.c.norm", mymodel)
+tard_wrangler.build("data/xop.c.norm", predictions)
+tard_wrangler.cat_build()
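The entry point now either loads a saved model (when a path is given on the command line) or trains one from scratch and saves it under MODEL_DIRECTORY with a timestamped name. Assuming the script is invoked as main.py (its filename is not shown in this view), usage looks like:

    # python3 main.py
    #     -> train, save as e.g. models/model_-_20241007-130015.keras, then predict
    # python3 main.py models/model_-_20241007-130015.keras
    #     -> skip training and reuse the saved model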

model.py (new file, 53 lines)

@@ -0,0 +1,53 @@
+import numpy as np
+import pickle
+import os
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
+import tensorflow as tf
+from tensorflow import keras
+from keras import layers
+from config import *
+
+@tf.function
+def custom_weighted_loss(y_true, y_pred):
+    weights = tf.linspace(1.0, 0.1, tf.shape(y_pred)[-1])
+    return tf.reduce_mean(tf.square((y_true - y_pred) * weights))
+
+def make_model(dataset : np.array) -> keras.Model:
+    # XXX: add more conv layers
+    model = keras.Sequential([
+        keras.Input(shape=(3, LINE_WIDTH, 1)),
+        layers.Conv2D(
+            filters=16,
+            kernel_size=(3,5),
+            strides=(1,1),
+            activation='relu',
+            padding='valid',
+        ),
+        layers.Flatten(),
+        layers.Dense(64, activation='relu'),
+        layers.Dense(64, activation='relu'),
+        layers.Dense(MAX_SHIMS) #activation='softmax'
+    ])
+    model.compile(
+        optimizer='adam',
+        #loss='mse',
+        loss=custom_weighted_loss,
+        metrics=['mae']
+    )
+    model.fit(dataset['in'], dataset['out'],
+        verbose=2,
+        batch_size=10,
+        epochs=50,
+        shuffle=True,
+    )
+    return model
+
+def load_model(path : str) -> keras.Model:
+    return keras.models.load_model(path,
+        compile=False
+    )
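A note on load_model: the model is compiled with a custom loss, and keras.models.load_model cannot resolve custom_weighted_loss by name, so compile=False skips restoring the training configuration; the loaded model can still predict. If training were to resume after loading, the usual alternative is registering the function (sketch):

    model = keras.models.load_model(path,
        custom_objects={"custom_weighted_loss": custom_weighted_loss})

The loss itself biases learning toward the leftmost shims: tf.linspace(1.0, 0.1, width) scales the error from full weight at slot 0 down to a tenth at the last slot before the mean square is taken.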

models/.gitkeep (new empty file)

tard_wrangler.py

@@ -2,6 +2,7 @@ import subprocess
 import numpy as np
 from config import *
+import data

 def accumulate(path : str, output : str) -> None:
     process = subprocess.Popen(
@@ -9,10 +10,20 @@ def accumulate(path : str, output : str) -> None:
         shell=True,
     )

-def build(path : str, prediction : np.array):
-    with open("build_file", "wb") as file:
-        file.write(prediction)
+def full_predict(path : str, model) -> []:
+    r = []
+    myinput = data.source_to_np_array(data.get_source(path))
+    for i in myinput:
+        r += model.predict(np.expand_dims(i, axis=0)).astype(np.uint8).tobytes()
+    return r
+
+def build(path : str, predictions : []) -> None:
+    predictions = b''.join([i.to_bytes(1, byteorder='big', signed=False) for i in predictions])
+    with open("build_file", "wb") as f: f.write(predictions)
     process = subprocess.Popen(
         "converter.out build " + path + " > out.c",
         shell=True,
     )
+
+def cat_build():
+    with open("out.c") as f: print(f.read())