commit 5d47089c2b (parent f8c8f7ef0c)
Author: anon
Date: 2024-11-03 13:13:20 +01:00

7 changed files with 26 additions and 11 deletions

compile_data.sh (new executable file)

@@ -0,0 +1,8 @@
+#!/bin/sh
+[ "$#" -lt 1 ] && exit 1
+find "$1" -type f -name "*.c" \
+    -exec vim +"set tabstop=8" +"set expandtab" +"retab" +wq {} \; \
+    -exec sh -c 'converter.out accumulate "$1" > "$1.acc"' _ {} \; \
+    -exec sh -c 'converter.out normalize "$1" > "$1.norm"' _ {} \;
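Note: the vim pass batch-retabs every .c file under the given directory in place (tabs expanded to 8-column stops), then converter.out, assumed to be on PATH, emits the .acc and .norm companions used later for training and prediction. A minimal Python sketch of just the retab step, assuming UTF-8 sources:

import sys

def retab(path: str, width: int = 8) -> None:
    # Same effect as vim's `set tabstop=8 expandtab | retab`:
    # expand every tab to the next 8-column stop and rewrite in place.
    with open(path, encoding="utf-8") as f:
        lines = [line.expandtabs(width) for line in f]
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(lines)

retab(sys.argv[1])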

@@ -4,3 +4,4 @@ SOURCE_LINE_BATCH_SIZE = 3
 COMPILE_INPUT_DIRECTORY = "training_set/linux/"
 MODEL_DIRECTORY = "trained_models/"
+DATASET_FILE = "training_set/dataset-linux.pkl" # cached dataset

@@ -28,11 +28,10 @@
 #define ECHOS(s) fwrite(s, strlen(s), 1, yyout)
 %}
-// != is missing
 comment_marker (\/\*)|(\*\/)
 identifier \$?[A-Za-z0-9_]+
 modify [+-]{2}
-assignment ([+-/*%]|(<<)|(>>))=
+assignment ([+-/*%!]|(<<)|(>>))=
 shift (<<)|(>>)
 word {identifier}
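Note: the dropped "// != is missing" comment is what this hunk resolves: `!` joins the bracket expression, so `!=` now lexes alongside the compound-assignment operators. A quick check of the same pattern under Python's re module, assuming flex and re agree on these character-class semantics (note that `+-/` inside the class is a range that also covers `,` and `.`):

import re

assignment = re.compile(r'([+-/*%!]|(<<)|(>>))=')

# Every operator below now matches; '!=' is the newly covered case.
for tok in ['+=', '-=', '*=', '/=', '%=', '!=', '<<=', '>>=']:
    assert assignment.match(tok), tok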

@@ -10,8 +10,6 @@ import tard_wrangler
 #MAX_DATA_LIMIT = sys.maxsize
 MAX_DATA_LIMIT = 1000
-DATASET_FILE = "training_set/dataset-linux.pkl"
-
 def get_source(path : str, normpath : str) -> [str]:
     '''returns source file in $SOURCE_LINE_BATCH_SIZE line batches'''
     r = []
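Note: this removal pairs with the config hunk above: DATASET_FILE is now defined once in the shared config instead of being redefined here. Its "# cached dataset" comment suggests data.get_data() memoizes the built dataset on disk; a guess at that pattern, with build_dataset as a hypothetical stand-in for the real construction code:

import os
import pickle

def build_dataset():
    # Hypothetical placeholder for the actual dataset construction.
    return []

def get_data(dataset_file: str):
    # Reuse the cached pickle when present; otherwise build once and cache.
    if os.path.exists(dataset_file):
        with open(dataset_file, "rb") as f:
            return pickle.load(f)
    dataset = build_dataset()
    with open(dataset_file, "wb") as f:
        pickle.dump(dataset, f)
    return dataset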

@@ -1,18 +1,24 @@
 from datetime import datetime
 from sys import argv
 import numpy as np
+from argparse import ArgumentParser
 
 from config import *
 import model
 import data
 import tard_wrangler
 
-if len(argv) > 1:
-    mymodel = model.load_model(argv[1])
+parser = ArgumentParser()
+parser.add_argument('--model', type=str, help='Specify the model to use')
+parser.add_argument('file', type=str, help='The file to process')
+args = parser.parse_args()
+
+if args.model:
+    mymodel = model.load_model(args.model)
 else:
-    dataset = data.get_data("dataset-linux.pkl")
+    dataset = data.get_data(DATASET_FILE)
     mymodel = model.make_model(dataset)
     timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
     mymodel.save(MODEL_DIRECTORY + f"model_-_{timestamp}.keras")
 
-print(tard_wrangler.full_predict("training_set/xop.c", "training_set/xop.c.norm", mymodel))
+print(tard_wrangler.full_predict(args.file, args.file + ".norm", mymodel))
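Note: the argparse switch turns the previously hardcoded training_set/xop.c into a required positional argument and makes the saved model an optional flag. A self-contained check of the two invocation shapes this enables (the entry script's own name is not shown in this view):

from argparse import ArgumentParser

parser = ArgumentParser()
parser.add_argument('--model', type=str, help='Specify the model to use')
parser.add_argument('file', type=str, help='The file to process')

# Without --model, args.model is None, so the else branch trains from scratch.
args = parser.parse_args(['training_set/xop.c'])
assert args.model is None and args.file == 'training_set/xop.c'

# With --model, the saved .keras model is loaded instead of retrained.
args = parser.parse_args(['--model', 'trained_models/m.keras', 'training_set/xop.c'])
assert args.model == 'trained_models/m.keras'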

get_linux_source.sh (new executable file)

@@ -0,0 +1,4 @@
+#!/bin/sh
+mkdir -p "training_set/linux"
+find /usr/src/linux/ -type f -name "*.c" -exec cp --verbose {} "training_set/linux/" \;
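Note: a rough Python equivalent of this gather step, assuming /usr/src/linux is an unpacked kernel tree; like the cp invocation above, the flat copy lets later duplicates (the kernel has many identically named files such as setup.c) overwrite earlier ones:

import shutil
from pathlib import Path

dest = Path("training_set/linux")
dest.mkdir(parents=True, exist_ok=True)
for src in Path("/usr/src/linux").rglob("*.c"):
    shutil.copy(src, dest / src.name)  # flat copy: name collisions overwrite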

@@ -11,7 +11,7 @@ from config import *
 @tf.function
 def custom_weighted_loss(y_true, y_pred):
-    weights = tf.linspace(1.0, 0.1, tf.shape(y_pred)[-1])
+    weights = tf.linspace(2.0, 0.1, tf.shape(y_pred)[-1])
     return tf.reduce_mean(tf.square((y_true - y_pred) * weights))
 
 def make_model(dataset : np.array) -> keras.Model:

@@ -28,12 +28,11 @@ def make_model(dataset : np.array) -> keras.Model:
         layers.Flatten(),
         layers.Dense(64, activation='relu'),
         layers.Dense(64, activation='relu'),
-        layers.Dense(MAX_SHIMS) #activation='softmax'
+        layers.Dense(MAX_SHIMS)
     ])
     model.compile(
         optimizer='adam',
-        #loss='mse',
         loss=custom_weighted_loss,
         metrics=['mae']
     )
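Note: the loss tweak doubles the penalty weight on the first output position: tf.linspace(2.0, 0.1, n) decays linearly across the MAX_SHIMS outputs, so errors on early shim predictions dominate the mean squared term. Illustration with five outputs:

import tensorflow as tf

weights = tf.linspace(2.0, 0.1, 5)
print(weights.numpy())  # [2.0, 1.525, 1.05, 0.575, 0.1]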