anon 2024-11-03 13:13:20 +01:00
parent f8c8f7ef0c
commit 5d47089c2b
7 changed files with 26 additions and 11 deletions

compile_data.sh Executable file (8 additions)

@@ -0,0 +1,8 @@
#!/bin/sh
[ "$#" -lt 1 ] && exit 1  # a target directory argument is required
find "$1" -type f -name "*.c" \
-exec vim +"set tabstop=8" +"set expandtab" +"retab" +wq {} \; \
-exec sh -c 'converter.out accumulate "$1" > "$1.acc"' _ {} \; \
-exec sh -c 'converter.out normalize "$1" > "$1.norm"' _ {} \;
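Usage sketch (not part of the commit): assuming converter.out is on the PATH, the script is pointed at the training directory, e.g. ./compile_data.sh training_set/linux, and leaves an .acc and a .norm file next to every .c file it finds.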


@@ -4,3 +4,4 @@ SOURCE_LINE_BATCH_SIZE = 3
COMPILE_INPUT_DIRECTORY = "training_set/linux/"
MODEL_DIRECTORY = "trained_models/"
DATASET_FILE = "training_set/dataset-linux.pkl" # cached dataset


@@ -28,11 +28,10 @@
#define ECHOS(s) fwrite(s, strlen(s), 1, yyout)
%}
// != is missing
comment_marker (\/\*)|(\*\/)
identifier \$?[A-Za-z0-9_]+
modify [+-]{2}
assignment ([+-/*%]|(<<)|(>>))=
assignment ([+-/*%!]|(<<)|(>>))=
shift (<<)|(>>)
word {identifier}


@@ -10,8 +10,6 @@ import tard_wrangler
#MAX_DATA_LIMIT = sys.maxsize
MAX_DATA_LIMIT = 1000
DATASET_FILE = "training_set/dataset-linux.pkl"
def get_source(path : str, normpath : str) -> [str]:
'''returns source file in $SOURCE_LINE_BATCH_SIZE line batches'''
r = []


@@ -1,18 +1,24 @@
from datetime import datetime
from sys import argv
import numpy as np
from argparse import ArgumentParser
from config import *
import model
import data
import tard_wrangler
if len(argv) > 1:
    mymodel = model.load_model(argv[1])
parser = ArgumentParser()
parser.add_argument('--model', type=str, help='Specify the model to use')
parser.add_argument('file', type=str, help='The file to process')
args = parser.parse_args()
if args.model:
    mymodel = model.load_model(args.model)
else:
    dataset = data.get_data("dataset-linux.pkl")
    dataset = data.get_data(DATASET_FILE)
    mymodel = model.make_model(dataset)
    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    mymodel.save(MODEL_DIRECTORY + f"model_-_{timestamp}.keras")
print(tard_wrangler.full_predict("training_set/xop.c", "training_set/xop.c.norm", mymodel))
print(tard_wrangler.full_predict(args.file, args.file + ".norm", mymodel))
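Usage sketch (the entry-point file name is not shown on this page and is assumed here): python main.py training_set/xop.c would train a model from DATASET_FILE and save it under MODEL_DIRECTORY with a timestamped name, while python main.py --model trained_models/some_model.keras training_set/xop.c would load an existing one; in both cases the prediction for the given file and its .norm counterpart is printed.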

get_linux_source.sh Executable file (4 additions)

@@ -0,0 +1,4 @@
#!/bin/sh
mkdir "training_set/linux"
find /usr/src/linux/ -type f -name "*.c" -exec cp --verbose {} "training_set/linux/" \;


@@ -11,7 +11,7 @@ from config import *
@tf.function
def custom_weighted_loss(y_true, y_pred):
    weights = tf.linspace(1.0, 0.1, tf.shape(y_pred)[-1])
    weights = tf.linspace(2.0, 0.1, tf.shape(y_pred)[-1])
    return tf.reduce_mean(tf.square((y_true - y_pred) * weights))
def make_model(dataset : np.array) -> keras.Model:
@@ -28,12 +28,11 @@ def make_model(dataset : np.array) -> keras.Model:
        layers.Flatten(),
        layers.Dense(64, activation='relu'),
        layers.Dense(64, activation='relu'),
        layers.Dense(MAX_SHIMS) #activation='softmax'
        layers.Dense(MAX_SHIMS)
    ])
    model.compile(
        optimizer='adam',
        #loss='mse',
        loss=custom_weighted_loss,
        metrics=['mae']
    )
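Illustrative sketch (not part of the commit) of the reweighted loss: the linspace weights now fall linearly from 2.0 down to 0.1 across the output positions, so errors on the earliest predicted shims dominate the loss (a 20x weight ratio, i.e. 400x after squaring).

import tensorflow as tf

y_true = tf.constant([[1.0, 0.0, 0.0, 0.0]])
y_pred = tf.constant([[0.0, 0.0, 0.0, 1.0]])
weights = tf.linspace(2.0, 0.1, tf.shape(y_pred)[-1])          # ~[2.0, 1.37, 0.73, 0.1]
loss = tf.reduce_mean(tf.square((y_true - y_pred) * weights))  # an error at position 0 contributes 2.0^2, the same error at the last position only 0.1^2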